Coverage for src/artemis_sg/scraper.py: 78%

677 statements  

coverage.py v7.3.1, created at 2024-03-06 08:01 -0800

import logging
import os.path
import re
import time  # for additional sleeps in page load. This is a smell.
import urllib.parse

from rich.console import Console
from rich.text import Text

# Selenium
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)

# Chrome
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys as SeleniumKeys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from artemis_sg import spreadsheet, vendor
from artemis_sg.config import CFG
from artemis_sg.items import Items

# Firefox
# from selenium.webdriver.firefox.service import Service as FirefoxService

MODULE = os.path.splitext(os.path.basename(__file__))[0]
console = Console()

IMG_FAILOVER_THRESHHOLD = 2


class BaseScraper:
    """
    Scraper objects know how to scrape a base URL.
    """

    def __init__(self, selenium_driver, base_url=None):
        self.selenium_driver = selenium_driver
        if not base_url:
            self.base_url = ""
        else:
            self.base_url = base_url
        self.login_xpath_query = ""

    def load_item_page(self, item_number):
        return False

    def scrape_description(self):
        description = ""
        return description

    def scrape_dimension(self):
        dimension = ""
        return dimension

    def scrape_item_image_urls(self):
        urls = []
        return urls

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"

        self.delay(2)
        input_text = Text(
            """
            ******** USER INPUT REQUIRED ********
            Locate the selenium controlled browser
            and manually enter your login credentials.
            ******** WAITING FOR USER INPUT ********
            """
        )
        input_text.stylize("bold cyan")
        console.print(input_text)
        try:
            WebDriverWait(
                self.selenium_driver, CFG["asg"]["scraper"]["login_timeout"]
            ).until(
                ec.presence_of_element_located((By.XPATH, self.login_xpath_query))
            )
            success_text = Text(
                """
                ******** LOGIN SUCCESSFUL ********
                ******** CONTINUING EXECUTION ********
                """
            )
            success_text.stylize("green")
            console.print(success_text)
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def delay(self, secs):
        time.sleep(secs)

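# Subclasses override the scrape_* hooks above; login() only needs a
# login_xpath_query that matches an element visible once a user is logged in.
# A minimal sketch of a new vendor scraper (illustrative only: "example.com"
# and the "desc" class are hypothetical, not part of this module):
#
#     class ExampleScraper(BaseScraper):
#         def __init__(self, selenium_driver, base_url="https://example.com"):
#             super().__init__(selenium_driver, base_url)
#             self.login_xpath_query = "//a[@href='/account']"
#
#         def scrape_description(self):
#             try:
#                 elem = self.selenium_driver.find_element(By.CLASS_NAME, "desc")
#                 return elem.text
#             except NoSuchElementException:
#                 return ""
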

class GJScraper(BaseScraper):
    """
    GJScraper objects know how to scrape GJ item pages.
    """

    def __init__(self, selenium_driver, base_url="https://greatjonesbooks.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3
        self.login_xpath_query = "//a[@href='/account']"

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"

        # GJ does not maintain the session if the links on the page are not
        # used: if not logged in, build the url directly; else use the search
        # facility.
        try:
            self.delay(1)
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//a[@href='/account' and text()='Account Summary']")
                )
            )
        except (NoSuchElementException, TimeoutException):
            start = "/product/"
            url = self.base_url + start + item_number
            self.selenium_driver.get(url)
            return True
        try:
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//a[@href='/search']"))
            )
            search.click()
            self.delay(2)

            # wait until the Publisher list is populated
            # by finding a sentinel publisher
            sentinel = CFG["asg"]["scraper"]["gjscraper"]["sentinel_publisher"]
            timeout_bak = self.timeout
            self.timeout = 60
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, f"//option[@value='{sentinel}']")
                )
            )
            self.timeout = timeout_bak
            # then get the itemCode field for the search
            item_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//input[@name='itemCode']"))
            )
            search_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(1)"
            )
            clear_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(2)"
            )
            clear_button.click()
            item_field.send_keys(item_number)
            self.delay(2)
            search_button.click()
            self.delay(2)
            # check for No Results
            e = self.selenium_driver.find_element(
                By.XPATH, "//div[@class='formBox']/div"
            )
            if "No Results" in e.text:
                # Do not continue to try
                logging.info(f"{namespace}: No Results found for {item_number}")
                return False
            items = self.selenium_driver.find_elements(By.ID, "product.item_id")
            items[0].click()
            return True
        except (NoSuchElementException, TimeoutException, IndexError):
            tries += 1
            if tries < self.timeout:  # coverage: condition never true
                # propagate the retry's result (previously dropped)
                return self.load_item_page(item_number, tries)
            else:
                logging.info(f"{namespace}: failed item search for {item_number}")
                return False

    def scrape_description(self):
        try:
            self.delay(1)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "desc"))
            )
            span = elem.find_element(By.CLASS_NAME, "short-comments")
            description = span.text
        except (NoSuchElementException, TimeoutException):
            description = ""

        return description

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"

        urls = []
        try:
            self.delay(1)
            # GJ appears to only have single cover images
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "cover"))
            )
            img = elem.find_element(By.TAG_NAME, "img")
            src = img.get_attribute("src")
            if src:  # coverage: condition never false
                urls.append(src)
        except (NoSuchElementException, TimeoutException) as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def load_login_page(self):
        # Load the search page while logged out in an attempt to get the
        # Publishers list to populate when the page is loaded after login.
        self.selenium_driver.get(self.base_url + "/search")
        self.delay(self.timeout)
        login = "/login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def add_to_cart(self, qty):
        # TODO: Can we DRY this up? Some duplication between scrapers.
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "on-hand")
        m = re.search(r"([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):  # coverage: condition never false
                qty = stock
        self.delay(1)
        try:
            # gather the html elements needed
            add_div = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "add"))
            )
            qty_field = add_div.find_element(By.XPATH, "//input[@name='qty']")

            qty_field.clear()
            qty_field.send_keys(qty + SeleniumKeys.ENTER)
        except (NoSuchElementException, TimeoutException) as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)

    def load_cart_page(self):
        # TODO: Can we DRY this up? Some duplication between scrapers.
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = self.selenium_driver.find_element(By.CLASS_NAME, "cart")
            cart.click()
            self.delay(1)
            cart.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False
        return True

    def scrape_error_msg(self):
        try:
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "errorMsg")
            msg = elem.text
        except NoSuchElementException:
            msg = ""
        return msg

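# The CFG lookups in this module imply a configuration shaped roughly like the
# following. This is a hedged sketch reconstructed only from the keys read
# above (login_timeout, gjscraper.sentinel_publisher, headless); the actual
# file layout is defined by artemis_sg.config, and the values shown are
# placeholders:
#
#     [asg.scraper]
#     headless = false
#     login_timeout = 90
#
#     [asg.scraper.gjscraper]
#     sentinel_publisher = "SomePublisher"
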

class SDScraper(BaseScraper):
    """
    SDScraper objects know how to scrape SD item pages.
    """

    def __init__(self, selenium_driver, base_url="https://strathearndistribution.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3
        self.login_xpath_query = "//span[text()='My lists']"

    def load_login_page(self):
        namespace = f"{type(self).__name__}.{self.load_login_page.__name__}"
        try:
            self.selenium_driver.get(self.base_url)
            self.delay(2)
            button = self.selenium_driver.find_element(By.ID, "styled_btn")
            button.click()
        except (
            StaleElementReferenceException,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            logging.error(f"{namespace}: failed to load login page")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"
        try:
            self.selenium_driver.get(self.base_url)
            self.delay(2)
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "search"))
            )
            search.send_keys(item_number + SeleniumKeys.ENTER)
            self.delay(2)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "listItem"))
            )
            self.delay(2)
            elem.click()
            return True
        except (
            StaleElementReferenceException,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            tries += 1
            if tries < self.timeout:
                # propagate the retry's result (previously dropped)
                return self.load_item_page(item_number, tries)
            else:
                logging.warning(
                    f"{namespace}: Failed to load item page '{item_number}': {e}"
                )
                return False

    def scrape_description(self):
        try:
            # rc-* IDs are dynamic, must use classes
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-nav-list")
            tab_btn = elem.find_element(By.CLASS_NAME, "ant-tabs-tab-btn")
            tab_btn.click()
            pane = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-tabpane")
            description = pane.text
        except NoSuchElementException:
            description = ""

        return description

    def scrape_dimension(self):
        try:
            dets_xpath = "//div[@class='ant-tabs-tab-btn'][text()='Details']"
            btn = self.selenium_driver.find_element(By.XPATH, dets_xpath)
            btn.click()
            elem = self.selenium_driver.find_element(
                By.XPATH, "//div[strong[contains(text(), 'Physical Dimensions:')]]"
            )
            t = elem.text
            dimension = t.replace("Physical Dimensions:\n", "")
        except NoSuchElementException:
            dimension = ""

        return dimension

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        urls = []
        try:
            # main image only
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "full-image"))
            )
            src = elem.get_attribute("src")
            if src:  # coverage: condition never false
                urls.append(src)
            # ensure we are seeing the top of the page
            html = self.selenium_driver.find_element(By.TAG_NAME, "html")
            html.send_keys(SeleniumKeys.PAGE_UP)
            # image gallery for additional images
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "gallery-vert")
            for elem in elems:
                src = elem.get_attribute("src")
                if src:  # coverage: condition never false
                    urls.append(src)
        except NoSuchElementException as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        # TODO: should this be wrapped in a try block?
        stock_elem = self.selenium_driver.find_element(
            By.XPATH, "//span[contains(text(), 'in stock')]"
        )
        m = re.search(r"([0-9]+) in stock", stock_elem.get_attribute("innerHTML"))
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):  # coverage: condition never false
                qty = stock
        self.delay(1)
        try:
            # gather the html elements needed
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "ant-btn-primary")
            button = None
            for e in elems:  # coverage: loop always exits via break
                if "Add to cart" in e.text:  # coverage: condition never false
                    button = e
                    break
            qty_field = self.selenium_driver.find_element(
                By.XPATH,
                (
                    "//input[@class='ant-input' and @type='text' "
                    "and not(ancestor::div[contains(@class, '-block')])]"
                ),
            )
            # the qty field must be clicked to highlight the amount;
            # clearing doesn't work
            qty_field.click()
            qty_field.send_keys(qty)
            button.click()
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)

    def load_cart_page(self):
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = "/checkout/cart"
            url = self.base_url + cart
            self.selenium_driver.get(url)
            self.delay(1)
            return True
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False

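# The cart-capable scrapers (GJScraper, SDScraper, TBScraper) all clamp the
# requested quantity to the stock parsed out of the page with the same regex
# idea. A worked example with an illustrative page string:
#
#     >>> m = re.search(r"([0-9]+) in stock", "Availability: 12 in stock")
#     >>> m.group(1)
#     '12'
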

class TBScraper(BaseScraper):
    """
    TBScraper objects know how to scrape TB item pages.
    """

    def __init__(self, selenium_driver, base_url="https://texasbookman.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3
        self.login_xpath_query = "//a[@href='/admin']"

    def load_item_page(self, item_number):
        start = "p/"
        url = self.base_url + start + item_number
        self.selenium_driver.get(url)
        return True

    def scrape_description(self):
        try:
            elem = self.selenium_driver.find_element(
                By.CLASS_NAME, "variant-description"
            )
            text = elem.text
            description = text.replace("NO AMAZON SALES\n\n", "")
        except NoSuchElementException:
            description = ""

        return description

    def scrape_dimension(self):
        try:
            elem = self.selenium_driver.find_element(
                By.CLASS_NAME, "full-description"
            )
            m = re.search(r"(Size:.+)\n", elem.text)
            dimension = m.group(1).replace("Size:", "").strip()
        except (NoSuchElementException, AttributeError):
            dimension = ""

        return dimension

    def scrape_item_image_urls(self):
        urls = []
        try:
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "a-left"))
            )
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture-thumbs")
            left = elem.find_element(By.CLASS_NAME, "a-left")
            left.click()
            while True:
                self.delay(2)
                thumb = self._get_thumb_from_slimbox()
                if thumb:
                    urls.append(thumb)
                next_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                    ec.presence_of_element_located((By.ID, "lbNextLink"))
                )
                self.delay(2)
                next_link.click()
        except (
            NoSuchElementException,
            ElementNotInteractableException,
            TimeoutException,
        ):
            try:
                elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture")
                img = elem.find_element(By.TAG_NAME, "img")
                thumb = img.get_attribute("src")
                urls.append(thumb)
            except NoSuchElementException:
                pass

        return urls

    def _get_thumb_from_slimbox(self):
        timeout = 3
        thumb = None
        try:
            img_div = WebDriverWait(self.selenium_driver, timeout).until(
                ec.presence_of_element_located((By.ID, "lbImage"))
            )
            style = img_div.get_attribute("style")
            m = re.search('"(.*)"', style)
            if m:  # coverage: condition never false
                thumb = m.group(1)
        except (NoSuchElementException, TimeoutException):
            pass

        return thumb

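    # _get_thumb_from_slimbox pulls the image URL out of the slimbox div's
    # inline style attribute. A worked example with an illustrative style
    # string (the path is hypothetical):
    #
    #     >>> style = 'background-image: url("https://texasbookman.com/images/thumbs/0001.jpg");'
    #     >>> re.search('"(.*)"', style).group(1)
    #     'https://texasbookman.com/images/thumbs/0001.jpg'
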

    def load_login_page(self):
        login = "login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def impersonate(self, email):
        namespace = f"{type(self).__name__}.{self.impersonate.__name__}"

        # Go to /Admin/Customer/List
        customers = "/Admin/Customer/List"
        url = self.base_url + customers
        self.selenium_driver.get(url)
        self.delay(1)
        try:
            # search for the email
            search_email = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "SearchEmail"))
            )
            search_email.clear()
            search_email.send_keys(email + SeleniumKeys.ENTER)
            # Get the customer link associated with the email
            email_xpath = (
                f"//div[@id='customers-grid']/table/tbody/tr/td/a[text()='{email}']"
            )
            customer_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, email_xpath))
            )
            links = self.selenium_driver.find_elements(By.XPATH, email_xpath)
            # Bail if there are multiple customer records for the given email.
            if len(links) > 1:
                logging.error(
                    f"{namespace}: Found multiple customer records for email "
                    f"'{email}' to impersonate"
                )
                logging.error(f"{namespace}: Cannot proceed. Exiting.")
                raise Exception
            customer_link.click()
            # click "Place order (Impersonate)"
            impersonate = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//a[text()='Place order (Impersonate)']")
                )
            )
            impersonate.click()
            # click the "Place Order" button
            button = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//input[@name='impersonate']")
                )
            )
            button.click()
            self.delay(1)
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "finish-impersonation"))
            )
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to impersonate")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e
        return True

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        qty = int(qty)
        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "stock")
        m = re.search(r"Availability: ([0-9]+) in stock", stock_elem.text)
        if m:  # coverage: condition never false
            stock = m.group(1)
            stock = int(stock)
            if stock < qty:
                qty = stock
        try:
            # gather the html elements needed
            qty_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "qty-input"))
            )
            button = self.selenium_driver.find_element(
                By.CLASS_NAME, "add-to-cart-button"
            )
            qty_field.clear()
            # ENTERing out of the qty_field DOES NOT add to cart.
            # The button must be clicked instead.
            qty_field.send_keys(qty)
            button.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return qty

    def load_cart_page(self):
        cart = "cart"
        url = self.base_url + cart
        self.selenium_driver.get(url)
        return True

    def search_item_num(self, search):
        namespace = f"{type(self).__name__}.{self.search_item_num.__name__}"

        item_num = ""
        search = urllib.parse.quote_plus(search)
        url = self.base_url + "search?q=" + search
        self.selenium_driver.get(url)
        self.delay(2)
        timeout_bak = self.timeout
        self.timeout = 120
        WebDriverWait(self.selenium_driver, self.timeout).until(
            ec.presence_of_element_located((By.CLASS_NAME, "search-results"))
        )
        self.timeout = timeout_bak
        links = self.selenium_driver.find_elements(
            By.XPATH, "//div[@class='search-results']//a[contains(@href, '/p/')]"
        )
        if links:  # coverage: condition never false
            item_urls = [x.get_attribute("href") for x in links]
            for item_url in item_urls:
                m = re.search(r"\/p\/([0-9]+)\/(?!uk-)", item_url)
                if m:
                    item_num = m.group(1)
                    break
        else:
            logging.warning(f"{namespace}: Failed to find item using q='{search}'")
        return item_num

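# search_item_num keeps the first product link whose path segment after the
# item number is not the "uk-" variant, via a negative lookahead. Illustrative
# matches (the paths are hypothetical):
#
#     >>> bool(re.search(r"\/p\/([0-9]+)\/(?!uk-)", "/p/12345/some-title"))
#     True
#     >>> bool(re.search(r"\/p\/([0-9]+)\/(?!uk-)", "/p/12345/uk-some-title"))
#     False
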

class AmznScraper(BaseScraper):
    """
    AmznScraper objects know how to scrape amazon item pages.
    """

    def __init__(self, selenium_driver, base_url="https://www.amazon.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 1
        self.captcha_link = self.base_url + "/errors/validateCaptcha"

    def solve_captcha(self, link=None):
        from amazoncaptcha import AmazonCaptcha

        if not link:  # coverage: condition never true
            link = self.captcha_link
        try:
            self.selenium_driver.get(link)
            captcha = AmazonCaptcha.fromdriver(self.selenium_driver)
            solution = captcha.solve()
            if solution.lower() == "not solved":
                raise NoSuchElementException
            return solution
        except (NoSuchElementException, TimeoutException):
            return ""

    def enter_captcha(self, solution):
        namespace = f"{type(self).__name__}.{self.enter_captcha.__name__}"
        if solution:
            elem = self.selenium_driver.find_element(By.ID, "captchacharacters")
            elem.send_keys(solution + SeleniumKeys.ENTER)
        else:
            input_text = Text(
                """
                ******** USER INPUT REQUIRED ********
                Locate the selenium controlled browser
                and manually enter the requested CAPTCHA characters.
                ******** WAITING FOR USER INPUT ********
                """
            )
            input_text.stylize("bold cyan")
            console.print(input_text)
            try:
                timeout_bak = self.timeout
                self.timeout = self.timeout * 100
                WebDriverWait(self.selenium_driver, self.timeout).until(
                    ec.presence_of_element_located(
                        (By.XPATH, "//a[@href='/ref=nav_logo']")
                    )
                )
                self.timeout = timeout_bak
                success_text = Text(
                    """
                    ******** CAPTCHA SUCCESSFUL ********
                    ******** CONTINUING EXECUTION ********
                    """
                )
                success_text.stylize("green")
                console.print(success_text)
            except (NoSuchElementException, TimeoutException):
                logging.error(f"{namespace}: failed CAPTCHA")

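    # Typical captcha flow, mirroring how main() below drives it: try the
    # automated solver first and fall back to waiting for manual entry when it
    # returns "" (sketch):
    #
    #     scraper = AmznScraper(driver)
    #     solution = scraper.solve_captcha()
    #     scraper.enter_captcha(solution)
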

    def load_item_page(self, item_number):
        start = "dp/"
        url = self.base_url + start + item_number
        self.selenium_driver.get(url)
        return True

    def scrape_description(self):
        description = self._scrape_amazon_editorial_review()
        if not description:
            description = self._scrape_amazon_description()

        return description

    def scrape_dimension(self):
        dimension = ""
        try:
            xpath = "//span/span[contains(text(), 'Dimensions')]//following::span"
            elem = self.selenium_driver.find_element(By.XPATH, xpath)
            dimension = elem.get_attribute("innerHTML")
        except NoSuchElementException:
            dimension = ""
        return dimension

    def _scrape_amazon_editorial_review(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "editorialReviews_feature_div"
            )
            text = elem.text
            descr_lines = re.split("^.*\\n.*\\n", text)  # trim off the first two lines
            descr = descr_lines[-1]
        except NoSuchElementException:
            descr = ""

        return descr

    def _scrape_amazon_description(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "bookDescription_feature_div"
            )
            # read_more = elem.find_element(By.CLASS_NAME, 'a-expander-prompt')
            # read_more.click()
            descr = elem.text
        except NoSuchElementException:
            descr = ""

        return descr

    def get_span_type_thumb_id_prefix(self):
        """Get span_type and thumb_id_prefix from the amazon images widget."""
        namespace = (
            f"{type(self).__name__}.{self.get_span_type_thumb_id_prefix.__name__}"
        )
        span_type = None
        thumb_id_prefix = None
        try:
            span = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "imgThumbs"))
            )
            span_type = "imgThumbs"
        except (NoSuchElementException, TimeoutException):
            logging.info(f"{namespace}: No imgThumbs id, trying imgTagWrapperId")
            try:
                span = WebDriverWait(self.selenium_driver, self.timeout).until(
                    ec.presence_of_element_located((By.ID, "imgTagWrapperId"))
                )
                span_type = "imgTagWrapperId"
            except (NoSuchElementException, TimeoutException):
                logging.info(f"{namespace}: No imgTagWrapperId id")
                logging.info(f"{namespace}: Returning empty urls list")
                return (span_type, thumb_id_prefix)

        if span_type == "imgThumbs":  # coverage: condition never false
            link = span.find_element(By.CLASS_NAME, "a-link-normal")
            thumb_id_prefix = "ig-thumb-"
        else:
            link = span
            thumb_id_prefix = "ivImage_"
        try:
            link.click()
        except ElementClickInterceptedException:
            logging.info(f"{namespace}: Failed to click images widget")
            logging.info(f"{namespace}: Returning empty urls list")
            return (span_type, thumb_id_prefix)
        return (span_type, thumb_id_prefix)

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        counter = 0
        urls = []

        span_type, thumb_id_prefix = self.get_span_type_thumb_id_prefix()
        if thumb_id_prefix:  # coverage: condition never false
            logging.debug(f"{namespace}: Clicked images widget")
            # get image urls
            while True:
                try:
                    thumb = ""
                    xpath = f"//*[@id='{thumb_id_prefix}{counter}']"
                    elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                        ec.presence_of_element_located((By.XPATH, xpath))
                    )
                    if span_type == "imgThumbs":  # coverage: condition never false
                        thumb = elem.get_attribute("src")
                    if span_type == "imgTagWrapperId":  # coverage: condition never true
                        inner_elem = elem.find_element(By.CLASS_NAME, "ivThumbImage")
                        style = inner_elem.get_attribute("style")
                        m = re.search('"(.*)"', style)
                        if m:
                            thumb = m.group(1)
                    sub, suff = os.path.splitext(thumb)
                    indx = sub.find("._")
                    url = sub[:indx] + suff
                    if url:  # coverage: condition never false
                        urls.append(url)
                    logging.debug(f"{namespace}: Thumbnail src is {thumb}")
                    logging.debug(f"{namespace}: Full size URL is %r" % url)
                    counter += 1
                except (NoSuchElementException, TimeoutException):
                    break
        # amazon appends images of a human holding the book;
        # drop the last url to remove these
        if len(urls) > 1:  # coverage: condition never false
            urls.pop()

        return urls

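# scrape_item_image_urls derives a full-size image URL from a thumbnail URL by
# dropping the "._<size-modifiers>" segment before the extension. A worked
# example with an illustrative thumbnail URL:
#
#     >>> thumb = "https://m.media-amazon.com/images/I/41abcDEFgh._SX38_SY50_CR,0,0,38,50_.jpg"
#     >>> sub, suff = os.path.splitext(thumb)
#     >>> sub[: sub.find("._")] + suff
#     'https://m.media-amazon.com/images/I/41abcDEFgh.jpg'
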

class AmznUkScraper(AmznScraper):
    """
    AmznUkScraper objects know how to scrape amazon.co.uk item pages.
    """

    def __init__(self, selenium_driver, base_url="https://www.amazon.co.uk/"):
        super().__init__(selenium_driver, base_url)

    def decline_cookies(self):
        try:
            decline_button = self.selenium_driver.find_element(
                By.ID, "sp-cc-rejectall-link"
            )
            decline_button.click()
            self.delay(2)
            return True
        except (NoSuchElementException, TimeoutException):
            return False

    def load_item_page(self, isbn):
        # Search by ISBN
        start = "s?isbn="
        url = self.base_url + start + isbn
        self.selenium_driver.get(url)
        self.decline_cookies()
        # Look for results
        elem = WebDriverWait(self.selenium_driver, self.timeout).until(
            ec.presence_of_element_located((By.CLASS_NAME, "s-result-list"))
        )
        # Get the ASIN from the first result
        inner_e = elem.find_element(By.CLASS_NAME, "a-link-normal")
        link = inner_e.get_attribute("href")
        m = re.search(r"\/dp\/([0-9A-Z]+)/", link)
        asin = m.group(1) if m else ""
        # Load the ASIN page
        if asin:
            start = "dp/"
            url = self.base_url + start + asin
            self.selenium_driver.get(url)
            return True
        return False


##############################################################################
# utility functions
##############################################################################
def get_headless_driver():
    return get_driver("--headless=new")


def get_driver(option_args: str = ""):
    """Creates a new instance of the chrome driver.

    :param option_args:
        Option arguments to pass to the driver
    :returns: selenium.webdriver object
    """
    namespace = f"{MODULE}.{get_driver.__name__}"
    service = ChromeService()
    options = webdriver.ChromeOptions()
    if option_args:  # coverage: condition never true
        options.add_argument(option_args)
        logging.info(f"{namespace}: Setting webdriver option to '{option_args}'.")
    driver = webdriver.Chrome(service=service, options=options)
    return driver

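# A minimal usage sketch of the driver helpers with one of the scrapers above
# (the item number is illustrative, not a real id):
#
#     driver = get_headless_driver()  # or get_driver() for a visible browser
#     scraper = TBScraper(driver)
#     scraper.load_item_page("12345")
#     urls = scraper.scrape_item_image_urls()
#     driver.quit()
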

def scrape_item(scrapr, item_id, description="", dimension="", image_urls=None):
    if image_urls is None:
        image_urls = []
    namespace = f"{MODULE}.{scrape_item.__name__}"
    scrapr.load_item_page(item_id)
    logging.info(
        f"{namespace}: Getting item image urls via {scrapr.__class__.__name__}"
    )
    l_image_urls = scrapr.scrape_item_image_urls()
    if image_urls and len(l_image_urls) > 1:  # coverage: condition never true
        l_image_urls.pop(0)
    image_urls = image_urls + l_image_urls
    logging.info(" URLs: %r" % image_urls)
    if image_urls and not description:  # coverage: condition never false
        logging.info(
            f"{namespace}: Getting description via {scrapr.__class__.__name__}"
        )
        description = scrapr.scrape_description()
        logging.info(" Description: %r" % description[:140])
    if image_urls and not dimension:  # coverage: condition never false
        logging.info(
            f"{namespace}: Getting dimension via {scrapr.__class__.__name__}"
        )
        dimension = scrapr.scrape_dimension()
        logging.info(" Dimension: %r" % dimension[:140])
    return description, dimension, image_urls

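# scrape_item chains a primary and a failover scraper by threading its own
# return values back in as arguments, as main() below does. Sketch (the
# scraper names are illustrative):
#
#     description, dimension, image_urls = scrape_item(prime_scraper, isbn)
#     if len(image_urls) < IMG_FAILOVER_THRESHHOLD:
#         description, dimension, image_urls = scrape_item(
#             failover_scraper, isbn, description, dimension, image_urls
#         )
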

def get_failover_scraper_item_id(driver, vendr, item):
    namespace = f"{MODULE}.{get_failover_scraper_item_id.__name__}"
    failover_scrapr = None
    item_id = item.isbn
    if vendr.vendor_code == "tb":
        try:
            url = item.data["LINK"]
            m = re.search(r"\/([0-9]+)\/", url)
            if m:  # coverage: condition never false
                item_id = m.group(1)
        except KeyError:
            logging.error(f"{namespace}: No link found in item")
    if vendr.failover_scraper in globals():
        failover_scrapr = globals()[vendr.failover_scraper](driver)
    return failover_scrapr, item_id

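# The failover scraper class is resolved by name from this module's globals().
# For example, if vendr.failover_scraper were the string "AmznUkScraper" (an
# illustrative value), the lookup amounts to:
#
#     scraper_cls = globals()["AmznUkScraper"]  # -> the AmznUkScraper class
#     failover_scrapr = scraper_cls(driver)
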

def main(vendor_code, sheet_id, worksheet, scraped_items_db):  # noqa: C901
    namespace = f"{MODULE}.{main.__name__}"
    # get vendor info from the database
    logging.debug(f"{namespace}: Instantiate vendor.")
    vendr = vendor.Vendor(vendor_code)
    vendr.set_vendor_data()

    sheet_data = spreadsheet.get_sheet_data(sheet_id, worksheet)

    sheet_keys = [x for x in sheet_data.pop(0) if x]  # filter out None
    items_obj = Items(sheet_keys, sheet_data, vendr.isbn_key)
    items_obj.load_scraped_data(scraped_items_db)
    driver = None
    prime_scrapr = None
    failover_scrapr = None
    for item in items_obj:
        if not item.isbn:  # coverage: condition never true
            if "TBCODE" in item.data:
                item.isbn = item.data["TBCODE"]
            if not item.isbn:
                logging.info(f"{namespace}: No isbn for item, skipping lookup")
                continue
        description = ""
        dimension = ""
        image_urls = []
        # if the scraped item's image_urls are not empty, skip it
        logging.info(f"{namespace}: Searching for {item.isbn} ...")
        if item.image_urls != []:  # coverage: condition never true
            logging.info(f"{namespace}: {item.isbn} found in database, skipping")
            continue

        if not driver and not prime_scrapr:  # coverage: condition never false
            logging.info(f"{namespace}: Opening browser...")
            if CFG["asg"]["scraper"]["headless"]:  # coverage: condition never true
                driver = get_headless_driver()
            else:
                driver = get_driver()
            prime_scrapr = AmznScraper(driver)
            solution = prime_scrapr.solve_captcha()
            prime_scrapr.enter_captcha(solution)

        logging.info(f"{namespace}: No scraped data currently: {item.isbn}")
        description, dimension, image_urls = scrape_item(
            prime_scrapr, item.isbn10, description, dimension, image_urls
        )
        if len(image_urls) < IMG_FAILOVER_THRESHHOLD:  # coverage: condition never true
            failover_scrapr, item_id = get_failover_scraper_item_id(
                driver, vendr, item
            )
            if failover_scrapr:
                if isinstance(failover_scrapr, AmznScraper):
                    solution = failover_scrapr.solve_captcha()
                    failover_scrapr.enter_captcha(solution)
                description, dimension, image_urls = scrape_item(
                    failover_scrapr, item_id, description, dimension, image_urls
                )

        item.data["DESCRIPTION"] = description
        item.data["DIMENSION"] = dimension
        item.image_urls = image_urls

    logging.info(f"{namespace}: Saving scraped item data")
    items_obj.save_scraped_data(scraped_items_db)
    if driver:  # coverage: condition never false
        logging.info(f"{namespace}: Closing browser...")
        driver.quit()
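
# This module exposes main() rather than running anything on import; the CLI
# wiring is assumed to live elsewhere in the package. A hedged sketch of a
# direct invocation (all argument values are placeholders, not real ids):
#
#     if __name__ == "__main__":
#         main("tb", "<google-sheet-id>", "Sheet1", "scraped_items.json")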