Coverage for src/artemis_sg/scraper.py: 78% (677 statements)
coverage.py v7.3.1, created at 2024-03-06 08:01 -0800
import logging
import os.path
import re
import time  # for additional sleeps in page load. This is a smell.
import urllib.parse

from rich.console import Console
from rich.text import Text

# Selenium
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    ElementNotInteractableException,
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)

# Chrome
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys as SeleniumKeys
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from artemis_sg import spreadsheet, vendor
from artemis_sg.config import CFG
from artemis_sg.items import Items

# Firefox
# from selenium.webdriver.firefox.service import Service as FirefoxService

MODULE = os.path.splitext(os.path.basename(__file__))[0]
console = Console()

IMG_FAILOVER_THRESHOLD = 2
class BaseScraper:
    """
    Scraper objects know how to scrape a base URL.
    """

    def __init__(self, selenium_driver, base_url=None):
        self.selenium_driver = selenium_driver
        if not base_url:
            self.base_url = ""
        else:
            self.base_url = base_url
        self.login_xpath_query = ""

    def load_item_page(self, item_number):
        return False

    def scrape_description(self):
        description = ""
        return description

    def scrape_dimension(self):
        dimension = ""
        return dimension

    def scrape_item_image_urls(self):
        urls = []
        return urls

    def login(self):
        namespace = f"{type(self).__name__}.{self.login.__name__}"

        self.delay(2)
        input_text = Text(
            """
            ******** USER INPUT REQUIRED ********
            Locate the selenium controlled browser
            and manually enter your login credentials.
            ******** WAITING FOR USER INPUT ********
            """
        )
        input_text.stylize("bold cyan")
        console.print(input_text)
        try:
            WebDriverWait(
                self.selenium_driver, CFG["asg"]["scraper"]["login_timeout"]
            ).until(
                ec.presence_of_element_located((By.XPATH, self.login_xpath_query))
            )
            success_text = Text(
                """
                ******** LOGIN SUCCESSFUL ********
                ******** CONTINUING EXECUTION ********
                """
            )
            success_text.stylize("green")
            console.print(success_text)
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to login")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def delay(self, secs):
        time.sleep(secs)
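
# Subclassing sketch (illustrative only, not part of the original module):
# each vendor scraper below overrides the BaseScraper hooks and sets
# login_xpath_query so that BaseScraper.login() can detect a successful
# manual login.  "example.com" and the URL scheme below are placeholders.
#
#     class ExampleScraper(BaseScraper):
#         def __init__(self, selenium_driver, base_url="https://example.com"):
#             super().__init__(selenium_driver, base_url)
#             self.login_xpath_query = "//a[@href='/account']"
#
#         def load_item_page(self, item_number):
#             self.selenium_driver.get(f"{self.base_url}/product/{item_number}")
#             return True
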
class GJScraper(BaseScraper):
    """
    GJScraper objects know how to scrape GJ item pages
    """

    def __init__(self, selenium_driver, base_url="https://greatjonesbooks.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3
        self.login_xpath_query = "//a[@href='/account']"

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"

        # GJ does not maintain the session unless the on-page links are used.
        # If not logged in, build the url directly; else use the search facility.
        try:
            self.delay(1)
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//a[@href='/account' and text()='Account Summary']")
                )
            )
        except (NoSuchElementException, TimeoutException):
            start = "/product/"
            url = self.base_url + start + item_number
            self.selenium_driver.get(url)
            return True
        try:
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//a[@href='/search']"))
            )
            search.click()
            self.delay(2)

            # wait until the Publisher list is populated
            # by finding a sentinel publisher
            sentinel = CFG["asg"]["scraper"]["gjscraper"]["sentinel_publisher"]
            timeout_bak = self.timeout
            self.timeout = 60
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, f"//option[@value='{sentinel}']")
                )
            )
            self.timeout = timeout_bak
            # then get the itemCode field for the search
            item_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, "//input[@name='itemCode']"))
            )
            search_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(1)"
            )
            clear_button = self.selenium_driver.find_element(
                By.CSS_SELECTOR, ".buttonSet > button:nth-child(2)"
            )
            clear_button.click()
            item_field.send_keys(item_number)
            self.delay(2)
            search_button.click()
            self.delay(2)
            # check for No Results
            e = self.selenium_driver.find_element(
                By.XPATH, "//div[@class='formBox']/div"
            )
            if "No Results" in e.text:
                # Do not continue to try
                logging.info(f"{namespace}: No Results found for {item_number}")
                return False
            items = self.selenium_driver.find_elements(By.ID, "product.item_id")
            items[0].click()
            return True
        except (NoSuchElementException, TimeoutException, IndexError):
            tries += 1
            if tries < self.timeout:
                return self.load_item_page(item_number, tries)
            else:
                logging.info(f"{namespace}: failed item search for {item_number}")
                return False

    def scrape_description(self):
        try:
            self.delay(1)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "desc"))
            )
            span = elem.find_element(By.CLASS_NAME, "short-comments")
            description = span.text
        except (NoSuchElementException, TimeoutException):
            description = ""

        return description

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"

        urls = []
        try:
            self.delay(1)
            # GJ appears to only have single cover images
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "cover"))
            )
            img = elem.find_element(By.TAG_NAME, "img")
            src = img.get_attribute("src")
            if src:
                urls.append(src)
        except (NoSuchElementException, TimeoutException) as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def load_login_page(self):
        # Load the search page while logged out in an attempt to get the
        # Publishers list to populate when the page is loaded after login.
        self.selenium_driver.get(self.base_url + "/search")
        self.delay(self.timeout)
        login = "/login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def add_to_cart(self, qty):
        # TODO: Can we DRY this up? Some duplication between scrapers.
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "on-hand")
        m = re.search(r"([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):
                qty = stock
        self.delay(1)
        try:
            # gather the html elements needed
            add_div = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "add"))
            )
            qty_field = add_div.find_element(By.XPATH, "//input[@name='qty']")

            qty_field.clear()
            qty_field.send_keys(qty + SeleniumKeys.ENTER)
        except (NoSuchElementException, TimeoutException) as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)

    def load_cart_page(self):
        # TODO: Can we DRY this up? Some duplication between scrapers.
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = self.selenium_driver.find_element(By.CLASS_NAME, "cart")
            cart.click()
            self.delay(1)
            cart.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False
        return True

    def scrape_error_msg(self):
        try:
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "errorMsg")
            msg = elem.text
        except NoSuchElementException:
            msg = ""
        return msg
class SDScraper(BaseScraper):
    """
    SDScraper objects know how to scrape SD item pages
    """

    def __init__(self, selenium_driver, base_url="https://strathearndistribution.com"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3
        self.login_xpath_query = "//span[text()='My lists']"

    def load_login_page(self):
        namespace = f"{type(self).__name__}.{self.load_login_page.__name__}"
        try:
            self.selenium_driver.get(self.base_url)
            self.delay(2)
            button = self.selenium_driver.find_element(By.ID, "styled_btn")
            button.click()
        except (
            StaleElementReferenceException,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            logging.error(f"{namespace}: failed to load login page")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e

    def load_item_page(self, item_number, tries=0):
        namespace = f"{type(self).__name__}.{self.load_item_page.__name__}"
        try:
            self.selenium_driver.get(self.base_url)
            self.delay(2)
            search = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "search"))
            )
            search.send_keys(item_number + SeleniumKeys.ENTER)
            self.delay(2)
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "listItem"))
            )
            self.delay(2)
            elem.click()
            return True
        except (
            StaleElementReferenceException,
            NoSuchElementException,
            TimeoutException,
        ) as e:
            tries += 1
            if tries < self.timeout:
                return self.load_item_page(item_number, tries)
            else:
                logging.warning(
                    f"{namespace}: Failed to load item page '{item_number}': {e}"
                )
                return False

    def scrape_description(self):
        try:
            # rc-* IDs are dynamic, so classes must be used instead
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-nav-list")
            tab_btn = elem.find_element(By.CLASS_NAME, "ant-tabs-tab-btn")
            tab_btn.click()
            pane = self.selenium_driver.find_element(By.CLASS_NAME, "ant-tabs-tabpane")
            description = pane.text
        except NoSuchElementException:
            description = ""

        return description

    def scrape_dimension(self):
        try:
            dets_xpath = "//div[@class='ant-tabs-tab-btn'][text()='Details']"
            btn = self.selenium_driver.find_element(By.XPATH, dets_xpath)
            btn.click()
            elem = self.selenium_driver.find_element(
                By.XPATH, "//div[strong[contains(text(), 'Physical Dimensions:')]]"
            )
            t = elem.text
            dimension = t.replace("Physical Dimensions:\n", "")
        except NoSuchElementException:
            dimension = ""

        return dimension

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        urls = []
        try:
            # main image only
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "full-image"))
            )
            src = elem.get_attribute("src")
            if src:
                urls.append(src)
            # ensure we are seeing the top of the page
            html = self.selenium_driver.find_element(By.TAG_NAME, "html")
            html.send_keys(SeleniumKeys.PAGE_UP)
            # image gallery for additional images
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "gallery-vert")
            for elem in elems:
                src = elem.get_attribute("src")
                if src:
                    urls.append(src)
        except NoSuchElementException as e:
            logging.warning(f"{namespace}: error {e}")
        return urls

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        self.delay(1)
        # TODO: should this be wrapped in a try block?
        stock_elem = self.selenium_driver.find_element(
            By.XPATH, "//span[contains(text(), 'in stock')]"
        )
        m = re.search(r"([0-9]+) in stock", stock_elem.get_attribute("innerHTML"))
        if m:
            stock = m.group(1)
            if int(stock) < int(qty):
                qty = stock
        self.delay(1)
        try:
            # gather the html elements needed
            elems = self.selenium_driver.find_elements(By.CLASS_NAME, "ant-btn-primary")
            button = None
            for e in elems:
                if "Add to cart" in e.text:
                    button = e
                    break
            qty_field = self.selenium_driver.find_element(
                By.XPATH,
                (
                    "//input[@class='ant-input' and @type='text' "
                    "and not(ancestor::div[contains(@class, '-block')])]"
                ),
            )
            # the qty field must be clicked to highlight the amount;
            # clearing it does not work
            qty_field.click()
            qty_field.send_keys(qty)
            button.click()
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return int(qty)

    def load_cart_page(self):
        namespace = f"{type(self).__name__}.{self.load_cart_page.__name__}"
        try:
            cart = "/checkout/cart"
            url = self.base_url + cart
            self.selenium_driver.get(url)
            self.delay(1)
            return True
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return False
class TBScraper(BaseScraper):
    """
    TBScraper objects know how to scrape TB item pages
    """

    def __init__(self, selenium_driver, base_url="https://texasbookman.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 3
        self.login_xpath_query = "//a[@href='/admin']"

    def load_item_page(self, item_number):
        start = "p/"
        url = self.base_url + start + item_number
        self.selenium_driver.get(url)
        return True

    def scrape_description(self):
        try:
            elem = self.selenium_driver.find_element(
                By.CLASS_NAME, "variant-description"
            )
            text = elem.text
            description = text.replace("NO AMAZON SALES\n\n", "")
        except NoSuchElementException:
            description = ""

        return description

    def scrape_dimension(self):
        try:
            elem = self.selenium_driver.find_element(
                By.CLASS_NAME, "full-description"
            )
            m = re.search(r"(Size:.+)\n", elem.text)
            dimension = m.group(1).replace("Size:", "").strip()
        except (NoSuchElementException, AttributeError):
            dimension = ""

        return dimension

    def scrape_item_image_urls(self):
        urls = []
        try:
            elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "a-left"))
            )
            elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture-thumbs")
            left = elem.find_element(By.CLASS_NAME, "a-left")
            left.click()
            while True:
                self.delay(2)
                thumb = self._get_thumb_from_slimbox()
                if thumb:
                    urls.append(thumb)
                next_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                    ec.presence_of_element_located((By.ID, "lbNextLink"))
                )
                self.delay(2)
                next_link.click()
        except (
            NoSuchElementException,
            ElementNotInteractableException,
            TimeoutException,
        ):
            # no image gallery; fall back to the single product picture
            try:
                elem = self.selenium_driver.find_element(By.CLASS_NAME, "picture")
                img = elem.find_element(By.TAG_NAME, "img")
                thumb = img.get_attribute("src")
                urls.append(thumb)
            except NoSuchElementException:
                pass

        return urls

    def _get_thumb_from_slimbox(self):
        timeout = 3
        thumb = None
        try:
            img_div = WebDriverWait(self.selenium_driver, timeout).until(
                ec.presence_of_element_located((By.ID, "lbImage"))
            )
            style = img_div.get_attribute("style")
            m = re.search('"(.*)"', style)
            if m:
                thumb = m.group(1)
        except (NoSuchElementException, TimeoutException):
            pass

        return thumb
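
    # Worked example for _get_thumb_from_slimbox (illustrative markup, not
    # scraped data): the slimbox viewer exposes the full-size image only via
    # an inline style attribute along the lines of
    #     style='background-image: url("https://texasbookman.com/images/foo.jpg");'
    # so re.search('"(.*)"', style) recovers the quoted URL.
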
    def load_login_page(self):
        login = "login"
        url = self.base_url + login
        self.selenium_driver.get(url)

    def impersonate(self, email):
        namespace = f"{type(self).__name__}.{self.impersonate.__name__}"

        # Go to /Admin/Customer/List
        customers = "/Admin/Customer/List"
        url = self.base_url + customers
        self.selenium_driver.get(url)
        self.delay(1)
        try:
            # search for the email
            search_email = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "SearchEmail"))
            )
            search_email.clear()
            search_email.send_keys(email + SeleniumKeys.ENTER)
            # get the customer link associated with the email
            email_xpath = (
                f"//div[@id='customers-grid']/table/tbody/tr/td/a[text()='{email}']"
            )
            customer_link = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.XPATH, email_xpath))
            )
            links = self.selenium_driver.find_elements(By.XPATH, email_xpath)
            # Bail if there are multiple customer records for the given email.
            if len(links) > 1:
                logging.error(
                    f"{namespace}: Found multiple customer records for email "
                    f"'{email}' to impersonate"
                )
                logging.error(f"{namespace}: Cannot proceed. Exiting.")
                raise Exception
            customer_link.click()
            # click "Place order (Impersonate)"
            impersonate = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//a[text()='Place order (Impersonate)']")
                )
            )
            impersonate.click()
            # click the "Place Order" button
            button = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located(
                    (By.XPATH, "//input[@name='impersonate']")
                )
            )
            button.click()
            self.delay(1)
            WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "finish-impersonation"))
            )
        except (NoSuchElementException, TimeoutException) as e:
            logging.error(f"{namespace}: failed to impersonate")
            logging.error(f"{namespace}: Cannot proceed. Exiting.")
            raise e
        return True

    def add_to_cart(self, qty):
        namespace = f"{type(self).__name__}.{self.add_to_cart.__name__}"

        qty = int(qty)
        self.delay(1)
        stock_elem = self.selenium_driver.find_element(By.CLASS_NAME, "stock")
        m = re.search(r"Availability: ([0-9]+) in stock", stock_elem.text)
        if m:
            stock = m.group(1)
            stock = int(stock)
            if stock < qty:
                qty = stock
        try:
            # gather the html elements needed
            qty_field = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.CLASS_NAME, "qty-input"))
            )
            button = self.selenium_driver.find_element(
                By.CLASS_NAME, "add-to-cart-button"
            )
            qty_field.clear()
            # ENTERing out of the qty_field DOES NOT add to cart.
            # The button must be clicked instead.
            qty_field.send_keys(qty)
            button.click()
            self.delay(1)
        except Exception as e:
            logging.warning(f"{namespace}: error {e}")
            return 0
        return qty

    def load_cart_page(self):
        cart = "cart"
        url = self.base_url + cart
        self.selenium_driver.get(url)
        return True

    def search_item_num(self, search):
        namespace = f"{type(self).__name__}.{self.search_item_num.__name__}"

        item_num = ""
        search = urllib.parse.quote_plus(search)
        url = self.base_url + "search?q=" + search
        self.selenium_driver.get(url)
        self.delay(2)
        timeout_bak = self.timeout
        self.timeout = 120
        WebDriverWait(self.selenium_driver, self.timeout).until(
            ec.presence_of_element_located((By.CLASS_NAME, "search-results"))
        )
        self.timeout = timeout_bak
        links = self.selenium_driver.find_elements(
            By.XPATH, "//div[@class='search-results']//a[contains(@href, '/p/')]"
        )
        if links:
            item_urls = [x.get_attribute("href") for x in links]
            for item_url in item_urls:
                m = re.search(r"\/p\/([0-9]+)\/(?!uk-)", item_url)
                if m:
                    item_num = m.group(1)
                    break
        else:
            logging.warning(f"{namespace}: Failed to find item using q='{search}'")
        return item_num
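
    # Worked example of search_item_num's URL filter (illustrative URLs, not
    # live results): the negative lookahead (?!uk-) skips UK editions.
    #     https://texasbookman.com/p/12345/some-title    -> item_num "12345"
    #     https://texasbookman.com/p/67890/uk-some-title -> skipped
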
class AmznScraper(BaseScraper):
    """
    AmznScraper objects know how to scrape amazon item pages
    """

    def __init__(self, selenium_driver, base_url="https://www.amazon.com/"):
        super().__init__(selenium_driver, base_url)
        self.timeout = 1
        self.captcha_link = self.base_url + "errors/validateCaptcha"

    def solve_captcha(self, link=None):
        from amazoncaptcha import AmazonCaptcha

        if not link:
            link = self.captcha_link
        try:
            self.selenium_driver.get(link)
            captcha = AmazonCaptcha.fromdriver(self.selenium_driver)
            solution = captcha.solve()
            if solution.lower() == "not solved":
                raise NoSuchElementException
            return solution
        except (NoSuchElementException, TimeoutException):
            return ""
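
    # CAPTCHA flow sketch (illustrative; assumes the optional amazoncaptcha
    # package is importable and that network access is available):
    # solve_captcha() returns "" when automatic solving fails, in which case
    # enter_captcha("") falls back to waiting for manual entry in the browser.
    #
    #     scraper = AmznScraper(get_driver())
    #     solution = scraper.solve_captcha()
    #     scraper.enter_captcha(solution)
    #     scraper.load_item_page("B00EXAMPLE")   # placeholder ASIN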
    def enter_captcha(self, solution):
        namespace = f"{type(self).__name__}.{self.enter_captcha.__name__}"
        if solution:
            elem = self.selenium_driver.find_element(By.ID, "captchacharacters")
            elem.send_keys(solution + SeleniumKeys.ENTER)
        else:
            input_text = Text(
                """
                ******** USER INPUT REQUIRED ********
                Locate the selenium controlled browser
                and manually enter the requested CAPTCHA characters.
                ******** WAITING FOR USER INPUT ********
                """
            )
            input_text.stylize("bold cyan")
            console.print(input_text)
            try:
                timeout_bak = self.timeout
                self.timeout = self.timeout * 100
                WebDriverWait(self.selenium_driver, self.timeout).until(
                    ec.presence_of_element_located(
                        (By.XPATH, "//a[@href='/ref=nav_logo']")
                    )
                )
                self.timeout = timeout_bak
                success_text = Text(
                    """
                    ******** CAPTCHA SUCCESSFUL ********
                    ******** CONTINUING EXECUTION ********
                    """
                )
                success_text.stylize("green")
                console.print(success_text)
            except (NoSuchElementException, TimeoutException):
                logging.error(f"{namespace}: failed CAPTCHA")

    def load_item_page(self, item_number):
        start = "dp/"
        url = self.base_url + start + item_number
        self.selenium_driver.get(url)
        return True

    def scrape_description(self):
        description = self._scrape_amazon_editorial_review()
        if not description:
            description = self._scrape_amazon_description()

        return description

    def scrape_dimension(self):
        dimension = ""
        try:
            xpath = "//span/span[contains(text(), 'Dimensions')]//following::span"
            elem = self.selenium_driver.find_element(By.XPATH, xpath)
            dimension = elem.get_attribute("innerHTML")
        except NoSuchElementException:
            dimension = ""
        return dimension

    def _scrape_amazon_editorial_review(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "editorialReviews_feature_div"
            )
            text = elem.text
            descr_lines = re.split("^.*\\n.*\\n", text)  # trim off first two lines
            descr = descr_lines[-1]
        except NoSuchElementException:
            descr = ""

        return descr

    def _scrape_amazon_description(self):
        descr = ""
        try:
            elem = self.selenium_driver.find_element(
                By.ID, "bookDescription_feature_div"
            )
            # read_more = elem.find_element(By.CLASS_NAME, 'a-expander-prompt')
            # read_more.click()
            descr = elem.text
        except NoSuchElementException:
            descr = ""

        return descr

    def get_span_type_thumb_id_prefix(self):
        """Get span_type and thumb_id_prefix from the amazon images widget."""
        namespace = (
            f"{type(self).__name__}.{self.get_span_type_thumb_id_prefix.__name__}"
        )
        span_type = None
        thumb_id_prefix = None
        try:
            span = WebDriverWait(self.selenium_driver, self.timeout).until(
                ec.presence_of_element_located((By.ID, "imgThumbs"))
            )
            span_type = "imgThumbs"
        except (NoSuchElementException, TimeoutException):
            logging.info(f"{namespace}: No imgThumbs id, trying imgTagWrapperId")
            try:
                span = WebDriverWait(self.selenium_driver, self.timeout).until(
                    ec.presence_of_element_located((By.ID, "imgTagWrapperId"))
                )
                span_type = "imgTagWrapperId"
            except (NoSuchElementException, TimeoutException):
                logging.info(f"{namespace}: No imgTagWrapperId id")
                logging.info(f"{namespace}: Returning empty urls list")
                return (span_type, thumb_id_prefix)

        if span_type == "imgThumbs":
            link = span.find_element(By.CLASS_NAME, "a-link-normal")
            thumb_id_prefix = "ig-thumb-"
        else:
            link = span
            thumb_id_prefix = "ivImage_"
        try:
            link.click()
        except ElementClickInterceptedException:
            logging.info(f"{namespace}: Failed to click images widget")
            logging.info(f"{namespace}: Returning empty urls list")
            return (span_type, thumb_id_prefix)
        return (span_type, thumb_id_prefix)

    def scrape_item_image_urls(self):
        namespace = f"{type(self).__name__}.{self.scrape_item_image_urls.__name__}"
        counter = 0
        urls = []

        span_type, thumb_id_prefix = self.get_span_type_thumb_id_prefix()
        if thumb_id_prefix:
            logging.debug(f"{namespace}: Clicked images widget")
            # get the image urls
            while True:
                try:
                    thumb = ""
                    xpath = f"//*[@id='{thumb_id_prefix}{counter}']"
                    elem = WebDriverWait(self.selenium_driver, self.timeout).until(
                        ec.presence_of_element_located((By.XPATH, xpath))
                    )
                    if span_type == "imgThumbs":
                        thumb = elem.get_attribute("src")
                    if span_type == "imgTagWrapperId":
                        inner_elem = elem.find_element(By.CLASS_NAME, "ivThumbImage")
                        style = inner_elem.get_attribute("style")
                        m = re.search('"(.*)"', style)
                        if m:
                            thumb = m.group(1)
                    sub, suff = os.path.splitext(thumb)
                    indx = sub.find("._")
                    url = sub[:indx] + suff
                    if url:
                        urls.append(url)
                    logging.debug(f"{namespace}: Thumbnail src is {thumb}")
                    logging.debug(f"{namespace}: Full size URL is {url!r}")
                    counter += 1
                except (NoSuchElementException, TimeoutException):
                    break
        # Amazon appends an image of a human holding the book;
        # drop the last url to remove it.
        if len(urls) > 1:
            urls.pop()

        return urls
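
    # Worked example of the thumbnail-to-full-size conversion above
    # (illustrative URL, not scraped data):
    #     thumb = "https://m.media-amazon.com/images/I/51abc._SX38_SY50_.jpg"
    #     os.path.splitext(thumb) -> ("...51abc._SX38_SY50_", ".jpg")
    #     slicing off everything from "._" onward yields "...51abc.jpg",
    #     which resolves to the full-size image.
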
class AmznUkScraper(AmznScraper):
    """
    AmznUkScraper objects know how to scrape amazon.co.uk item pages
    """

    def __init__(self, selenium_driver, base_url="https://www.amazon.co.uk/"):
        super().__init__(selenium_driver, base_url)

    def decline_cookies(self):
        try:
            decline_button = self.selenium_driver.find_element(
                By.ID, "sp-cc-rejectall-link"
            )
            decline_button.click()
            self.delay(2)
            return True
        except (NoSuchElementException, TimeoutException):
            return False

    def load_item_page(self, isbn):
        # Search by ISBN
        start = "s?isbn="
        url = self.base_url + start + isbn
        self.selenium_driver.get(url)
        self.decline_cookies()
        # Look for results
        elem = WebDriverWait(self.selenium_driver, self.timeout).until(
            ec.presence_of_element_located((By.CLASS_NAME, "s-result-list"))
        )
        # Get the ASIN from the first result
        inner_e = elem.find_element(By.CLASS_NAME, "a-link-normal")
        link = inner_e.get_attribute("href")
        m = re.search(r"\/dp\/([0-9A-Z]+)/", link)
        asin = m.group(1) if m else ""
        # Load the ASIN page
        if asin:
            start = "dp/"
            url = self.base_url + start + asin
            self.selenium_driver.get(url)
            return True
        return False
##############################################################################
# utility functions
##############################################################################
def get_headless_driver():
    return get_driver("--headless=new")


def get_driver(option_args: str = ""):
    """Create a new instance of the Chrome driver.

    :param option_args:
        Option arguments to pass to the driver
    :returns: selenium.webdriver object
    """
    namespace = f"{MODULE}.{get_driver.__name__}"
    service = ChromeService()
    options = webdriver.ChromeOptions()
    if option_args:
        options.add_argument(option_args)
        logging.info(f"{namespace}: Setting webdriver option to '{option_args}'.")
    driver = webdriver.Chrome(service=service, options=options)
    return driver
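
# Usage sketch (assumes chromedriver is resolvable by Selenium, e.g. via
# Selenium Manager or PATH; the URL is a placeholder):
#
#     driver = get_headless_driver()   # or get_driver() for a visible browser
#     try:
#         driver.get("https://example.com")
#     finally:
#         driver.quit()
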
def scrape_item(scrapr, item_id, description="", dimension="", image_urls=None):
    if image_urls is None:
        image_urls = []
    namespace = f"{MODULE}.{scrape_item.__name__}"
    scrapr.load_item_page(item_id)
    logging.info(
        f"{namespace}: Getting item image urls via {scrapr.__class__.__name__}"
    )
    l_image_urls = scrapr.scrape_item_image_urls()
    if image_urls and len(l_image_urls) > 1:
        l_image_urls.pop(0)
    image_urls = image_urls + l_image_urls
    logging.info(" URLs: %r" % image_urls)
    if image_urls and not description:
        logging.info(
            f"{namespace}: Getting description via {scrapr.__class__.__name__}"
        )
        description = scrapr.scrape_description()
        logging.info(" Description: %r" % description[:140])
    if image_urls and not dimension:
        logging.info(
            f"{namespace}: Getting dimension via {scrapr.__class__.__name__}"
        )
        dimension = scrapr.scrape_dimension()
        logging.info(" Dimension: %r" % dimension[:140])
    return description, dimension, image_urls
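
# Usage sketch (placeholder ISBN-10; a live driver and network access are
# required):
#
#     scrapr = AmznScraper(get_driver())
#     description, dimension, urls = scrape_item(scrapr, "0000000000")
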
def get_failover_scraper_item_id(driver, vendr, item):
    namespace = f"{MODULE}.{get_failover_scraper_item_id.__name__}"
    failover_scrapr = None
    item_id = item.isbn
    if vendr.vendor_code == "tb":
        try:
            url = item.data["LINK"]
            m = re.search(r"\/([0-9]+)\/", url)
            if m:
                item_id = m.group(1)
        except KeyError:
            logging.error(f"{namespace}: No link found in item")
    if vendr.failover_scraper in globals():
        failover_scrapr = globals()[vendr.failover_scraper](driver)
    return failover_scrapr, item_id
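
# Example of the globals() dispatch above (hypothetical vendor record): if
# vendr.failover_scraper is the string "TBScraper", then
# globals()["TBScraper"](driver) instantiates the class defined in this
# module; any name not defined here leaves failover_scrapr as None.
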
def main(vendor_code, sheet_id, worksheet, scraped_items_db):  # noqa: C901
    namespace = f"{MODULE}.{main.__name__}"
    # get vendor info from the database
    logging.debug(f"{namespace}: Instantiate vendor.")
    vendr = vendor.Vendor(vendor_code)
    vendr.set_vendor_data()

    sheet_data = spreadsheet.get_sheet_data(sheet_id, worksheet)

    sheet_keys = [x for x in sheet_data.pop(0) if x]  # filter out None
    items_obj = Items(sheet_keys, sheet_data, vendr.isbn_key)
    items_obj.load_scraped_data(scraped_items_db)
    driver = None
    prime_scrapr = None
    failover_scrapr = None
    for item in items_obj:
        if not item.isbn:
            if "TBCODE" in item.data:
                item.isbn = item.data["TBCODE"]
            if not item.isbn:
                logging.info(f"{namespace}: No isbn for item, skipping lookup")
                continue
        description = ""
        dimension = ""
        image_urls = []
        # if the scraped item's image_urls is not empty, skip the item
        logging.info(f"{namespace}: Searching for {item.isbn} ...")
        if item.image_urls != []:
            logging.info(f"{namespace}: {item.isbn} found in database, skipping")
            continue

        if not driver and not prime_scrapr:
            logging.info(f"{namespace}: Opening browser...")
            if CFG["asg"]["scraper"]["headless"]:
                driver = get_headless_driver()
            else:
                driver = get_driver()
            prime_scrapr = AmznScraper(driver)
            solution = prime_scrapr.solve_captcha()
            prime_scrapr.enter_captcha(solution)

        logging.info(f"{namespace}: No scraped data currently: {item.isbn}")
        description, dimension, image_urls = scrape_item(
            prime_scrapr, item.isbn10, description, dimension, image_urls
        )
        if len(image_urls) < IMG_FAILOVER_THRESHOLD:
            failover_scrapr, item_id = get_failover_scraper_item_id(
                driver, vendr, item
            )
            if failover_scrapr:
                if isinstance(failover_scrapr, AmznScraper):
                    solution = failover_scrapr.solve_captcha()
                    failover_scrapr.enter_captcha(solution)
                description, dimension, image_urls = scrape_item(
                    failover_scrapr, item_id, description, dimension, image_urls
                )

        item.data["DESCRIPTION"] = description
        item.data["DIMENSION"] = dimension
        item.image_urls = image_urls

        logging.info(f"{namespace}: Saving scraped item data")
        items_obj.save_scraped_data(scraped_items_db)
    if driver:
        logging.info(f"{namespace}: Closing browser...")
        driver.quit()
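
# Usage sketch (placeholder arguments; normally invoked through the
# artemis_sg CLI): vendor_code must exist in the vendor database, sheet_id
# and worksheet identify the spreadsheet to read, and scraped_items_db is
# the path of the scraped-items datafile to update.
#
#     main("sample", "<sheet-id>", "Sheet1", "scraped_items.json")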