gruel.gruel
import inspect
import time
from typing import Any

import loggi
import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class."""

    def __init__(self, name: str | None = None):
        self._name = name
        self._init_logger()
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to `__init__` or, if one wasn't given, the stem of the file this instance was defined in."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self):
        log_dir = Pathier.cwd() / "gruel_logs"
        log_dir.mkdir()
        self.logger = loggi.getLogger(self.name)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )
        except Exception:
            # Wait a beat, then retry once before letting the exception propagate.
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        ...

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                # Show elapsed runtime alongside the progress bar.
                bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
class Gruel:
Scraper base class.
name: str
Returns the name given to `__init__` or, if one wasn't given, the stem of the file this instance was defined in.
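For example, given a hypothetical file my_scraper.py:

# my_scraper.py (hypothetical)
from gruel.gruel import Gruel

class MyScraper(Gruel):
    ...

MyScraper().name        # -> "my_scraper" (stem of the defining file)
MyScraper("jobs").name  # -> "jobs"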
def get_page(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> requests.models.Response:
Request `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the `headers` param.
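For illustration, a minimal sketch of fetching a hypothetical endpoint; because custom headers are merged over the default, passing your own "User-Agent" key would replace the randomized one:

from gruel.gruel import Gruel

scraper = Gruel("example")
response = scraper.get_page(
    "https://example.com/api/items",  # hypothetical URL
    headers={"Accept": "application/json"},  # merged alongside the random user agent
)
response.raise_for_status()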
def as_soup(self, response: requests.models.Response) -> bs4.BeautifulSoup:
Returns the text content of `response` as a `BeautifulSoup` object.
def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:
Request `url` with `headers` and return a `BeautifulSoup` object.
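A quick sketch of typical usage (the URL is hypothetical):

from gruel.gruel import Gruel

scraper = Gruel("example")
soup = scraper.get_soup("https://example.com/blog")  # hypothetical URL
headlines = [h2.text for h2 in soup.find_all("h2")]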
def clean_string(self, text: str) -> str:
Strip `\n\r\t` and whitespace from `text`.
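For instance:

from gruel.gruel import Gruel

Gruel("example").clean_string("\n\t  gruel 1.0  \r\n")  # -> "gruel 1.0"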
def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:
Get relevant webpages and extract raw data that needs to be parsed.
e.g. first 10 results for an endpoint that returns json content
>>> return self.get_page(some_url).json()[:10]
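As a sketch (not from the library), a subclass scraping an HTML listing rather than a JSON endpoint might override it like this; the URL and CSS class are hypothetical:

# Inside a Gruel subclass:
def get_parsable_items(self) -> list[ParsableItem]:
    # Each product card on the listing page becomes one parsable item.
    soup = self.get_soup("https://example.com/products")  # hypothetical URL
    return soup.find_all("div", class_="product")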
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:
Parse `item` and return parsed data.

e.g.
>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
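Continuing the hypothetical product-card sketch from `get_parsable_items` above, a matching override might be:

# Inside the same Gruel subclass:
def parse_item(self, item: Tag) -> dict | None:
    try:
        parsed = {
            "name": self.clean_string(item.find("h3").text),
            "price": self.clean_string(item.find("span", class_="price").text),
        }
        self.success_count += 1
        return parsed
    except Exception:
        # A missing element raises here; log it and count the failure.
        self.logger.exception("Failed to parse product card.")
        self.fail_count += 1
        return None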
def scrape(self, parse_items_prog_bar_display: bool = False):
Run the scraper:
- prescrape chores
- get parsable items
- parse items
- store items
- postscrape chores
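Tying the lifecycle together, a minimal end-to-end subclass might look like this sketch; the endpoint, record fields, and output file are assumptions for illustration, not part of the library:

import json
from pathlib import Path

from gruel.gruel import Gruel, ParsableItem


class QuoteScraper(Gruel):
    def prescrape_chores(self):
        # Accumulate parsed records in memory for this run.
        self.parsed: list[dict] = []

    def get_parsable_items(self) -> list[ParsableItem]:
        # Hypothetical endpoint returning a JSON list of quote records.
        return self.get_page("https://example.com/api/quotes").json()

    def parse_item(self, item: dict) -> dict | None:
        try:
            parsed = {
                "quote": self.clean_string(item["text"]),
                "author": item["author"],
            }
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse quote record.")
            self.fail_count += 1
            return None

    def store_item(self, item: dict):
        self.parsed.append(item)

    def postscrape_chores(self):
        # Dump everything collected during the run.
        Path("quotes.json").write_text(json.dumps(self.parsed, indent=2))


if __name__ == "__main__":
    QuoteScraper().scrape(parse_items_prog_bar_display=True)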