gruel.gruel
import inspect
import logging
import time
from typing import Any

import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class."""

    def __init__(self, name: str | None = None):
        self._name = name
        self._init_logger()
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to `__init__`, or the stem of the file this instance was defined in if one wasn't given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self):
        log_dir = Pathier.cwd() / "logs"
        # Don't error if the log directory already exists from a previous run.
        log_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logging.getLogger(self.name)
        if not self.logger.hasHandlers():
            handler = logging.FileHandler(
                (log_dir / self.name).with_suffix(".log"), encoding="utf-8"
            )
            handler.setFormatter(
                logging.Formatter(
                    "{levelname}|-|{asctime}|-|{message}",
                    style="{",
                    datefmt="%m/%d/%Y %I:%M:%S %p",
                )
            )
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )
        except Exception:
            # One blunt retry after a short pause before letting the error propagate.
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        ...

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. the first 10 results for an endpoint that returns JSON content:
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        ...     parsed = {}
        ...     parsed["thing1"] = item["element"].split()[0]
        ...     self.success_count += 1
        ...     return parsed
        ... except Exception:
        ...     self.logger.exception("message")
        ...     self.fail_count += 1
        ...     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
class Gruel:
Scraper base class.
name: str
Returns the name given to `__init__`, or the stem of the file this instance was defined in if one wasn't given.
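For illustration, suppose a subclass is defined in a (hypothetical) file named my_scraper.py:

>>> MyScraper("custom").name
'custom'
>>> MyScraper().name  # no name given; falls back to the defining file's stem
'my_scraper'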
def get_page(self, url: str, method: str = "get", headers: dict[str, str] = {}) -> requests.Response:
Request `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string. This can be overridden by supplying a user agent in the `headers` param.
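Because the supplied `headers` are merged on top of the generated agent, the supplied value wins. A quick sketch (the URL and agent string are placeholders):

>>> scraper = Gruel("demo")
>>> response = scraper.get_page(
...     "https://example.com", headers={"User-Agent": "my-bot/1.0"}
... )
>>> response.request.headers["User-Agent"]
'my-bot/1.0'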
def as_soup(self, response: requests.Response) -> BeautifulSoup:
Returns the text content of `response` as a `BeautifulSoup` object.
def get_soup(self, url: str, method: str = "get", headers: dict[str, str] = {}) -> BeautifulSoup:
Request `url` with `headers` and return a `BeautifulSoup` object.
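For example (assuming example.com still serves its usual placeholder page):

>>> soup = Gruel("demo").get_soup("https://example.com")
>>> soup.find("h1").text
'Example Domain'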
def clean_string(self, text: str) -> str:
Strip `\n\r\t` and whitespace from `text`.
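Only leading and trailing characters are removed; interior whitespace is untouched:

>>> Gruel("demo").clean_string("\n\t  Example Domain  \r\n")
'Example Domain'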
def get_parsable_items(self) -> list[ParsableItem]:
Get relevant webpages and extract raw data that needs to be parsed.

e.g. the first 10 results for an endpoint that returns JSON content:

>>> return self.get_page(some_url).json()[:10]
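A fuller sketch of an override for an HTML page rather than a JSON endpoint; the URL and CSS class here are hypothetical:

def get_parsable_items(self) -> list[ParsableItem]:
    # Every <div class="result"> on the listing page becomes one raw
    # item that scrape() hands to parse_item().
    soup = self.get_soup("https://example.com/listings")
    return soup.find_all("div", class_="result")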
def parse_item(self, item: ParsableItem) -> Any:
Parse `item` and return parsed data.

e.g.

>>> try:
...     parsed = {}
...     parsed["thing1"] = item["element"].split()[0]
...     self.success_count += 1
...     return parsed
... except Exception:
...     self.logger.exception("message")
...     self.fail_count += 1
...     return None
def scrape(self, parse_items_prog_bar_display: bool = False):
Run the scraper:

1. prescrape chores
2. get parsable items
3. parse items
4. store items
5. postscrape chores
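Putting the pieces together, a minimal end-to-end subclass. Everything page-specific (the URL, tag names, CSS classes, and the BookScraper name itself) is hypothetical; note that `parse_item` returning `None` tells the parse loop not to store that item:

from typing import Any

from gruel.gruel import Gruel, ParsableItem


class BookScraper(Gruel):
    """Hypothetical scraper for a book listing page."""

    def __init__(self):
        super().__init__()
        self.books: list[dict] = []

    def get_parsable_items(self) -> list[ParsableItem]:
        soup = self.get_soup("https://example.com/books")
        return soup.find_all("article", class_="book")

    def parse_item(self, item: ParsableItem) -> Any:
        # Items from get_parsable_items() are bs4 Tags here.
        try:
            parsed = {
                "title": self.clean_string(item.find("h2").text),
                "price": self.clean_string(item.find("span", class_="price").text),
            }
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse a book listing.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        self.books.append(item)


if __name__ == "__main__":
    scraper = BookScraper()
    scraper.scrape(parse_items_prog_bar_display=True)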