gruel.grueler
import inspect
import time
from typing import Any

import loggi
import requests
import whosyouragent
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores`, it's recommended to either
    call `super().postscrape_chores()` or make sure to close `self.logger` (e.g. `loggi.close(self.logger)`).
    Otherwise, running a large number of scrapers can cause file handle limit issues."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
        i.e. A `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0
        self.failed_to_get_parsable_items = False
        self.unexpected_failure_occured = False
        self.parsable_items = []
        self.parsed_items = []

    @property
    def name(self) -> str:
        """Returns the name given to __init__ or the stem of the file this class was defined in if one wasn't given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    @property
    def had_failures(self) -> bool:
        """`True` if getting parsable items, parsing items, or unexpected failures occurred."""
        return (
            (self.fail_count > 0)
            or self.failed_to_get_parsable_items
            or self.unexpected_failure_occured
        )

    def _init_logger(self, log_dir: Pathish | None):
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    @staticmethod
    def request(
        url: str,
        method: str = "get",
        headers: dict[str, str] = {},
        params: dict | None = None,
        data: dict | None = None,
        timeout: int | None = None,
        retry_on_fail: bool = True,
    ) -> requests.Response:
        """Send a request to `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param.

        If `retry_on_fail` is `True`, the request will be repeated after 1 second if the original request causes an exception to be thrown.
        Otherwise, the exception will be raised."""
        args = [method, url]
        headers = whosyouragent.get_header() | headers
        kwargs = {
            "headers": headers,
            "timeout": timeout,
            "params": params,
            "data": data,
        }
        try:
            response = requests.request(*args, **kwargs)
            return response
        except Exception as e:
            if retry_on_fail:
                time.sleep(1)
                return requests.request(*args, **kwargs)
            else:
                raise e

    @staticmethod
    def as_soup(response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return `BeautifulSoup` object."""
        return self.as_soup(self.request(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    # |==============================================================================|
    # Overridables
    # |==============================================================================|
    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.request(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self):
        for item in self.parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)
            # Append to `self.parsed_items` even if `None`
            # so `parsable_items` and `parsed_items` are equal length
            self.parsed_items.append(parsed_item)

    def _parse_items_prog_bar(self):
        with ProgBar(len(self.parsable_items)) as bar:
            for item in self.parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                bar.display(f"{bar.runtime}")
                # Append to `self.parsed_items` even if `None`
                # so `parsable_items` and `parsed_items` are equal length
                self.parsed_items.append(parsed_item)

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse and store items
        4. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                self.parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {(len(self.parsable_items))} items"
                )
            except Exception:
                self.failed_to_get_parsable_items = True
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar()
                else:
                    self._parse_items_no_prog_bar()
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.unexpected_failure_occured = True
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
Scraper base class.

Classes subclassing `Gruel` need to implement the following methods:

* `get_parsable_items(self) -> list[Any]`
* `parse_item(self, item: Any) -> Any`
* `store_item(self, item: Any)`

Calling the `scrape()` method will execute:

1. `self.prescrape_chores()` (does nothing unless overridden)
2. `self.get_parsable_items()`
3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
4. `self.store_item()` for each successfully parsed item
5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

When overriding `self.postscrape_chores`, it's recommended to either call `super().postscrape_chores()` or make sure to close `self.logger` (e.g. `loggi.close(self.logger)`). Otherwise, running a large number of scrapers can cause file handle limit issues.
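To make that contract concrete, here is a minimal sketch of a subclass. The URL, CSS selectors, and field name are hypothetical and not part of `gruel`, and it assumes `Gruel` and `ParsableItem` are importable from the package root:

from typing import Any

from gruel import Gruel, ParsableItem


class MyScraper(Gruel):
    """Hypothetical scraper used only to illustrate the three required overrides."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stored: list[dict] = []

    def get_parsable_items(self) -> list[ParsableItem]:
        # Fetch one page and return the raw elements to be parsed.
        soup = self.get_soup("https://example.com/listings")
        return soup.select("div.listing")

    def parse_item(self, item: ParsableItem) -> Any:
        try:
            parsed = {"title": self.clean_string(item.select_one("h2").text)}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        # Keep parsed items in memory; a real scraper might write to a database instead.
        self.stored.append(item)

Calling `MyScraper().scrape()` then runs the five steps above in order.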
def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
    """
    :params:
    * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
    i.e. A `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
    * `log_dir`: The directory this scraper's logs should be saved to.
    If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
    """
    self._name = name
    self._init_logger(log_dir)
    self.timer = Timer()
    self.success_count = 0
    self.fail_count = 0
    self.failed_to_get_parsable_items = False
    self.unexpected_failure_occured = False
    self.parsable_items = []
    self.parsed_items = []
:params:

* `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in, i.e. a `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
* `log_dir`: The directory this scraper's logs should be saved to. If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
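For example, with the hypothetical `MyScraper` from the class overview (paths are illustrative):

scraper = MyScraper()                        # name -> "myscraper", logs -> ./gruel_logs/
scraper = MyScraper("jobs", log_dir="logs")  # explicit name and log directory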
Returns the name given to `__init__`, or the stem of the file this class was defined in if one wasn't given.
@staticmethod
def request(
    url: str,
    method: str = "get",
    headers: dict[str, str] = {},
    params: dict | None = None,
    data: dict | None = None,
    timeout: int | None = None,
    retry_on_fail: bool = True,
) -> requests.Response:
    """Send a request to `url` and return the `requests.Response` object.

    By default, the only header sent is a randomized user agent string.

    This can be overridden by supplying a user agent in the `headers` param.

    If `retry_on_fail` is `True`, the request will be repeated after 1 second if the original request causes an exception to be thrown.
    Otherwise, the exception will be raised."""
    args = [method, url]
    headers = whosyouragent.get_header() | headers
    kwargs = {
        "headers": headers,
        "timeout": timeout,
        "params": params,
        "data": data,
    }
    try:
        response = requests.request(*args, **kwargs)
        return response
    except Exception as e:
        if retry_on_fail:
            time.sleep(1)
            return requests.request(*args, **kwargs)
        else:
            raise e
Send a request to `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string. This can be overridden by supplying a user agent in the `headers` param.

If `retry_on_fail` is `True`, the request will be repeated after 1 second if the original request causes an exception to be thrown. Otherwise, the exception will be raised.
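A usage sketch; the URL and header values are placeholders:

# `request` is a staticmethod, so it can be called with or without an instance.
response = Gruel.request(
    "https://example.com/api/items",
    headers={"Accept": "application/json"},  # merged on top of the random user agent
    timeout=10,
)
if response.status_code == 200:
    data = response.json()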
@staticmethod
def as_soup(response: requests.Response) -> BeautifulSoup:
    """Returns the text content of `response` as a `BeautifulSoup` object."""
    return BeautifulSoup(response.text, "html.parser")
Returns the text content of `response` as a `BeautifulSoup` object.
def get_soup(
    self, url: str, method: str = "get", headers: dict[str, str] = {}
) -> BeautifulSoup:
    """Request `url` with `headers` and return `BeautifulSoup` object."""
    return self.as_soup(self.request(url, method, headers))
Request `url` with `headers` and return a `BeautifulSoup` object.
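For example, inside a `Gruel` subclass method (URL and selector are made up):

soup = self.get_soup("https://example.com/blog")
titles = [self.clean_string(tag.text) for tag in soup.select("h2.post-title")]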
def clean_string(self, text: str) -> str:
    """Strip `\\n\\r\\t` and whitespace from `text`."""
    return text.strip(" \n\t\r")
Strip `\n\r\t` and whitespace from `text`.
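Note that only leading and trailing characters are removed; interior whitespace is left alone:

>>> self.clean_string("\n\t  Data Engineer - Remote  \r\n")
'Data Engineer - Remote'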
def postscrape_chores(self):
    """Chores to do after scraping."""
    loggi.close(self.logger)
Chores to do after scraping.
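A typical override does its own cleanup and then defers to the base class so the log file still gets closed (the `db` attribute here is hypothetical):

def postscrape_chores(self):
    self.db.commit()  # hypothetical per-scraper cleanup
    super().postscrape_chores()  # closes this scraper's log file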
def get_parsable_items(self) -> list[ParsableItem]:
    """Get relevant webpages and extract raw data that needs to be parsed.

    e.g. first 10 results for an endpoint that returns json content
    >>> return self.request(some_url).json()[:10]"""
    raise NotImplementedError
Get relevant webpages and extract raw data that needs to be parsed.

e.g. the first 10 results for an endpoint that returns JSON content:
>>> return self.request(some_url).json()[:10]
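A slightly fuller sketch for a paginated JSON endpoint; the URL, page range, and "results" key are hypothetical:

def get_parsable_items(self) -> list[ParsableItem]:
    items = []
    for page in range(1, 4):
        response = self.request("https://example.com/api/jobs", params={"page": page})
        items.extend(response.json()["results"])
    return items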
def parse_item(self, item: ParsableItem) -> Any:
    """Parse `item` and return parsed data.

    e.g.
    >>> try:
    >>>     parsed = {}
    >>>     parsed["thing1"] = item["element"].split()[0]
    >>>     self.success_count += 1
    >>>     return parsed
    >>> except Exception:
    >>>     self.logger.exception("message")
    >>>     self.fail_count += 1
    >>>     return None"""
    raise NotImplementedError
Parse `item` and return parsed data.

e.g.
>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
def scrape(self, parse_items_prog_bar_display: bool = False):
    """Run the scraper:
    1. prescrape chores
    2. get parsable items
    3. parse and store items
    4. postscrape chores"""
    try:
        self.timer.start()
        self.logger.info("Scrape started.")
        self.prescrape_chores()
        try:
            self.parsable_items = self.get_parsable_items()
            self.logger.info(
                f"{self.name}:get_parsable_items() returned {(len(self.parsable_items))} items"
            )
        except Exception:
            self.failed_to_get_parsable_items = True
            self.logger.exception(f"Error in {self.name}:get_parsable_items().")
        else:
            if parse_items_prog_bar_display:
                self._parse_items_prog_bar()
            else:
                self._parse_items_no_prog_bar()
            self.logger.info(
                f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
            )
    except Exception:
        self.unexpected_failure_occured = True
        self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
    self.postscrape_chores()
Run the scraper:
- prescrape chores
- get parsable items
- parse and store items
- postscrape chores
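Putting it together with the hypothetical `MyScraper` from the class overview:

scraper = MyScraper()
scraper.scrape(parse_items_prog_bar_display=True)
if scraper.had_failures:
    print("Some items failed; check the log in ./gruel_logs.")
print(f"{scraper.success_count} successes, {scraper.fail_count} failures.")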