gruel.gruel
import inspect
import time
from typing import Any

import loggi
import requests
import whosyouragent
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores`, it's recommended to either
    call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`.
    Otherwise, running a large number of scrapers can cause file handle limit issues."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
        i.e. A `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self, log_dir: Pathish | None):
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    def get_page(
        self,
        url: str,
        method: str = "get",
        headers: dict[str, str] = {},
        timeout: int | None = None,
        retry_on_fail: bool = True,
        params: dict | None = None,
        data: dict | None = None,
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        args = [method, url]
        headers = whosyouragent.get_header() | headers
        kwargs = {
            "headers": headers,
            "timeout": timeout,
            "params": params,
            "data": data,
        }
        try:
            response = requests.request(*args, **kwargs)
            return response
        except Exception as e:
            if retry_on_fail:
                time.sleep(1)
                return requests.request(*args, **kwargs)
            else:
                raise e

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>> parsed = {}
        >>> parsed["thing1"] = item["element"].split()[0]
        >>> self.success_count += 1
        >>> return parsed
        >>> except Exception:
        >>> self.logger.exception("message")
        >>> self.fail_count += 1
        >>> return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {(len(parsable_items))} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
Scraper base class.
Classes subclassing `Gruel` need to implement the following methods:

* `get_parsable_items(self) -> list[Any]`
* `parse_item(self, item: Any) -> Any`
* `store_item(self, item: Any)`

Calling the `scrape()` method will execute:

1. `self.prescrape_chores()` (does nothing unless overridden)
2. `self.get_parsable_items()`
3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
4. `self.store_item()` for each successfully parsed item
5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

When overriding `self.postscrape_chores`, it's recommended to either call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`. Otherwise, running a large number of scrapers can cause file handle limit issues.
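As an illustration of that workflow (a sketch, not part of the library), a minimal subclass might look like the following; the endpoint URL, field names, and the `ExampleScraper` name are placeholders:

from typing import Any

from gruel.gruel import Gruel


class ExampleScraper(Gruel):
    # Hypothetical JSON endpoint; replace with a real target.
    url = "https://example.com/api/items"

    def prescrape_chores(self):
        # Hypothetical setup step: collect parsed records in memory.
        self.items = []

    def get_parsable_items(self) -> list[Any]:
        return self.get_page(self.url).json()

    def parse_item(self, item: dict) -> Any:
        try:
            parsed = {"name": self.clean_string(item["name"]), "price": item["price"]}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: dict):
        self.items.append(item)

Calling `ExampleScraper().scrape()` then runs the five steps above and, by default, writes its log to a `gruel_logs` folder in the current working directory.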
def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
    """
    :params:
    * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in.
    i.e. A `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
    * `log_dir`: The directory this scraper's logs should be saved to.
    If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
    """
    self._name = name
    self._init_logger(log_dir)
    self.timer = Timer()
    self.success_count = 0
    self.fail_count = 0
:params:
* `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in. i.e. A `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
* `log_dir`: The directory this scraper's logs should be saved to. If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
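For example (hypothetical values, reusing the `ExampleScraper` sketch from above; any path-like `log_dir` works):

scraper = ExampleScraper(name="example", log_dir="logs/scrapers")
print(scraper.name)  # "example" rather than the stem of the defining file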
Returns the name given to `__init__`, or the stem of the file this instance was defined in if one wasn't given.
def get_page(
    self,
    url: str,
    method: str = "get",
    headers: dict[str, str] = {},
    timeout: int | None = None,
    retry_on_fail: bool = True,
    params: dict | None = None,
    data: dict | None = None,
) -> requests.Response:
    """Request `url` and return the `requests.Response` object.

    By default, the only header sent is a randomized user agent string.

    This can be overridden by supplying a user agent in the `headers` param."""
    args = [method, url]
    headers = whosyouragent.get_header() | headers
    kwargs = {
        "headers": headers,
        "timeout": timeout,
        "params": params,
        "data": data,
    }
    try:
        response = requests.request(*args, **kwargs)
        return response
    except Exception as e:
        if retry_on_fail:
            time.sleep(1)
            return requests.request(*args, **kwargs)
        else:
            raise e
Request `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string. This can be overridden by supplying a user agent in the `headers` param.
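For instance, a request that supplies its own user agent and a timeout (the URL and agent string are placeholders; a bare `Gruel` instance is used for brevity):

# The supplied User-Agent replaces the randomized default.
response = Gruel().get_page(
    "https://example.com/listings",
    headers={"User-Agent": "my-scraper/1.0"},
    timeout=10,
)
print(response.status_code)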
def as_soup(self, response: requests.Response) -> BeautifulSoup:
    """Returns the text content of `response` as a `BeautifulSoup` object."""
    return BeautifulSoup(response.text, "html.parser")
Returns the text content of `response` as a `BeautifulSoup` object.
def get_soup(
    self, url: str, method: str = "get", headers: dict[str, str] = {}
) -> BeautifulSoup:
    """Request `url` with `headers` and return `BeautifulSoup` object."""
    return self.as_soup(self.get_page(url, method, headers))
Request `url` with `headers` and return a `BeautifulSoup` object.
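A typical call (the URL and CSS selector are placeholders):

# Hypothetical usage: fetch a page and pull text out of it.
soup = Gruel().get_soup("https://example.com/jobs")
titles = [tag.text for tag in soup.select("h2.title")]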
def clean_string(self, text: str) -> str:
    """Strip `\\n\\r\\t` and whitespace from `text`."""
    return text.strip(" \n\t\r")
Strip `\n\r\t` and whitespace from `text`.
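A quick illustration (instantiating `Gruel` directly just for the demo):

>>> Gruel().clean_string(" \n\tsome text\r ")
'some text'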
def postscrape_chores(self):
    """Chores to do after scraping."""
    loggi.close(self.logger)
Chores to do after scraping.
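When overriding this in a subclass, the base behavior can be kept like so (`ArchivingScraper` and its `self.db` attribute are hypothetical, not part of `Gruel`):

class ArchivingScraper(Gruel):
    ...

    def postscrape_chores(self):
        self.db.close()              # hypothetical scraper-specific cleanup
        super().postscrape_chores()  # keeps the base behavior: closes this scraper's log file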
def get_parsable_items(self) -> list[ParsableItem]:
    """Get relevant webpages and extract raw data that needs to be parsed.

    e.g. first 10 results for an endpoint that returns json content
    >>> return self.get_page(some_url).json()[:10]"""
    raise NotImplementedError
Get relevant webpages and extract raw data that needs to be parsed.
e.g. first 10 results for an endpoint that returns json content
>>> return self.get_page(some_url).json()[:10]
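An HTML-based override is another common shape; a sketch with a placeholder URL and CSS selector (`ListingScraper` is hypothetical):

from gruel.gruel import Gruel, ParsableItem


class ListingScraper(Gruel):
    def get_parsable_items(self) -> list[ParsableItem]:
        # Hypothetical listing page; each matched Tag becomes one parsable item.
        soup = self.get_soup("https://example.com/listings")
        return soup.select("div.listing")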
def parse_item(self, item: ParsableItem) -> Any:
    """Parse `item` and return parsed data.

    e.g.
    >>> try:
    >>> parsed = {}
    >>> parsed["thing1"] = item["element"].split()[0]
    >>> self.success_count += 1
    >>> return parsed
    >>> except Exception:
    >>> self.logger.exception("message")
    >>> self.fail_count += 1
    >>> return None"""
    raise NotImplementedError
Parse `item` and return parsed data.
e.g.
>>> try:
>>> parsed = {}
>>> parsed["thing1"] = item["element"].split()[0]
>>> self.success_count += 1
>>> return parsed
>>> except Exception:
>>> self.logger.exception("message")
>>> self.fail_count += 1
>>> return None
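Continuing the hypothetical `ListingScraper` sketch from `get_parsable_items` above, a matching method could be added to it like this (the selectors are placeholders; `Tag` comes from `bs4`). Returning `None`, or any other falsy value, makes `scrape()` skip `store_item` for that item.

    def parse_item(self, item: Tag) -> dict | None:
        try:
            parsed = {
                "title": self.clean_string(item.select_one("h2").text),
                "url": item.select_one("a")["href"],
            }
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse listing.")
            self.fail_count += 1
            return None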
def scrape(self, parse_items_prog_bar_display: bool = False):
    """Run the scraper:
    1. prescrape chores
    2. get parsable items
    3. parse items
    4. store items
    5. postscrape chores"""
    try:
        self.timer.start()
        self.logger.info("Scrape started.")
        self.prescrape_chores()
        try:
            parsable_items = self.get_parsable_items()
            self.logger.info(
                f"{self.name}:get_parsable_items() returned {(len(parsable_items))} items"
            )
        except Exception:
            self.logger.exception(f"Error in {self.name}:get_parsable_items().")
        else:
            if parse_items_prog_bar_display:
                self._parse_items_prog_bar(parsable_items)
            else:
                self._parse_items_no_prog_bar(parsable_items)
            self.logger.info(
                f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
            )
    except Exception:
        self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
    self.postscrape_chores()
Run the scraper:
- prescrape chores
- get parsable items
- parse items
- store items
- postscrape chores
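Putting it together with the hypothetical `ExampleScraper` from earlier:

scraper = ExampleScraper()
scraper.scrape(parse_items_prog_bar_display=True)  # shows a progress bar while items are parsed
print(scraper.success_count, scraper.fail_count)
# The run's log ends up in ./gruel_logs/ unless a log_dir was passed to __init__.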