gruel.gruel
```python
import inspect
import time
from typing import Any

import loggi
import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores()`, it's recommended to either
    call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`.
    Otherwise, running a large number of scrapers can cause file handle limit issues."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in,
        i.e. a `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self, log_dir: Pathish | None):
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )
        except Exception:
            # Wait a second, then retry the request once before giving up.
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from the ends of `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
```
```python
class Gruel:
```
Scraper base class.

Classes subclassing `Gruel` need to implement the following methods:

* `get_parsable_items(self) -> list[Any]`
* `parse_item(self, item: Any) -> Any`
* `store_item(self, item: Any)`

Calling the `scrape()` method will execute:

1. `self.prescrape_chores()` (does nothing unless overridden)
2. `self.get_parsable_items()`
3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
4. `self.store_item()` for each successfully parsed item
5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

When overriding `self.postscrape_chores()`, it's recommended to either call `super().postscrape_chores()` or make sure to call `loggi.close(self.logger)`. Otherwise, running a large number of scrapers can cause file handle limit issues.
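A minimal subclass sketch of that workflow. The URL, the `"listing"`/`"h2"` markup, the parsed field, and print-based storage are illustrative placeholders, not part of `gruel`:

```python
from typing import Any

# gruel.gruel is the module shown above; the package may also re-export these names at its top level.
from gruel.gruel import Gruel, ParsableItem


class MyScraper(Gruel):
    def get_parsable_items(self) -> list[ParsableItem]:
        # Placeholder URL and markup; point this at the page you actually want to scrape.
        soup = self.get_soup("https://example.com/listings")
        return soup.find_all("div", class_="listing")

    def parse_item(self, item: ParsableItem) -> Any:
        try:
            parsed = {"title": self.clean_string(item.find("h2").text)}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        # Store however you like; printing keeps the sketch self-contained.
        print(item)
```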
```python
def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
```
:params:

* `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in, i.e. a `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
* `log_dir`: The directory this scraper's logs should be saved to. If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
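A quick construction sketch, reusing the hypothetical `MyScraper` from above (the name and directory are arbitrary examples):

```python
scraper = MyScraper(name="jobs", log_dir="logs")  # logs written under ./logs, scraper named "jobs"
default = MyScraper()  # name falls back to the defining file's stem; logs go to ./gruel_logs
```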
`name` (property): Returns the name given to `__init__`, or the stem of the file this instance was defined in if one wasn't given.
```python
def get_page(self, url: str, method: str = "get", headers: dict[str, str] = {}) -> requests.Response:
```
Request `url` and return the `requests.Response` object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the `headers` param.
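A usage sketch with an explicit user agent; the URL and agent string are placeholders, and `scraper` is any `Gruel` instance:

```python
response = scraper.get_page(
    "https://example.com",
    headers={"User-Agent": "my-custom-agent/1.0"},  # replaces the randomized agent
)
print(response.status_code)
```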
```python
def as_soup(self, response: requests.Response) -> BeautifulSoup:
```
Returns the text content of `response` as a `BeautifulSoup` object.
```python
def get_soup(self, url: str, method: str = "get", headers: dict[str, str] = {}) -> BeautifulSoup:
```
Request `url` with `headers` and return a `BeautifulSoup` object.
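A short sketch (the URL and tag names are placeholders):

```python
soup = scraper.get_soup("https://example.com")
title = soup.find("title")  # standard BeautifulSoup navigation from here on
links = soup.find_all("a")
```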
```python
def clean_string(self, text: str) -> str:
```
Strip `\n`, `\r`, `\t`, and spaces from the ends of `text`.
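For example (illustrative input):

```python
>>> scraper.clean_string("\n\t  Hello World \r\n")
'Hello World'
```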
```python
def get_parsable_items(self) -> list[ParsableItem]:
```
Get relevant webpages and extract raw data that needs to be parsed.

e.g. the first 10 results for an endpoint that returns JSON content:

>>> return self.get_page(some_url).json()[:10]
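Spelled out as an override (the endpoint URL is a placeholder):

```python
def get_parsable_items(self) -> list[ParsableItem]:
    # Each raw dict returned here is later handed to parse_item().
    return self.get_page("https://example.com/api/results").json()[:10]
```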
```python
def parse_item(self, item: ParsableItem) -> Any:
```
Parse `item` and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
```python
def scrape(self, parse_items_prog_bar_display: bool = False):
```
Run the scraper:

1. prescrape chores
2. get parsable items
3. parse items
4. store items
5. postscrape chores
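An end-to-end usage sketch, again assuming the hypothetical `MyScraper` subclass from earlier:

```python
scraper = MyScraper()
scraper.scrape(parse_items_prog_bar_display=True)  # show a progress bar while items are parsed
print(f"{scraper.success_count} successes, {scraper.fail_count} failures in {scraper.timer.elapsed_str}")
```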