gruel.grueler

import inspect
import time
from typing import Any

import loggi
import requests
import whosyouragent
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores`, it's recommended to either
    call `super().postscrape_chores()` or to call `loggi.close(self.logger)` directly;
    otherwise, running a large number of scrapers can exhaust the open file handle limit."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in,
        e.g. a `Gruel` subclass defined in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0
        self.failed_to_get_parsable_items = False
        self.unexpected_failure_occured = False
        self.parsable_items = []
        self.parsed_items = []

    @property
    def name(self) -> str:
        """Returns the name given to `__init__` or, if one wasn't given, the stem of the file this instance was defined in."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    @property
    def had_failures(self) -> bool:
        """`True` if getting parsable items failed, any item failed to parse, or an unexpected failure occurred."""
        return (
            (self.fail_count > 0)
            or self.failed_to_get_parsable_items
            or self.unexpected_failure_occured
        )

    def _init_logger(self, log_dir: Pathish | None):
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    @staticmethod
    def request(
        url: str,
        method: str = "get",
        headers: dict[str, str] = {},
        params: dict | None = None,
        data: dict | None = None,
        timeout: int | None = None,
        retry_on_fail: bool = True,
    ) -> requests.Response:
        """Send a request to `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param.

        If `retry_on_fail` is `True`, the request will be retried once after 1 second if the original request raises an exception.
        Otherwise, the exception will be raised."""
        args = [method, url]
        headers = whosyouragent.get_header() | headers
        kwargs = {
            "headers": headers,
            "timeout": timeout,
            "params": params,
            "data": data,
        }
        try:
            response = requests.request(*args, **kwargs)
            return response
        except Exception as e:
            if retry_on_fail:
                time.sleep(1)
                return requests.request(*args, **kwargs)
            else:
                raise e

    @staticmethod
    def as_soup(response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return the response as a `BeautifulSoup` object."""
        return self.as_soup(self.request(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip leading and trailing spaces and `\\n\\r\\t` characters from `text`."""
        return text.strip(" \n\t\r")

    # |==============================================================================|
    # Overridables
    # |==============================================================================|
    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. the first 10 results from an endpoint that returns json content
        >>> return self.request(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self):
        for item in self.parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)
            # Append to `self.parsed_items` even if `None`
            # so `parsable_items` and `parsed_items` are equal length
            self.parsed_items.append(parsed_item)

    def _parse_items_prog_bar(self):
        with ProgBar(len(self.parsable_items)) as bar:
            for item in self.parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                    bar.display(f"{bar.runtime}")
                # Append to `self.parsed_items` even if `None`
                # so `parsable_items` and `parsed_items` are equal length
                self.parsed_items.append(parsed_item)

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse and store items
        4. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                self.parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(self.parsable_items)} items"
                )
            except Exception:
                self.failed_to_get_parsable_items = True
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar()
                else:
                    self._parse_items_no_prog_bar()
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.unexpected_failure_occured = True
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
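
As a quick illustration of the request helpers above, here is a minimal sketch of calling them directly. The URL and header values are hypothetical, and `Gruel` is assumed to be importable from the top-level `gruel` package (otherwise import it from `gruel.grueler`):

from gruel import Gruel

# A randomized user agent from `whosyouragent` is merged with the supplied
# headers, and the request is retried once after 1 second on failure
# because `retry_on_fail` defaults to `True`.
response = Gruel.request("https://example.com", headers={"Accept": "text/html"})
print(response.status_code)

# Convert the response to a `BeautifulSoup` object for parsing.
soup = Gruel.as_soup(response)
print(soup.title)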
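
And a minimal subclass sketch under the same import assumption, implementing the three required methods and the recommended `postscrape_chores` override; the endpoint URL and JSON field name are hypothetical stand-ins:

from typing import Any

from gruel import Gruel


class MyScraper(Gruel):
    def get_parsable_items(self) -> list[dict]:
        # Hypothetical JSON endpoint; parse the first 10 records.
        return self.request("https://example.com/api/items").json()[:10]

    def parse_item(self, item: dict) -> Any:
        try:
            parsed = {"title": item["title"].strip()}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        print(item)  # stand-in for writing to a file or database

    def postscrape_chores(self):
        # Do any custom teardown here, then let the base class close the log file.
        super().postscrape_chores()


if __name__ == "__main__":
    scraper = MyScraper()  # `name` defaults to this file's stem, "myscraper" if saved as myscraper.py
    scraper.scrape()
    print(scraper.had_failures)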