gruel.gruel

import inspect
import time
from typing import Any

import loggi
import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier, Pathish
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class.

    Classes subclassing `Gruel` need to implement the following methods:

    * `get_parsable_items(self) -> list[Any]`
    * `parse_item(self, item: Any) -> Any`
    * `store_item(self, item: Any)`

    Calling the `scrape()` method will execute:
    1. `self.prescrape_chores()` (does nothing unless overridden)
    2. `self.get_parsable_items()`
    3. `self.parse_item()` for each item returned by `self.get_parsable_items()`
    4. `self.store_item()` for each successfully parsed item
    5. `self.postscrape_chores()` (only closes this instance's log file unless overridden)

    When overriding `self.postscrape_chores`, it's recommended to either
    call `super().postscrape_chores()` or close the logger with `loggi.close(self.logger)`.
    Otherwise, running a large number of scrapers can exhaust the available file handles."""

    def __init__(self, name: str | None = None, log_dir: Pathish | None = None):
        """
        :params:
        * `name`: The name of this scraper. If `None`, the name will be the stem of the file this class/subclass was defined in,
        e.g. a `Gruel` subclass located in a file called `myscraper.py` will have the name `"myscraper"`.
        * `log_dir`: The directory this scraper's logs should be saved to.
        If `None`, the logs will be written to a folder called `"gruel_logs"` within the current working directory.
        """
        self._name = name
        self._init_logger(log_dir)
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to `__init__` or, if one wasn't given, the stem of the file this instance's class was defined in."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self, log_dir: Pathish | None):
        log_dir = Pathier.cwd() / "gruel_logs" if not log_dir else Pathier(log_dir)
        self.logger = loggi.getLogger(self.name, log_dir)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )
        except Exception:
            # Wait a moment, then retry once before letting the exception propagate.
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        loggi.close(self.logger)

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                    bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()
class Gruel:

Scraper base class.

Classes subclassing Gruel need to implement the following methods:

  • get_parsable_items(self) -> list[Any]
  • parse_item(self, item: Any) -> Any
  • store_item(self, item: Any)

Calling the scrape() method will execute:

  1. self.prescrape_chores() (does nothing unless overridden)
  2. self.get_parsable_items()
  3. self.parse_item() for each item returned by self.get_parsable_items()
  4. self.store_item() for each successfully parsed item
  5. self.postscrape_chores() (only closes this instance's log file unless overridden)

When overriding self.postscrape_chores, it's recommended to either call super().postscrape_chores() or close the logger with loggi.close(self.logger). Otherwise, running a large number of scrapers can exhaust the available file handles.
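
To make the flow concrete, here is a minimal sketch of a subclass. ItemScraper, its items attribute, and the JSON endpoint https://example.com/api/items (assumed to return a list of objects with a "name" field) are illustrative, not part of the library:

from gruel.gruel import Gruel


class ItemScraper(Gruel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.items: list[dict] = []  # parsed items collected by store_item()

    def get_parsable_items(self) -> list[dict]:
        # Raw records that still need parsing.
        return self.get_page("https://example.com/api/items").json()

    def parse_item(self, item: dict) -> dict | None:
        try:
            parsed = {"name": self.clean_string(item["name"])}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: dict):
        self.items.append(item)


ItemScraper().scrape()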

Gruel(name: str | None = None, log_dir: pathier.pathier.Pathier | pathlib.Path | str | None = None)

:params:

  • name: The name of this scraper. If None, the name will be the stem of the file this class/subclass was defined in, e.g. a Gruel subclass located in a file called myscraper.py will have the name "myscraper".
  • log_dir: The directory this scraper's logs should be saved to. If None, the logs will be written to a folder called "gruel_logs" within the current working directory.
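
For instance, with the ItemScraper sketch from the class-level example above (the name and directory are illustrative):

scraper = ItemScraper(name="items", log_dir="scraper_logs")

# With no arguments, the name falls back to the defining file's stem
# and logs are written to ./gruel_logs/.
scraper = ItemScraper()
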
name: str

Returns the name given to __init__ or, if one wasn't given, the stem of the file this instance's class was defined in.

def get_page(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> requests.models.Response:

Request url and return the requests.Response object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the headers param.
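
For instance, a sketch of sending a fixed user agent instead of a randomized one, with scraper an instance as above (the URL and agent string are illustrative):

response = scraper.get_page(
    "https://example.com",
    headers={"User-Agent": "my-scraper/1.0"},  # replaces the randomized agent
)
print(response.status_code)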

def as_soup(self, response: requests.models.Response) -> bs4.BeautifulSoup:

Returns the text content of response as a BeautifulSoup object.

def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:

Request url with headers and return BeautifulSoup object.
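
For instance (the URL and tag name are illustrative):

soup = scraper.get_soup("https://example.com/articles")
headlines = [h2.text for h2 in soup.find_all("h2")]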

def clean_string(self, text: str) -> str:

Strip \n\r\t and whitespace from text.
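
For instance:

>>> scraper.clean_string("\n\t  Some headline  \r\n")
'Some headline'

Note that only leading and trailing characters are removed; interior whitespace is left alone.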

def prescrape_chores(self):

Chores to do before scraping.

def postscrape_chores(self):

Chores to do after scraping.
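
When overriding, a sketch that keeps the base class's log cleanup (MyScraper and its extra cleanup are hypothetical):

class MyScraper(Gruel):
    def postscrape_chores(self):
        # Subclass-specific cleanup (close database connections, flush buffers, etc.).
        ...
        super().postscrape_chores()  # still closes this instance's log file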

def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:

Get relevant webpages and extract raw data that needs to be parsed.

e.g. first 10 results for an endpoint that returns json content

>>> return self.get_page(some_url).json()[:10]
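
As a sketch, an override on a Gruel subclass that collects items from an HTML listing page rather than a JSON endpoint (the URL and tag name are illustrative; Tag comes from bs4, as in the module imports above):

def get_parsable_items(self) -> list[Tag]:
    # One Tag per article; each one is later handed to parse_item().
    soup = self.get_soup("https://example.com/articles")
    return soup.find_all("article")
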
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:

Parse item and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
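
As a sketch, an override matching the HTML flavor above, where each item is a bs4 Tag on a Gruel subclass (the selectors are illustrative):

def parse_item(self, item: Tag) -> dict | None:
    try:
        parsed = {
            "title": self.clean_string(item.find("h2").text),
            "url": item.find("a")["href"],
        }
        self.success_count += 1
        return parsed
    except Exception:
        self.logger.exception("Failed to parse item.")
        self.fail_count += 1
        return None
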
def store_item(self, item: Any):

Store item.
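
As a sketch, an override that appends each parsed item to a JSON-lines file (the output path is illustrative; json is the stdlib module and would normally sit with the other imports at the top of the file):

import json

def store_item(self, item: dict):
    with open("items.jsonl", "a", encoding="utf-8") as file:
        file.write(json.dumps(item) + "\n")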

def scrape(self, parse_items_prog_bar_display: bool = False):

Run the scraper:

  1. prescrape chores
  2. get parsable items
  3. parse items
  4. store items
  5. postscrape chores
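
For instance, with a subclass like the ItemScraper sketch from the class-level example above:

scraper = ItemScraper()
scraper.scrape(parse_items_prog_bar_display=True)  # show a progress bar while parsing items
print(f"{scraper.success_count} successes, {scraper.fail_count} failures")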