gruel.gruel

import inspect
import logging
import time
from typing import Any

import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class."""

    def __init__(self, name: str | None = None):
        self._name = name
        self._init_logger()
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name passed to `__init__` or, if no name was given, the stem of the file this instance's class was defined in."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self):
        log_dir = Pathier.cwd() / "logs"
        log_dir.mkdir(parents=True, exist_ok=True)
        self.logger = logging.getLogger(self.name)
        if not self.logger.hasHandlers():
            handler = logging.FileHandler(
                (log_dir / self.name).with_suffix(".log"), encoding="utf-8"
            )
            handler.setFormatter(
                logging.Formatter(
                    "{levelname}|-|{asctime}|-|{message}",
                    style="{",
                    datefmt="%m/%d/%Y %I:%M:%S %p",
                )
            )
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] | None = None
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | (headers or {})
            )
        except Exception:
            # Retry once with a fresh random user agent after a short delay.
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | (headers or {})
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] | None = None
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return a `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip leading and trailing spaces, newlines, tabs, and carriage returns from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        ...

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. the first 10 results from an endpoint that returns JSON content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            # Only truthy parsed items are stored.
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                    bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        # Postscrape chores run even if the scrape failed.
        self.postscrape_chores()
class Gruel:

Scraper base class.

Gruel(name: str | None = None)

name: str

Returns the name passed to __init__ or, if no name was given, the stem of the file this instance's class was defined in.
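
For example, if a hypothetical subclass MyScraper is defined in a file named myscraper.py:

>>> MyScraper("jobs").name
'jobs'
>>> MyScraper().name
'myscraper'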

def get_page(self, url: str, method: str = 'get', headers: dict[str, str] | None = None) -> requests.models.Response:

Request url and return the requests.Response object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the headers param.
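
e.g. a quick sketch, first with the default randomized agent and then with a custom one (the URL and agent string are placeholders):

>>> scraper = Gruel("example")
>>> response = scraper.get_page("https://example.com")
>>> response = scraper.get_page("https://example.com", headers={"User-Agent": "my-agent/1.0"})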

def as_soup(self, response: requests.models.Response) -> bs4.BeautifulSoup:

Returns the text content of response as a BeautifulSoup object.

def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] | None = None) -> bs4.BeautifulSoup:

Request url with headers and return a BeautifulSoup object.
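
e.g. (the URL and tag are placeholders):

>>> soup = Gruel("example").get_soup("https://example.com")
>>> heading = soup.find("h1")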

def clean_string(self, text: str) -> str:

Strip leading and trailing spaces, newlines, tabs, and carriage returns from text.
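
e.g.

>>> Gruel("example").clean_string("\n\t  some text  \r\n")
'some text'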

def prescrape_chores(self):

Chores to do before scraping.

def postscrape_chores(self):

Chores to do after scraping.
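
Both chore hooks are no-ops by default; subclasses can override them to set up and tear down resources. A minimal sketch (the output file is a hypothetical example):

class MyScraper(Gruel):
    def prescrape_chores(self):
        # Hypothetical: open an output file before the scrape starts.
        self.out_file = open("results.txt", "a", encoding="utf-8")

    def postscrape_chores(self):
        # Hypothetical: close it when the scrape finishes (runs even if the scrape failed).
        self.out_file.close()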

def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:

Get relevant webpages and extract raw data that needs to be parsed.

e.g. the first 10 results from an endpoint that returns JSON content

>>> return self.get_page(some_url).json()[:10]
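
A fuller sketch of an override that collects Tag objects from a listing page (assuming Gruel and ParsableItem are importable from the gruel package; the URL and CSS class are hypothetical):

from gruel import Gruel, ParsableItem


class MyScraper(Gruel):
    def get_parsable_items(self) -> list[ParsableItem]:
        # Each matching element becomes one argument to parse_item().
        soup = self.get_soup("https://example.com/listings")
        return soup.find_all("div", class_="listing")
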
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:

Parse item and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None
def store_item(self, item: Any):

Store item.
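
e.g. a hypothetical override that appends each parsed item to a JSON lines file (the filename is a placeholder):

import json
from typing import Any

from gruel import Gruel


class MyScraper(Gruel):
    def store_item(self, item: Any):
        # Called for each truthy value returned by parse_item().
        with open("items.jsonl", "a", encoding="utf-8") as f:
            f.write(json.dumps(item) + "\n")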

def scrape(self, parse_items_prog_bar_display: bool = False):

Run the scraper:

  1. prescrape chores
  2. get parsable items
  3. parse items
  4. store items
  5. postscrape chores
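
Putting it together, a minimal end-to-end subclass might look like the sketch below (assuming Gruel and ParsableItem are importable from the gruel package; the URL, selectors, and field names are hypothetical):

from typing import Any

from gruel import Gruel, ParsableItem


class MyScraper(Gruel):
    def get_parsable_items(self) -> list[ParsableItem]:
        # Collect the raw elements to be parsed.
        soup = self.get_soup("https://example.com/listings")
        return soup.find_all("div", class_="listing")

    def parse_item(self, item: ParsableItem) -> Any:
        try:
            parsed = {"title": self.clean_string(item.find("h2").text)}
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse item.")
            self.fail_count += 1
            return None

    def store_item(self, item: Any):
        # Just print for demonstration; a real scraper would persist the item.
        print(item)


if __name__ == "__main__":
    MyScraper().scrape(parse_items_prog_bar_display=True)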