gruel.gruel

import inspect
import time
from typing import Any

import loggi
import requests
from bs4 import BeautifulSoup, Tag
from noiftimer import Timer
from pathier import Pathier
from printbuddies import ProgBar
from whosyouragent import get_agent

ParsableItem = dict | str | Tag


class Gruel:
    """Scraper base class."""

    def __init__(self, name: str | None = None):
        self._name = name
        self._init_logger()
        self.timer = Timer()
        self.success_count = 0
        self.fail_count = 0

    @property
    def name(self) -> str:
        """Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given."""
        return self._name or Pathier(inspect.getsourcefile(type(self))).stem  # type: ignore

    def _init_logger(self):
        log_dir = Pathier.cwd() / "gruel_logs"
        # Create the log directory if it doesn't already exist and write logs there.
        log_dir.mkdir(parents=True, exist_ok=True)
        self.logger = loggi.getLogger(self.name, log_dir)

    def get_page(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> requests.Response:
        """Request `url` and return the `requests.Response` object.

        By default, the only header sent is a randomized user agent string.

        This can be overridden by supplying a user agent in the `headers` param."""
        try:
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )
        except Exception:
            # Pause briefly, then retry once before letting any exception propagate.
            time.sleep(1)
            return requests.request(
                method, url, headers={"User-Agent": get_agent()} | headers
            )

    def as_soup(self, response: requests.Response) -> BeautifulSoup:
        """Returns the text content of `response` as a `BeautifulSoup` object."""
        return BeautifulSoup(response.text, "html.parser")

    def get_soup(
        self, url: str, method: str = "get", headers: dict[str, str] = {}
    ) -> BeautifulSoup:
        """Request `url` with `headers` and return `BeautifulSoup` object."""
        return self.as_soup(self.get_page(url, method, headers))

    def clean_string(self, text: str) -> str:
        """Strip `\\n\\r\\t` and whitespace from `text`."""
        return text.strip(" \n\t\r")

    def prescrape_chores(self):
        """Chores to do before scraping."""
        ...

    def postscrape_chores(self):
        """Chores to do after scraping."""
        ...

    def get_parsable_items(self) -> list[ParsableItem]:
        """Get relevant webpages and extract raw data that needs to be parsed.

        e.g. first 10 results for an endpoint that returns json content
        >>> return self.get_page(some_url).json()[:10]"""
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Parse `item` and return parsed data.

        e.g.
        >>> try:
        >>>     parsed = {}
        >>>     parsed["thing1"] = item["element"].split()[0]
        >>>     self.success_count += 1
        >>>     return parsed
        >>> except Exception:
        >>>     self.logger.exception("message")
        >>>     self.fail_count += 1
        >>>     return None"""
        raise NotImplementedError

    def store_item(self, item: Any):
        """Store `item`."""
        raise NotImplementedError

    def _parse_items_no_prog_bar(self, parsable_items: list[ParsableItem]):
        for item in parsable_items:
            parsed_item = self.parse_item(item)
            if parsed_item:
                self.store_item(parsed_item)

    def _parse_items_prog_bar(self, parsable_items: list[ParsableItem]):
        with ProgBar(len(parsable_items)) as bar:
            for item in parsable_items:
                parsed_item = self.parse_item(item)
                if parsed_item:
                    self.store_item(parsed_item)
                    bar.display(f"{bar.runtime}")

    def scrape(self, parse_items_prog_bar_display: bool = False):
        """Run the scraper:
        1. prescrape chores
        2. get parsable items
        3. parse items
        4. store items
        5. postscrape chores"""
        try:
            self.timer.start()
            self.logger.info("Scrape started.")
            self.prescrape_chores()
            try:
                parsable_items = self.get_parsable_items()
                self.logger.info(
                    f"{self.name}:get_parsable_items() returned {len(parsable_items)} items"
                )
            except Exception:
                self.logger.exception(f"Error in {self.name}:get_parsable_items().")
            else:
                if parse_items_prog_bar_display:
                    self._parse_items_prog_bar(parsable_items)
                else:
                    self._parse_items_no_prog_bar(parsable_items)
                self.logger.info(
                    f"Scrape completed in {self.timer.elapsed_str} with {self.success_count} successes and {self.fail_count} failures."
                )
        except Exception:
            self.logger.exception(f"Unexpected failure in {self.name}:scrape()")
        self.postscrape_chores()

class Gruel:

Scraper base class.

Gruel(name: str | None = None)

name: str

Returns the name given to __init__ or the stem of the file this instance was defined in if one wasn't given.
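
For instance, assuming the package re-exports Gruel and the subclass lives in a hypothetical my_scraper.py:

from gruel import Gruel

class MyScraper(Gruel):
    ...

print(MyScraper().name)    # "my_scraper" -- falls back to the defining file's stem
print(Gruel("jobs").name)  # "jobs" -- an explicit name wins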

def get_page(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> requests.models.Response:

Request url and return the requests.Response object.

By default, the only header sent is a randomized user agent string.

This can be overridden by supplying a user agent in the headers param.
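
A usage sketch (the URL and agent string are placeholders). Because the default header dict is merged with `headers` via `|`, any key supplied by the caller takes precedence:

scraper = Gruel("demo")

# Sends only a randomized User-Agent.
response = scraper.get_page("https://example.com")

# The caller's User-Agent replaces the randomized one.
response = scraper.get_page("https://example.com", headers={"User-Agent": "my-bot/1.0"})
print(response.status_code)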

def as_soup(self, response: requests.models.Response) -> bs4.BeautifulSoup:

Returns the text content of response as a BeautifulSoup object.

def get_soup(self, url: str, method: str = 'get', headers: dict[str, str] = {}) -> bs4.BeautifulSoup:

Request url with headers and return BeautifulSoup object.
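
Continuing the sketch above with a placeholder URL; `get_soup` is just `get_page` piped through `as_soup`:

soup = scraper.get_soup("https://example.com")
for link in soup.find_all("a"):
    print(link.get("href"))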

def clean_string(self, text: str) -> str:

Strip \n\r\t and whitespace from text.
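
Only leading and trailing characters are removed; interior whitespace is preserved (continuing with the same scraper instance):

>>> scraper.clean_string("\n\t  Job Title  \r\n")
'Job Title'
>>> scraper.clean_string("two  words\n")
'two  words'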

def prescrape_chores(self):

Chores to do before scraping.

def postscrape_chores(self):

Chores to do after scraping.
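
Both chores are no-ops by default; subclasses can override them for per-run setup and teardown. A sketch (the output file is an illustrative choice, not part of gruel):

class FileBackedScraper(Gruel):
    def prescrape_chores(self):
        # Open an output file for this run.
        self.outfile = open("results.jsonl", "w", encoding="utf-8")

    def postscrape_chores(self):
        self.outfile.close()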

def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:

Get relevant webpages and extract raw data that needs to be parsed.

e.g. first 10 results for an endpoint that returns json content

>>> return self.get_page(some_url).json()[:10]

def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:

Parse item and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.success_count += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.fail_count += 1
>>>     return None

def store_item(self, item: Any):

Store item.

def scrape(self, parse_items_prog_bar_display: bool = False):

Run the scraper:

  1. prescrape chores
  2. get parsable items
  3. parse items
  4. store items
  5. postscrape chores
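
Putting it together, a minimal end-to-end subclass might look like the sketch below. The quotes endpoint, field names, and in-memory storage are illustrative assumptions, not part of gruel:

from typing import Any

from gruel import Gruel


class QuoteScraper(Gruel):
    """Hypothetical scraper for a JSON quotes endpoint."""

    def prescrape_chores(self):
        # Fresh container for this run.
        self.quotes: list[dict] = []

    def get_parsable_items(self) -> list[dict]:
        # Assumed endpoint returning a JSON list of {"text": ..., "author": ...}.
        return self.get_page("https://example.com/api/quotes").json()

    def parse_item(self, item: dict) -> Any:
        try:
            parsed = {
                "text": self.clean_string(item["text"]),
                "author": self.clean_string(item["author"]),
            }
            self.success_count += 1
            return parsed
        except Exception:
            self.logger.exception("Failed to parse quote.")
            self.fail_count += 1
            return None

    def store_item(self, item: dict):
        self.quotes.append(item)


if __name__ == "__main__":
    scraper = QuoteScraper()
    scraper.scrape(parse_items_prog_bar_display=True)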