gruel.template

 1from typing import Any
 2
 3from gruel import Gruel, ParsableItem
 4
 5
 6class SubGruel(Gruel):
 7    def get_parsable_items(self) -> list[ParsableItem]:
 8        """Get relevant webpages and extract raw data that needs to be parsed.
 9
10        e.g. first 10 results for an endpoint that returns json content
11        >>> return self.get_page(some_url).json()[:10]"""
12        raise NotImplementedError
13
14    def parse_item(self, item: ParsableItem) -> Any:
15        """Parse `item` and return parsed data.
16
17        e.g.
18        >>> try:
19        >>>     parsed = {}
20        >>>     parsed["thing1"] = item["element"].split()[0]
21        >>>     self.successes += 1
22        >>>     return parsed
23        >>> except Exception:
24        >>>     self.logger.exception("message")
25        >>>     self.failures += 1
26        >>>     return None"""
27        raise NotImplementedError
28
29    def store_item(self, item: Any):
30        """Store `item`."""
31        raise NotImplementedError
class SubGruel(gruel.gruel.Gruel):
 7class SubGruel(Gruel):
 8    def get_parsable_items(self) -> list[ParsableItem]:
 9        """Get relevant webpages and extract raw data that needs to be parsed.
10
11        e.g. first 10 results for an endpoint that returns json content
12        >>> return self.get_page(some_url).json()[:10]"""
13        raise NotImplementedError
14
15    def parse_item(self, item: ParsableItem) -> Any:
16        """Parse `item` and return parsed data.
17
18        e.g.
19        >>> try:
20        >>>     parsed = {}
21        >>>     parsed["thing1"] = item["element"].split()[0]
22        >>>     self.successes += 1
23        >>>     return parsed
24        >>> except Exception:
25        >>>     self.logger.exception("message")
26        >>>     self.failures += 1
27        >>>     return None"""
28        raise NotImplementedError
29
30    def store_item(self, item: Any):
31        """Store `item`."""
32        raise NotImplementedError

Scraper base class.

Classes subclassing Gruel need to implement the following methods:

  • get_parsable_items(self) -> list[ParsableItem]
  • parse_item(self, item: ParsableItem) -> Any
  • store_item(self, item: Any)

Calling the scrape() method will execute:

  1. self.prescrape_chores() (does nothing unless overridden)
  2. self.get_parsable_items()
  3. self.parse_item() for each item returned by self.get_parsable_items()
  4. self.store_item() for each successfully parsed item
  5. self.postscrape_chores() (only closes this instance's log file unless overridden)

When overriding self.postscrape_chores(), either call super().postscrape_chores() or make sure to call self.log.close() yourself. Otherwise, each instance's log file stays open, and running a large number of scrapers can exhaust the file handle limit.

def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:
 8    def get_parsable_items(self) -> list[ParsableItem]:
 9        """Get relevant webpages and extract raw data that needs to be parsed.
10
11        e.g. first 10 results for an endpoint that returns json content
12        >>> return self.get_page(some_url).json()[:10]"""
13        raise NotImplementedError

Get relevant webpages and extract raw data that needs to be parsed.

e.g. first 10 results for an endpoint that returns json content

>>> return self.get_page(some_url).json()[:10]
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:
15    def parse_item(self, item: ParsableItem) -> Any:
16        """Parse `item` and return parsed data.
17
18        e.g.
19        >>> try:
20        >>>     parsed = {}
21        >>>     parsed["thing1"] = item["element"].split()[0]
22        >>>     self.successes += 1
23        >>>     return parsed
24        >>> except Exception:
25        >>>     self.logger.exception("message")
26        >>>     self.failures += 1
27        >>>     return None"""
28        raise NotImplementedError

Parse item and return parsed data.

e.g.

>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.successes += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.failures += 1
>>>     return None
def store_item(self, item: Any):
30    def store_item(self, item: Any):
31        """Store `item`."""
32        raise NotImplementedError

Store item.