gruel.template
from typing import Any

from gruel import Gruel, ParsableItem


class SubGruel(Gruel):
    """Scraper template: every method below is a stub to be overridden.

    Each stub raises `NotImplementedError` until it is given a real body.
    """

    def get_parsable_items(self) -> list[ParsableItem]:
        """Fetch the relevant webpages and return the raw items that need parsing.

        Example (first 10 results from an endpoint that returns json content)::

            return self.get_page(some_url).json()[:10]
        """
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Turn one raw `item` into parsed data and return it.

        Example::

            try:
                parsed = {}
                parsed["thing1"] = item["element"].split()[0]
                self.successes += 1
                return parsed
            except Exception:
                self.logger.exception("message")
                self.failures += 1
                return None
        """
        raise NotImplementedError

    def store_item(self, item: Any):
        """Persist a parsed `item`."""
        raise NotImplementedError
class SubGruel(Gruel):
    """Scraper template: every method below is a stub to be overridden.

    Each stub raises `NotImplementedError` until it is given a real body.
    """

    def get_parsable_items(self) -> list[ParsableItem]:
        """Fetch the relevant webpages and return the raw items that need parsing.

        Example (first 10 results from an endpoint that returns json content)::

            return self.get_page(some_url).json()[:10]
        """
        raise NotImplementedError

    def parse_item(self, item: ParsableItem) -> Any:
        """Turn one raw `item` into parsed data and return it.

        Example::

            try:
                parsed = {}
                parsed["thing1"] = item["element"].split()[0]
                self.successes += 1
                return parsed
            except Exception:
                self.logger.exception("message")
                self.failures += 1
                return None
        """
        raise NotImplementedError

    def store_item(self, item: Any):
        """Persist a parsed `item`."""
        raise NotImplementedError
Scraper base class.
def get_parsable_items(self) -> list[dict | str | bs4.element.Tag]:
def get_parsable_items(self) -> list[ParsableItem]:
    """Fetch the relevant webpages and return the raw items that need parsing.

    Template stub: raises `NotImplementedError` until overridden.

    Example (first 10 results from an endpoint that returns json content)::

        return self.get_page(some_url).json()[:10]
    """
    raise NotImplementedError
Get relevant webpages and extract raw data that needs to be parsed.
e.g. first 10 results for an endpoint that returns json content
>>> return self.get_page(some_url).json()[:10]
def parse_item(self, item: dict | str | bs4.element.Tag) -> Any:
def parse_item(self, item: ParsableItem) -> Any:
    """Turn one raw `item` into parsed data and return it.

    Template stub: raises `NotImplementedError` until overridden.

    Example::

        try:
            parsed = {}
            parsed["thing1"] = item["element"].split()[0]
            self.successes += 1
            return parsed
        except Exception:
            self.logger.exception("message")
            self.failures += 1
            return None
    """
    raise NotImplementedError
Parse `item` and return parsed data.
e.g.
>>> try:
>>>     parsed = {}
>>>     parsed["thing1"] = item["element"].split()[0]
>>>     self.successes += 1
>>>     return parsed
>>> except Exception:
>>>     self.logger.exception("message")
>>>     self.failures += 1
>>>     return None