gruel.brewer

import argparse
import importlib.util
import inspect
from typing import Any

import loggi
import quickpool
from pathier import Pathier, Pathish
from younotyou import younotyou

from gruel import Gruel


class Brewer:
    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] = [],
        scan_path: Pathish = Pathier.cwd(),
        file_include_patterns: list[str] = ["*.py"],
        recursive: bool = True,
        log_dir: Pathish | None = None,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned.

        `scan_path`: The path to scan for scraper classes.

        `file_include_patterns`: Files that match these patterns will be scanned.

        `recursive`: Whether the scan should be recursive or not.

        `log_dir`: The directory this instance's log should be saved to.
        If `None`, it will be saved to the current working directory.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger(log_dir)
        self.subgruel_classes = subgruel_classes
        self.file_exclude_patterns = file_exclude_patterns
        self.file_include_patterns = file_include_patterns
        self.scan_path = Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self, log_dir: Pathish | None = None):
        # When Brewer is subclassed, use that file's stem instead of `brewer`.
        log_dir = Pathier(log_dir) if log_dir else Pathier.cwd()
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = loggi.getLogger(log_name, log_dir)

    def load_scrapers(self) -> list[type[Gruel]]:
        """Find, import, and return scraper classes that inherit from `Gruel`.

        Scans `self.scan_path` for files matching `self.file_include_patterns`
        (minus those matching `self.file_exclude_patterns`) and returns any class
        whose name is in `self.subgruel_classes` and that has `Gruel` somewhere
        in its inheritance hierarchy.

        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers")
        >>> scrapers = brewer.load_scrapers()
        """
        globber = self.scan_path.rglob if self.recursive else self.scan_path.glob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        self.modules = {}
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                # Import the file directly from its path via a module spec.
                # (`SourceFileLoader.load_module()` is deprecated and removed in Python 3.12.)
                spec = importlib.util.spec_from_file_location(module_name, file)
                assert spec and spec.loader
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
            except Exception:
                self.logger.exception(
                    f"Failed to load module '{module_name}' from '{file}'."
                )
            else:
                self._module_names.append(module_name)
                self.modules[module_name] = module
        gruels = [
            getattr(module, class_)
            for module in self.modules.values()
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers: "]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload modules imported by `load_scrapers()`."""
        # Drop all references so the imported modules can be garbage collected.
        self.modules = {}
        self._module_names = []

    def get_bases(self, obj: Any) -> list[Any]:
        """Returns a recursive list of all the classes `obj` inherits from."""
        parents = []
        for base in obj.__bases__:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, obj: Any) -> bool:
        """Returns whether `obj` inherits from `Gruel` somewhere in its ancestry."""
        # Note: unlike `issubclass()`, this is False for `Gruel` itself.
        return inspect.isclass(obj) and Gruel in self.get_bases(obj)

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        self.pop_modules()

    def scrape(self, scrapers: list[type[Gruel]]):
        """Instantiate each scraper class in `scrapers` and run its `scrape()` method.

        Execution is multithreaded."""

        def execute(scraper: type[Gruel]):
            scraper().scrape()

        pool = quickpool.ThreadPool(
            [execute] * len(scrapers), [(scraper,) for scraper in scrapers]
        )
        pool.execute()

    def logprint(self, message: str):
        """Log and print `message`."""
        self.logger.info(message)
        print(message)

    def brew(self):
        """Execute the pipeline:

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""

        try:
            self.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logprint("Loading scrapers")
            scrapers = self.load_scrapers()
            print(f"Loaded {len(scrapers)} scrapers")
            # 3--------------------------------------------
            self.logprint("Starting scrape")
            self.scrape(scrapers)
            self.logprint("Scrape complete")
            # 4--------------------------------------------
            self.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logprint("Brew complete")
        except Exception as e:
            print(e)
            self.logger.exception("Exception occurred during brew():")


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    parser.add_argument(
        "-l",
        "--log_dir",
        type=str,
        default=None,
        help=""" The directory to save the brew log to. """,
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    if not args:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes,
        args.excludes,
        args.path,
        args.includes,
        args.recursive,
        args.log_dir,
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
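
Below is a minimal usage sketch. It assumes `Brewer` is importable from `gruel.brewer` as shown above; the `scrapers` directory, the `VenueScraper` class name, and the chore bodies are hypothetical:

from gruel.brewer import Brewer


class MyBrewer(Brewer):
    def prescrape_chores(self):
        # Hypothetical setup step, run once before any scraper.
        print("Preparing workspace")

    def postscrape_chores(self):
        # Hypothetical cleanup step; call super() to keep the default module unloading.
        print("Cleaning up")
        super().postscrape_chores()


if __name__ == "__main__":
    # Scan `scrapers/` for a `VenueScraper` class, skipping template files.
    MyBrewer(["VenueScraper"], ["*template*"], "scrapers").brew()

The same run can also be launched through the argparse interface above, e.g. `python -m gruel.brewer VenueScraper -p scrapers -r`.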