gruel.brewer

  1import argparse
  2import importlib
  3import importlib.machinery
  4import importlib.util
  5import inspect
  6import sys
  7from typing import Any
  8
  9import loggi
 10import quickpool
 11from pathier import Pathier, Pathish
 12from younotyou import younotyou
 13
 14from gruel import Gruel
 15
 16
 17class Brewer:
 18    def __init__(
 19        self,
 20        subgruel_classes: list[str],
 21        file_exclude_patterns: list[str] = [],
 22        scan_path: Pathish = Pathier.cwd(),
 23        file_include_patterns: list[str] = ["*.py"],
 24        recursive: bool = True,
 25    ):
 26        """Run `Gruel` scrapers.
 27
 28        #### :params:
 29
 30        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 31        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 32
 33        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 34
 35        `scan_path`: The path to scan for scraper classes.
 36
 37        `file_include_patterns`: Files that match these patterns will be scanned.
 38
 39        `recursive`: Whether the scan should be recursive or not.
 40
 41        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
 42        >>> brewer.brew()"""
 43        self._init_logger()
 44        self.subgruel_classes = subgruel_classes
 45        self.file_exclude_patterns = file_exclude_patterns
 46        self.file_include_patterns = file_include_patterns
 47        self.scan_path = Pathier(scan_path)
 48        self.recursive = recursive
 49
 50    def _init_logger(self):
 51        # When Brewer is subclassed, use that file's stem instead of `brewer`
 52        source_file = inspect.getsourcefile(type(self))
 53        if source_file:
 54            log_name = Pathier(source_file).stem
 55        else:
 56            log_name = Pathier(__file__).stem
 57        self.logger = loggi.getLogger(log_name)
 58
 59    def load_scrapers(self) -> list[Gruel]:
 60        """Load scraper classes that inherit from `Gruel`.
 61
 62        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 63
 64        #### :params:
 65
 66        `directory`: The path to scan for scraper classes.
 67
 68        `class_names`: A list of class names for scrapers that should be loaded.
 69        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 70
 71        `include_patterns`: Files that match these patterns will be scanned.
 72
 73        `exclude_patterns`: Files that match these patterns will not be scanned.
 74
 75        `recursive`: Whether the search should be recursive or not.
 76
 77        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])"""
 78        globber = self.scan_path.glob
 79        if self.recursive:
 80            globber = self.scan_path.rglob
 81        files = [
 82            str(file)
 83            for pattern in self.file_include_patterns
 84            for file in globber(pattern)
 85        ]
 86        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 87        modules = []
 88        self._module_names = []
 89        for file in files:
 90            module_name = Pathier(file).stem
 91            try:
 92                module = importlib.machinery.SourceFileLoader(
 93                    module_name, file
 94                ).load_module()
 95            except Exception as e:
 96                self.logger.exception(
 97                    f"Failed to load module '{module_name}' from '{file}'."
 98                )
 99            else:
100                self._module_names.append(module_name)
101                modules.append(module)
102        gruels = [
103            getattr(module, class_)
104            for module in modules
105            for class_ in self.subgruel_classes
106            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
107        ]
108        self.logger.info(
109            "\n".join(
110                [f"Imported {len(gruels)} scrapers: "]
111                + [str(gruel) for gruel in gruels]
112            )
113        )
114        return gruels
115
116    def pop_modules(self):
117        """Unload modules."""
118        for module in self._module_names:
119            sys.modules.pop(module)
120        self._module_names = []
121
122    def get_bases(self, object: Any) -> list[Any]:
123        """Returns a recursive list of all the classes `object` inherits from."""
124        parents = []
125        bases = object.__bases__
126        if not bases:
127            return parents
128        for base in bases:
129            parents.append(base)
130            parents.extend(self.get_bases(base))
131        return parents
132
133    def is_subgruel(self, object: Any) -> bool:
134        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
135        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
136            return False
137        return True
138
139    def prescrape_chores(self):
140        """Override to add any tasks to be done before running the scrapers."""
141        ...
142
143    def postscrape_chores(self):
144        """Override to add any tasks to be done after running the scrapers."""
145        self.pop_modules()
146
147    def scrape(self, scrapers: list[Gruel]):
148        """Run the `scrape()` method for each scraper in `scrapers`.
149
150        Execution is multithreaded."""
151        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
152        pool.execute()
153
154    def logprint(self, message: str):
155        """Log and print `message`."""
156        self.logger.info(message)
157        print(message)
158
159    def brew(self):
160        """Execute pipeline.
161
162        1. self.prescrape_chores()
163        2. self.load_scrapers()
164        3. self.scrape()
165        4. self.postscrape_chores()"""
166
167        try:
168            self.logprint("Beginning brew")
169            # 1--------------------------------------------
170            self.logprint("Executing prescrape chores")
171            self.prescrape_chores()
172            # 2--------------------------------------------
173            self.logprint("Loading scrapers")
174            scrapers = self.load_scrapers()
175            print(f"Loaded {len(scrapers)} scrapers")
176            # 3--------------------------------------------
177            self.logprint("Starting scrape")
178            self.scrape(scrapers)
179            self.logprint("Scrape complete")
180            # 4--------------------------------------------
181            self.logprint("Executing postscrape chores")
182            self.postscrape_chores()
183            self.logprint("Brew complete")
184        except Exception as e:
185            print(e)
186            self.logger.exception("Exception occured during brew():")
187
188
def get_args() -> argparse.Namespace:
    """Build the CLI parser, parse `sys.argv`, and return the namespace
    (with `path` converted to a `Pathier` instance)."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    # The two glob-pattern list options share everything but their names,
    # defaults, and help text.
    pattern_options = [
        (
            "-e",
            "--excludes",
            [],
            """ A list of glob style file patterns to exclude from the scan. """,
        ),
        (
            "-i",
            "--includes",
            ["*.py"],
            """ A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
        ),
    ]
    for short_flag, long_flag, default, help_text in pattern_options:
        parser.add_argument(
            short_flag, long_flag, type=str, nargs="*", default=default, help=help_text
        )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )

    parsed = parser.parse_args()
    parsed.path = Pathier(parsed.path)
    return parsed
231
232
def main(args: argparse.Namespace | None = None):
    """Build a `Brewer` from `args` and run it.

    When `args` is None, the command line is parsed via `get_args()`."""
    if args is None:
        args = get_args()
    Brewer(
        args.subgruel_classes,
        args.excludes,
        args.path,
        args.includes,
        args.recursive,
    ).brew()
240
241
if __name__ == "__main__":
    # `main()` already falls back to `get_args()` when called without
    # arguments, so passing `get_args()` here was redundant.
    main()
class Brewer:
 18class Brewer:
 19    def __init__(
 20        self,
 21        subgruel_classes: list[str],
 22        file_exclude_patterns: list[str] = [],
 23        scan_path: Pathish = Pathier.cwd(),
 24        file_include_patterns: list[str] = ["*.py"],
 25        recursive: bool = True,
 26    ):
 27        """Run `Gruel` scrapers.
 28
 29        #### :params:
 30
 31        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 32        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 33
 34        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 35
 36        `scan_path`: The path to scan for scraper classes.
 37
 38        `file_include_patterns`: Files that match these patterns will be scanned.
 39
 40        `recursive`: Whether the scan should be recursive or not.
 41
 42        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
 43        >>> brewer.brew()"""
 44        self._init_logger()
 45        self.subgruel_classes = subgruel_classes
 46        self.file_exclude_patterns = file_exclude_patterns
 47        self.file_include_patterns = file_include_patterns
 48        self.scan_path = Pathier(scan_path)
 49        self.recursive = recursive
 50
 51    def _init_logger(self):
 52        # When Brewer is subclassed, use that file's stem instead of `brewer`
 53        source_file = inspect.getsourcefile(type(self))
 54        if source_file:
 55            log_name = Pathier(source_file).stem
 56        else:
 57            log_name = Pathier(__file__).stem
 58        self.logger = loggi.getLogger(log_name)
 59
 60    def load_scrapers(self) -> list[Gruel]:
 61        """Load scraper classes that inherit from `Gruel`.
 62
 63        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 64
 65        #### :params:
 66
 67        `directory`: The path to scan for scraper classes.
 68
 69        `class_names`: A list of class names for scrapers that should be loaded.
 70        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 71
 72        `include_patterns`: Files that match these patterns will be scanned.
 73
 74        `exclude_patterns`: Files that match these patterns will not be scanned.
 75
 76        `recursive`: Whether the search should be recursive or not.
 77
 78        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])"""
 79        globber = self.scan_path.glob
 80        if self.recursive:
 81            globber = self.scan_path.rglob
 82        files = [
 83            str(file)
 84            for pattern in self.file_include_patterns
 85            for file in globber(pattern)
 86        ]
 87        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 88        modules = []
 89        self._module_names = []
 90        for file in files:
 91            module_name = Pathier(file).stem
 92            try:
 93                module = importlib.machinery.SourceFileLoader(
 94                    module_name, file
 95                ).load_module()
 96            except Exception as e:
 97                self.logger.exception(
 98                    f"Failed to load module '{module_name}' from '{file}'."
 99                )
100            else:
101                self._module_names.append(module_name)
102                modules.append(module)
103        gruels = [
104            getattr(module, class_)
105            for module in modules
106            for class_ in self.subgruel_classes
107            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
108        ]
109        self.logger.info(
110            "\n".join(
111                [f"Imported {len(gruels)} scrapers: "]
112                + [str(gruel) for gruel in gruels]
113            )
114        )
115        return gruels
116
117    def pop_modules(self):
118        """Unload modules."""
119        for module in self._module_names:
120            sys.modules.pop(module)
121        self._module_names = []
122
123    def get_bases(self, object: Any) -> list[Any]:
124        """Returns a recursive list of all the classes `object` inherits from."""
125        parents = []
126        bases = object.__bases__
127        if not bases:
128            return parents
129        for base in bases:
130            parents.append(base)
131            parents.extend(self.get_bases(base))
132        return parents
133
134    def is_subgruel(self, object: Any) -> bool:
135        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
136        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
137            return False
138        return True
139
140    def prescrape_chores(self):
141        """Override to add any tasks to be done before running the scrapers."""
142        ...
143
144    def postscrape_chores(self):
145        """Override to add any tasks to be done after running the scrapers."""
146        self.pop_modules()
147
148    def scrape(self, scrapers: list[Gruel]):
149        """Run the `scrape()` method for each scraper in `scrapers`.
150
151        Execution is multithreaded."""
152        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
153        pool.execute()
154
155    def logprint(self, message: str):
156        """Log and print `message`."""
157        self.logger.info(message)
158        print(message)
159
160    def brew(self):
161        """Execute pipeline.
162
163        1. self.prescrape_chores()
164        2. self.load_scrapers()
165        3. self.scrape()
166        4. self.postscrape_chores()"""
167
168        try:
169            self.logprint("Beginning brew")
170            # 1--------------------------------------------
171            self.logprint("Executing prescrape chores")
172            self.prescrape_chores()
173            # 2--------------------------------------------
174            self.logprint("Loading scrapers")
175            scrapers = self.load_scrapers()
176            print(f"Loaded {len(scrapers)} scrapers")
177            # 3--------------------------------------------
178            self.logprint("Starting scrape")
179            self.scrape(scrapers)
180            self.logprint("Scrape complete")
181            # 4--------------------------------------------
182            self.logprint("Executing postscrape chores")
183            self.postscrape_chores()
184            self.logprint("Brew complete")
185        except Exception as e:
186            print(e)
187            self.logger.exception("Exception occured during brew():")
Brewer( subgruel_classes: list[str], file_exclude_patterns: list[str] = [], scan_path: pathier.pathier.Pathier | pathlib.Path | str = WindowsPath('E:/1vsCode/python/gruel'), file_include_patterns: list[str] = ['*.py'], recursive: bool = True)
19    def __init__(
20        self,
21        subgruel_classes: list[str],
22        file_exclude_patterns: list[str] = [],
23        scan_path: Pathish = Pathier.cwd(),
24        file_include_patterns: list[str] = ["*.py"],
25        recursive: bool = True,
26    ):
27        """Run `Gruel` scrapers.
28
29        #### :params:
30
31        `subgruel_classes`: A list of class names for scrapers that should be loaded.
32        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
33
34        `file_exclude_patterns`: Files that match these patterns will not be scanned.
35
36        `scan_path`: The path to scan for scraper classes.
37
38        `file_include_patterns`: Files that match these patterns will be scanned.
39
40        `recursive`: Whether the scan should be recursive or not.
41
42        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
43        >>> brewer.brew()"""
44        self._init_logger()
45        self.subgruel_classes = subgruel_classes
46        self.file_exclude_patterns = file_exclude_patterns
47        self.file_include_patterns = file_include_patterns
48        self.scan_path = Pathier(scan_path)
49        self.recursive = recursive

Run Gruel scrapers.

:params:

subgruel_classes: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy.

file_exclude_patterns: Files that match these patterns will not be scanned.

scan_path: The path to scan for scraper classes.

file_include_patterns: Files that match these patterns will be scanned.

recursive: Whether the scan should be recursive or not.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
def load_scrapers(self) -> list[gruel.gruel.Gruel]:
 60    def load_scrapers(self) -> list[Gruel]:
 61        """Load scraper classes that inherit from `Gruel`.
 62
 63        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 64
 65        #### :params:
 66
 67        `directory`: The path to scan for scraper classes.
 68
 69        `class_names`: A list of class names for scrapers that should be loaded.
 70        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 71
 72        `include_patterns`: Files that match these patterns will be scanned.
 73
 74        `exclude_patterns`: Files that match these patterns will not be scanned.
 75
 76        `recursive`: Whether the search should be recursive or not.
 77
 78        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])"""
 79        globber = self.scan_path.glob
 80        if self.recursive:
 81            globber = self.scan_path.rglob
 82        files = [
 83            str(file)
 84            for pattern in self.file_include_patterns
 85            for file in globber(pattern)
 86        ]
 87        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 88        modules = []
 89        self._module_names = []
 90        for file in files:
 91            module_name = Pathier(file).stem
 92            try:
 93                module = importlib.machinery.SourceFileLoader(
 94                    module_name, file
 95                ).load_module()
 96            except Exception as e:
 97                self.logger.exception(
 98                    f"Failed to load module '{module_name}' from '{file}'."
 99                )
100            else:
101                self._module_names.append(module_name)
102                modules.append(module)
103        gruels = [
104            getattr(module, class_)
105            for module in modules
106            for class_ in self.subgruel_classes
107            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
108        ]
109        self.logger.info(
110            "\n".join(
111                [f"Imported {len(gruels)} scrapers: "]
112                + [str(gruel) for gruel in gruels]
113            )
114        )
115        return gruels

Load scraper classes that inherit from Gruel.

NOTE: Classes are loaded, but scraper objects are not instantiated until the scrape() method is called.

:params:

directory: The path to scan for scraper classes.

class_names: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy.

include_patterns: Files that match these patterns will be scanned.

exclude_patterns: Files that match these patterns will not be scanned.

recursive: Whether the search should be recursive or not.

>>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
def pop_modules(self):
117    def pop_modules(self):
118        """Unload modules."""
119        for module in self._module_names:
120            sys.modules.pop(module)
121        self._module_names = []

Unload modules.

def get_bases(self, object: Any) -> list[typing.Any]:
123    def get_bases(self, object: Any) -> list[Any]:
124        """Returns a recursive list of all the classes `object` inherits from."""
125        parents = []
126        bases = object.__bases__
127        if not bases:
128            return parents
129        for base in bases:
130            parents.append(base)
131            parents.extend(self.get_bases(base))
132        return parents

Returns a recursive list of all the classes object inherits from.

def is_subgruel(self, object: Any) -> bool:
134    def is_subgruel(self, object: Any) -> bool:
135        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
136        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
137            return False
138        return True

Returns whether object inherits from Gruel somewhere in its ancestry.

def prescrape_chores(self):
140    def prescrape_chores(self):
141        """Override to add any tasks to be done before running the scrapers."""
142        ...

Override to add any tasks to be done before running the scrapers.

def postscrape_chores(self):
144    def postscrape_chores(self):
145        """Override to add any tasks to be done after running the scrapers."""
146        self.pop_modules()

Override to add any tasks to be done after running the scrapers.

def scrape(self, scrapers: list[gruel.gruel.Gruel]):
148    def scrape(self, scrapers: list[Gruel]):
149        """Run the `scrape()` method for each scraper in `scrapers`.
150
151        Execution is multithreaded."""
152        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
153        pool.execute()

Run the scrape() method for each scraper in scrapers.

Execution is multithreaded.

def logprint(self, message: str):
155    def logprint(self, message: str):
156        """Log and print `message`."""
157        self.logger.info(message)
158        print(message)

Log and print message.

def brew(self):
160    def brew(self):
161        """Execute pipeline.
162
163        1. self.prescrape_chores()
164        2. self.load_scrapers()
165        3. self.scrape()
166        4. self.postscrape_chores()"""
167
168        try:
169            self.logprint("Beginning brew")
170            # 1--------------------------------------------
171            self.logprint("Executing prescrape chores")
172            self.prescrape_chores()
173            # 2--------------------------------------------
174            self.logprint("Loading scrapers")
175            scrapers = self.load_scrapers()
176            print(f"Loaded {len(scrapers)} scrapers")
177            # 3--------------------------------------------
178            self.logprint("Starting scrape")
179            self.scrape(scrapers)
180            self.logprint("Scrape complete")
181            # 4--------------------------------------------
182            self.logprint("Executing postscrape chores")
183            self.postscrape_chores()
184            self.logprint("Brew complete")
185        except Exception as e:
186            print(e)
187            self.logger.exception("Exception occured during brew():")

Execute pipeline.

  1. self.prescrape_chores()
  2. self.load_scrapers()
  3. self.scrape()
  4. self.postscrape_chores()
def get_args() -> argparse.Namespace:
190def get_args() -> argparse.Namespace:
191    parser = argparse.ArgumentParser()
192
193    parser.add_argument(
194        "subgruel_classes",
195        type=str,
196        nargs="*",
197        help=""" A list of Gruel scraper class names to find and import. """,
198    )
199    parser.add_argument(
200        "-e",
201        "--excludes",
202        type=str,
203        nargs="*",
204        default=[],
205        help=""" A list of glob style file patterns to exclude from the scan. """,
206    )
207    parser.add_argument(
208        "-i",
209        "--includes",
210        type=str,
211        nargs="*",
212        default=["*.py"],
213        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
214    )
215    parser.add_argument(
216        "-p",
217        "--path",
218        type=str,
219        default=Pathier.cwd(),
220        help=""" The directory path to scan. Defaults to the current working directory. """,
221    )
222    parser.add_argument(
223        "-r",
224        "--recursive",
225        action="store_true",
226        help=""" Whether -p/--path should be scanned recursively or not. """,
227    )
228    args = parser.parse_args()
229    args.path = Pathier(args.path)
230
231    return args
def main(args: argparse.Namespace | None = None):
234def main(args: argparse.Namespace | None = None):
235    if not args:
236        args = get_args()
237    brewer = Brewer(
238        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
239    )
240    brewer.brew()