gruel.brewer

  1import argparse
  2import importlib
  3import importlib.machinery
  4import importlib.util
  5import inspect
  6import logging
  7import sys
  8from typing import Any
  9
 10import quickpool
 11from pathier import Pathier, Pathish
 12from younotyou import younotyou
 13
 14from gruel import Gruel
 15
 16
 17class Brewer:
 18    def __init__(
 19        self,
 20        subgruel_classes: list[str],
 21        file_exclude_patterns: list[str] = [],
 22        scan_path: Pathish = Pathier.cwd(),
 23        file_include_patterns: list[str] = ["*.py"],
 24        recursive: bool = True,
 25    ):
 26        """Run `Gruel` scrapers.
 27
 28        #### :params:
 29
 30        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 31        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 32
 33        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 34
 35        `scan_path`: The path to scan for scraper classes.
 36
 37        `file_include_patterns`: Files that match these patterns will be scanned.
 38
 39        `recursive`: Whether the scan should be recursive or not.
 40
 41        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
 42        >>> brewer.brew()"""
 43        self._init_logger()
 44        self.subgruel_classes = subgruel_classes
 45        self.file_exclude_patterns = file_exclude_patterns
 46        self.file_include_patterns = file_include_patterns
 47        self.scan_path = Pathier(scan_path)
 48        self.recursive = recursive
 49
 50    def _init_logger(self):
 51        # When Brewer is subclassed, use that file's stem instead of `brewer`
 52        source_file = inspect.getsourcefile(type(self))
 53        if source_file:
 54            log_name = Pathier(source_file).stem
 55        else:
 56            log_name = Pathier(__file__).stem
 57        self.logger = logging.getLogger(log_name)
 58        if not self.logger.hasHandlers():
 59            handler = logging.FileHandler(log_name + ".log")
 60            handler.setFormatter(
 61                logging.Formatter(
 62                    "{levelname}|-|{asctime}|-|{message}",
 63                    style="{",
 64                    datefmt="%m/%d/%Y %I:%M:%S %p",
 65                )
 66            )
 67            self.logger.addHandler(handler)
 68            self.logger.setLevel(logging.INFO)
 69
 70    def load_scrapers(self) -> list[Gruel]:
 71        """Load scraper classes that inherit from `Gruel`.
 72
 73        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 74
 75        #### :params:
 76
 77        `directory`: The path to scan for scraper classes.
 78
 79        `class_names`: A list of class names for scrapers that should be loaded.
 80        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 81
 82        `include_patterns`: Files that match these patterns will be scanned.
 83
 84        `exclude_patterns`: Files that match these patterns will not be scanned.
 85
 86        `recursive`: Whether the search should be recursive or not.
 87
 88        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])"""
 89        globber = self.scan_path.glob
 90        if self.recursive:
 91            globber = self.scan_path.rglob
 92        files = [
 93            str(file)
 94            for pattern in self.file_include_patterns
 95            for file in globber(pattern)
 96        ]
 97        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 98        modules = []
 99        self._module_names = []
100        for file in files:
101            module_name = Pathier(file).stem
102            try:
103                module = importlib.machinery.SourceFileLoader(
104                    module_name, file
105                ).load_module()
106            except Exception as e:
107                self.logger.exception(
108                    f"Failed to load module '{module_name}' from '{file}'."
109                )
110            else:
111                self._module_names.append(module_name)
112                modules.append(module)
113        gruels = [
114            getattr(module, class_)
115            for module in modules
116            for class_ in self.subgruel_classes
117            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
118        ]
119        self.logger.info(
120            "\n".join(
121                [f"Imported {len(gruels)} scrapers: "]
122                + [str(gruel) for gruel in gruels]
123            )
124        )
125        return gruels
126
127    def pop_modules(self):
128        """Unload modules."""
129        for module in self._module_names:
130            sys.modules.pop(module)
131        self._module_names = []
132
133    def get_bases(self, object: Any) -> list[Any]:
134        """Returns a recursive list of all the classes `object` inherits from."""
135        parents = []
136        bases = object.__bases__
137        if not bases:
138            return parents
139        for base in bases:
140            parents.append(base)
141            parents.extend(self.get_bases(base))
142        return parents
143
144    def is_subgruel(self, object: Any) -> bool:
145        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
146        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
147            return False
148        return True
149
150    def prescrape_chores(self):
151        """Override to add any tasks to be done before running the scrapers."""
152        ...
153
154    def postscrape_chores(self):
155        """Override to add any tasks to be done after running the scrapers."""
156        self.pop_modules()
157
158    def scrape(self, scrapers: list[Gruel]):
159        """Run the `scrape()` method for each scraper in `scrapers`.
160
161        Execution is multithreaded."""
162        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
163        pool.execute()
164
165    def logprint(self, message: str):
166        """Log and print `message`."""
167        self.logger.info(message)
168        print(message)
169
170    def brew(self):
171        """Execute pipeline.
172
173        1. self.prescrape_chores()
174        2. self.load_scrapers()
175        3. self.scrape()
176        4. self.postscrape_chores()"""
177
178        try:
179            self.logprint("Beginning brew")
180            # 1--------------------------------------------
181            self.logprint("Executing prescrape chores")
182            self.prescrape_chores()
183            # 2--------------------------------------------
184            self.logprint("Loading scrapers")
185            scrapers = self.load_scrapers()
186            print(f"Loaded {len(scrapers)} scrapers")
187            # 3--------------------------------------------
188            self.logprint("Starting scrape")
189            self.scrape(scrapers)
190            self.logprint("Scrape complete")
191            # 4--------------------------------------------
192            self.logprint("Executing postscrape chores")
193            self.postscrape_chores()
194            self.logprint("Brew complete")
195        except Exception as e:
196            print(e)
197            self.logger.exception("Exception occured during brew():")
198
199
def get_args() -> argparse.Namespace:
    """Build the CLI parser, parse `sys.argv`, and return the namespace.

    The `path` attribute is converted to a `Pathier` instance before returning."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    argparser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    argparser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    argparser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    argparser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    parsed = argparser.parse_args()
    parsed.path = Pathier(parsed.path)
    return parsed
242
243
def main(args: argparse.Namespace | None = None):
    """Construct a `Brewer` from CLI arguments and run its pipeline.

    `args`: A pre-parsed namespace; when `None`, arguments are read from the
    command line via `get_args()`."""
    # Explicit `is None` check: argparse.Namespace has no __bool__, so relying
    # on truthiness here only works by accident.
    if args is None:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
    )
    brewer.brew()
251
252
if __name__ == "__main__":
    # Script entry point: parse CLI args and run the brew pipeline.
    main(get_args())
class Brewer:
 18class Brewer:
 19    def __init__(
 20        self,
 21        subgruel_classes: list[str],
 22        file_exclude_patterns: list[str] = [],
 23        scan_path: Pathish = Pathier.cwd(),
 24        file_include_patterns: list[str] = ["*.py"],
 25        recursive: bool = True,
 26    ):
 27        """Run `Gruel` scrapers.
 28
 29        #### :params:
 30
 31        `subgruel_classes`: A list of class names for scrapers that should be loaded.
 32        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 33
 34        `file_exclude_patterns`: Files that match these patterns will not be scanned.
 35
 36        `scan_path`: The path to scan for scraper classes.
 37
 38        `file_include_patterns`: Files that match these patterns will be scanned.
 39
 40        `recursive`: Whether the scan should be recursive or not.
 41
 42        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
 43        >>> brewer.brew()"""
 44        self._init_logger()
 45        self.subgruel_classes = subgruel_classes
 46        self.file_exclude_patterns = file_exclude_patterns
 47        self.file_include_patterns = file_include_patterns
 48        self.scan_path = Pathier(scan_path)
 49        self.recursive = recursive
 50
 51    def _init_logger(self):
 52        # When Brewer is subclassed, use that file's stem instead of `brewer`
 53        source_file = inspect.getsourcefile(type(self))
 54        if source_file:
 55            log_name = Pathier(source_file).stem
 56        else:
 57            log_name = Pathier(__file__).stem
 58        self.logger = logging.getLogger(log_name)
 59        if not self.logger.hasHandlers():
 60            handler = logging.FileHandler(log_name + ".log")
 61            handler.setFormatter(
 62                logging.Formatter(
 63                    "{levelname}|-|{asctime}|-|{message}",
 64                    style="{",
 65                    datefmt="%m/%d/%Y %I:%M:%S %p",
 66                )
 67            )
 68            self.logger.addHandler(handler)
 69            self.logger.setLevel(logging.INFO)
 70
 71    def load_scrapers(self) -> list[Gruel]:
 72        """Load scraper classes that inherit from `Gruel`.
 73
 74        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 75
 76        #### :params:
 77
 78        `directory`: The path to scan for scraper classes.
 79
 80        `class_names`: A list of class names for scrapers that should be loaded.
 81        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 82
 83        `include_patterns`: Files that match these patterns will be scanned.
 84
 85        `exclude_patterns`: Files that match these patterns will not be scanned.
 86
 87        `recursive`: Whether the search should be recursive or not.
 88
 89        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])"""
 90        globber = self.scan_path.glob
 91        if self.recursive:
 92            globber = self.scan_path.rglob
 93        files = [
 94            str(file)
 95            for pattern in self.file_include_patterns
 96            for file in globber(pattern)
 97        ]
 98        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 99        modules = []
100        self._module_names = []
101        for file in files:
102            module_name = Pathier(file).stem
103            try:
104                module = importlib.machinery.SourceFileLoader(
105                    module_name, file
106                ).load_module()
107            except Exception as e:
108                self.logger.exception(
109                    f"Failed to load module '{module_name}' from '{file}'."
110                )
111            else:
112                self._module_names.append(module_name)
113                modules.append(module)
114        gruels = [
115            getattr(module, class_)
116            for module in modules
117            for class_ in self.subgruel_classes
118            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
119        ]
120        self.logger.info(
121            "\n".join(
122                [f"Imported {len(gruels)} scrapers: "]
123                + [str(gruel) for gruel in gruels]
124            )
125        )
126        return gruels
127
128    def pop_modules(self):
129        """Unload modules."""
130        for module in self._module_names:
131            sys.modules.pop(module)
132        self._module_names = []
133
134    def get_bases(self, object: Any) -> list[Any]:
135        """Returns a recursive list of all the classes `object` inherits from."""
136        parents = []
137        bases = object.__bases__
138        if not bases:
139            return parents
140        for base in bases:
141            parents.append(base)
142            parents.extend(self.get_bases(base))
143        return parents
144
145    def is_subgruel(self, object: Any) -> bool:
146        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
147        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
148            return False
149        return True
150
151    def prescrape_chores(self):
152        """Override to add any tasks to be done before running the scrapers."""
153        ...
154
155    def postscrape_chores(self):
156        """Override to add any tasks to be done after running the scrapers."""
157        self.pop_modules()
158
159    def scrape(self, scrapers: list[Gruel]):
160        """Run the `scrape()` method for each scraper in `scrapers`.
161
162        Execution is multithreaded."""
163        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
164        pool.execute()
165
166    def logprint(self, message: str):
167        """Log and print `message`."""
168        self.logger.info(message)
169        print(message)
170
171    def brew(self):
172        """Execute pipeline.
173
174        1. self.prescrape_chores()
175        2. self.load_scrapers()
176        3. self.scrape()
177        4. self.postscrape_chores()"""
178
179        try:
180            self.logprint("Beginning brew")
181            # 1--------------------------------------------
182            self.logprint("Executing prescrape chores")
183            self.prescrape_chores()
184            # 2--------------------------------------------
185            self.logprint("Loading scrapers")
186            scrapers = self.load_scrapers()
187            print(f"Loaded {len(scrapers)} scrapers")
188            # 3--------------------------------------------
189            self.logprint("Starting scrape")
190            self.scrape(scrapers)
191            self.logprint("Scrape complete")
192            # 4--------------------------------------------
193            self.logprint("Executing postscrape chores")
194            self.postscrape_chores()
195            self.logprint("Brew complete")
196        except Exception as e:
197            print(e)
198            self.logger.exception("Exception occured during brew():")
Brewer( subgruel_classes: list[str], file_exclude_patterns: list[str] = [], scan_path: pathier.pathier.Pathier | pathlib.Path | str = WindowsPath('E:/1vsCode/python/gruel'), file_include_patterns: list[str] = ['*.py'], recursive: bool = True)
19    def __init__(
20        self,
21        subgruel_classes: list[str],
22        file_exclude_patterns: list[str] = [],
23        scan_path: Pathish = Pathier.cwd(),
24        file_include_patterns: list[str] = ["*.py"],
25        recursive: bool = True,
26    ):
27        """Run `Gruel` scrapers.
28
29        #### :params:
30
31        `subgruel_classes`: A list of class names for scrapers that should be loaded.
32        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
33
34        `file_exclude_patterns`: Files that match these patterns will not be scanned.
35
36        `scan_path`: The path to scan for scraper classes.
37
38        `file_include_patterns`: Files that match these patterns will be scanned.
39
40        `recursive`: Whether the scan should be recursive or not.
41
42        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
43        >>> brewer.brew()"""
44        self._init_logger()
45        self.subgruel_classes = subgruel_classes
46        self.file_exclude_patterns = file_exclude_patterns
47        self.file_include_patterns = file_include_patterns
48        self.scan_path = Pathier(scan_path)
49        self.recursive = recursive

Run Gruel scrapers.

:params:

subgruel_classes: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy.

file_exclude_patterns: Files that match these patterns will not be scanned.

scan_path: The path to scan for scraper classes.

file_include_patterns: Files that match these patterns will be scanned.

recursive: Whether the scan should be recursive or not.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
def load_scrapers(self) -> list[gruel.gruel.Gruel]:
 71    def load_scrapers(self) -> list[Gruel]:
 72        """Load scraper classes that inherit from `Gruel`.
 73
 74        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.
 75
 76        #### :params:
 77
 78        `directory`: The path to scan for scraper classes.
 79
 80        `class_names`: A list of class names for scrapers that should be loaded.
 81        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
 82
 83        `include_patterns`: Files that match these patterns will be scanned.
 84
 85        `exclude_patterns`: Files that match these patterns will not be scanned.
 86
 87        `recursive`: Whether the search should be recursive or not.
 88
 89        >>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])"""
 90        globber = self.scan_path.glob
 91        if self.recursive:
 92            globber = self.scan_path.rglob
 93        files = [
 94            str(file)
 95            for pattern in self.file_include_patterns
 96            for file in globber(pattern)
 97        ]
 98        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
 99        modules = []
100        self._module_names = []
101        for file in files:
102            module_name = Pathier(file).stem
103            try:
104                module = importlib.machinery.SourceFileLoader(
105                    module_name, file
106                ).load_module()
107            except Exception as e:
108                self.logger.exception(
109                    f"Failed to load module '{module_name}' from '{file}'."
110                )
111            else:
112                self._module_names.append(module_name)
113                modules.append(module)
114        gruels = [
115            getattr(module, class_)
116            for module in modules
117            for class_ in self.subgruel_classes
118            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
119        ]
120        self.logger.info(
121            "\n".join(
122                [f"Imported {len(gruels)} scrapers: "]
123                + [str(gruel) for gruel in gruels]
124            )
125        )
126        return gruels

Load scraper classes that inherit from Gruel.

NOTE: Classes are loaded, but scraper objects are not instantiated until the scrape() method is called.

:params:

directory: The path to scan for scraper classes.

class_names: A list of class names for scrapers that should be loaded. In order to be loaded, a scraper class must have a name in this list and have Gruel somewhere in its inheritance hierarchy.

include_patterns: Files that match these patterns will be scanned.

exclude_patterns: Files that match these patterns will not be scanned.

recursive: Whether the search should be recursive or not.

>>> load_scrapers("getToTheGig/scrapers", ["VenueScraper"], ["*.py"], ["*template*", "*giggruel*"])
def pop_modules(self):
128    def pop_modules(self):
129        """Unload modules."""
130        for module in self._module_names:
131            sys.modules.pop(module)
132        self._module_names = []

Unload modules.

def get_bases(self, object: Any) -> list[typing.Any]:
134    def get_bases(self, object: Any) -> list[Any]:
135        """Returns a recursive list of all the classes `object` inherits from."""
136        parents = []
137        bases = object.__bases__
138        if not bases:
139            return parents
140        for base in bases:
141            parents.append(base)
142            parents.extend(self.get_bases(base))
143        return parents

Returns a recursive list of all the classes object inherits from.

def is_subgruel(self, object: Any) -> bool:
145    def is_subgruel(self, object: Any) -> bool:
146        """Returns whether `object` inherits from `Gruel` somewhere in its ancestory."""
147        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
148            return False
149        return True

Returns whether object inherits from Gruel somewhere in its ancestry.

def prescrape_chores(self):
151    def prescrape_chores(self):
152        """Override to add any tasks to be done before running the scrapers."""
153        ...

Override to add any tasks to be done before running the scrapers.

def postscrape_chores(self):
155    def postscrape_chores(self):
156        """Override to add any tasks to be done after running the scrapers."""
157        self.pop_modules()

Override to add any tasks to be done after running the scrapers.

def scrape(self, scrapers: list[gruel.gruel.Gruel]):
159    def scrape(self, scrapers: list[Gruel]):
160        """Run the `scrape()` method for each scraper in `scrapers`.
161
162        Execution is multithreaded."""
163        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
164        pool.execute()

Run the scrape() method for each scraper in scrapers.

Execution is multithreaded.

def logprint(self, message: str):
166    def logprint(self, message: str):
167        """Log and print `message`."""
168        self.logger.info(message)
169        print(message)

Log and print message.

def brew(self):
171    def brew(self):
172        """Execute pipeline.
173
174        1. self.prescrape_chores()
175        2. self.load_scrapers()
176        3. self.scrape()
177        4. self.postscrape_chores()"""
178
179        try:
180            self.logprint("Beginning brew")
181            # 1--------------------------------------------
182            self.logprint("Executing prescrape chores")
183            self.prescrape_chores()
184            # 2--------------------------------------------
185            self.logprint("Loading scrapers")
186            scrapers = self.load_scrapers()
187            print(f"Loaded {len(scrapers)} scrapers")
188            # 3--------------------------------------------
189            self.logprint("Starting scrape")
190            self.scrape(scrapers)
191            self.logprint("Scrape complete")
192            # 4--------------------------------------------
193            self.logprint("Executing postscrape chores")
194            self.postscrape_chores()
195            self.logprint("Brew complete")
196        except Exception as e:
197            print(e)
198            self.logger.exception("Exception occured during brew():")

Execute pipeline.

  1. self.prescrape_chores()
  2. self.load_scrapers()
  3. self.scrape()
  4. self.postscrape_chores()
def get_args() -> argparse.Namespace:
201def get_args() -> argparse.Namespace:
202    parser = argparse.ArgumentParser()
203
204    parser.add_argument(
205        "subgruel_classes",
206        type=str,
207        nargs="*",
208        help=""" A list of Gruel scraper class names to find and import. """,
209    )
210    parser.add_argument(
211        "-e",
212        "--excludes",
213        type=str,
214        nargs="*",
215        default=[],
216        help=""" A list of glob style file patterns to exclude from the scan. """,
217    )
218    parser.add_argument(
219        "-i",
220        "--includes",
221        type=str,
222        nargs="*",
223        default=["*.py"],
224        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
225    )
226    parser.add_argument(
227        "-p",
228        "--path",
229        type=str,
230        default=Pathier.cwd(),
231        help=""" The directory path to scan. Defaults to the current working directory. """,
232    )
233    parser.add_argument(
234        "-r",
235        "--recursive",
236        action="store_true",
237        help=""" Whether -p/--path should be scanned recursively or not. """,
238    )
239    args = parser.parse_args()
240    args.path = Pathier(args.path)
241
242    return args
def main(args: argparse.Namespace | None = None):
245def main(args: argparse.Namespace | None = None):
246    if not args:
247        args = get_args()
248    brewer = Brewer(
249        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
250    )
251    brewer.brew()