gruel.brewer
Module source:

import argparse
import importlib
import importlib.machinery
import importlib.util
import inspect
from types import ModuleType
from typing import Any, Sequence, Type

import loggi
import quickpool
from pathier import Pathier, Pathish
from younotyou import Matcher, younotyou

from gruel.grueler import Gruel


class GruelFinder:
    """Find and load classes that subclass `Gruel`."""

    def __init__(
        self,
        subgruel_classes: list[str] = ["*"],
        file_exclude_patterns: list[str] = [],
        scan_path: Pathier | None = None,
        file_include_patterns: list[str] = ["*.py"],
        recursive: bool = True,
        log_dir: Pathish | None = None,
    ):
        """#### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
        Can use wildcard ('*') patterns for matching.

        `file_exclude_patterns`: Files that match these patterns will not be scanned.

        `scan_path`: The path to scan for scraper classes.

        `file_include_patterns`: Files that match these patterns will be scanned.

        `recursive`: Whether the scan should be recursive or not.

        `log_dir`: The directory this instance's log should be saved to.
        If `None`, it will be saved to the current working directory.

        Will find and load all classes in the "scrapers" directory that inherit from `Gruel`
        and start with "MySubGruel", but don't contain "Scratch" in the name:
        >>> finder = GruelFinder(["MySubGruel*"], ["*Scratch*"], Pathier("scrapers"))
        >>> gruels = finder.find()"""
        self.subgruel_classes = subgruel_classes
        self.file_exclude_patterns = file_exclude_patterns
        self.scan_path = scan_path or Pathier.cwd()
        self.file_include_patterns = file_include_patterns
        self.recursive = recursive
        self.logger = loggi.getLogger(
            "gruel_finder", Pathier(log_dir) if log_dir else Pathier.cwd()
        )

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from."""
        parents = []
        bases = object.__bases__
        if not bases:
            return parents
        for base in bases:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` inherits from `Gruel` somewhere in its ancestry."""
        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
            return False
        return True

    def glob_files(self) -> list[Pathier]:
        """Search `self.scan_path` for files according to `self.file_include_patterns` and `self.file_exclude_patterns`.

        Returns the file list."""
        globber = self.scan_path.rglob if self.recursive else self.scan_path.glob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = [
            Pathier(file)
            for file in younotyou(files, exclude_patterns=self.file_exclude_patterns)
        ]
        return files

    def load_module_from_file(self, file: Pathier) -> ModuleType | None:
        """Attempts to load and return a module from `file`."""
        module_name = file.stem
        try:
            module = importlib.machinery.SourceFileLoader(
                module_name, str(file)
            ).load_module()
            self.logger.info(f"Successfully imported `{module_name}` from `{file}`.")
            return module
        except Exception:
            self.logger.exception(f"Failed to load `{module_name}` from `{file}`.")

    def strain_for_gruel(self, modules: list[ModuleType]) -> list[Type[Gruel]]:
        """Searches `modules` for classes that inherit from `Gruel` and are in `self.subgruel_classes`.

        Returns the list of classes."""
        matcher = Matcher(self.subgruel_classes)
        return [
            getattr(module, class_)
            for module in modules
            for class_ in dir(module)
            if class_ in matcher and self.is_subgruel(getattr(module, class_))
        ]

    def find(self) -> list[Type[Gruel]]:
        """Run the scan and return `Gruel` subclasses."""
        files = self.glob_files()
        modules = []
        for file in files:
            if module := self.load_module_from_file(file):
                modules.append(module)
        return self.strain_for_gruel(modules)


class Brewer:
    """Runs a list of scrapers using multithreaded execution.

    Intended to be used with `Gruel` scrapers, but anything with a `scrape` method can be passed.

    To run any `Gruel` scrapers from the current directory:
    >>> Brewer(GruelFinder().find()).brew()

    The `prescrape_chores` and `postscrape_chores` methods can be set/overridden like the same methods in `Gruel`.

    When calling the `brew` method, they are executed once before and once after all the scrapers have run, i.e.
    >>> brewer = Brewer(GruelFinder().find())
    >>> brewer.prescrape_chores()
    >>> results = brewer.scrape()
    >>> brewer.postscrape_chores()

    is equivalent to
    >>> results = Brewer(GruelFinder().find()).brew()

    except that `brew()` also logs its progress."""

    def __init__(
        self,
        scrapers: Sequence[Any],
        scraper_args: Sequence[Sequence[Any]] = [],
        scraper_kwargs: Sequence[dict[str, Any]] = [],
        log_dir: Pathish | None = None,
    ):
        """#### :params:

        `scrapers`: A list of scraper classes to initialize and execute.
        A scraper should not be instantiated before being passed.
        When `Brewer` runs a scraper, it will instantiate the object at execution time and call its `scrape` method.

        `scraper_args`: A list where each element is a list of positional arguments to be passed to the corresponding scraper's `__init__` function.

        `scraper_kwargs`: A list of dictionaries where each dictionary is a set of keyword arguments to be passed to the corresponding scraper's `__init__` function.

        `log_dir`: The directory to store `Brewer` logs in. Defaults to the current working directory.

        e.g.
        >>> class MyGruel(Gruel):
        >>>     def __init__(self, value: int):
        >>>         super().__init__()
        >>>         self.value = value
        >>>
        >>>     def scrape(self) -> int:
        >>>         return self.value
        >>>
        >>> num_scrapers = 5
        >>> values = list(range(5))
        >>> brewer = Brewer(
        >>>     [MyGruel] * num_scrapers,
        >>>     [(val,) for val in values],
        >>> )
        >>> results = brewer.brew()
        >>> print(results)
        [0, 1, 2, 3, 4]"""
        self._init_logger(log_dir)
        self.scrapers = scrapers
        num_scrapers = len(self.scrapers)
        # Pad args and kwargs if there aren't any given
        self.scraper_args = scraper_args or [[]] * num_scrapers
        self.scraper_kwargs = scraper_kwargs or [{}] * num_scrapers

    def _init_logger(self, log_dir: Pathish | None = None):
        # When Brewer is subclassed, use that file's stem instead of `brewer`
        log_dir = Pathier(log_dir) if log_dir else Pathier.cwd()
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = loggi.getLogger(log_name, log_dir)

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        ...

    def _prep_scrapers(self) -> list[tuple[Any, Sequence[Any], dict[str, Any]]]:
        return [
            (scraper, args, kwargs)
            for scraper, args, kwargs in zip(
                self.scrapers, self.scraper_args, self.scraper_kwargs
            )
        ]

    def scrape(self) -> list[Any]:
        """Run the `scrape()` method for each scraper in `scrapers`.

        Execution is multithreaded."""

        def execute(scraper, args, kwargs):
            return scraper(*args, **kwargs).scrape()

        pool = quickpool.ThreadPool(
            [execute] * len(self.scrapers), self._prep_scrapers()
        )
        return pool.execute()

    def brew(self) -> list[Any] | None:
        """Execute pipeline.

        1. self.prescrape_chores()
        2. self.scrape()
        3. self.postscrape_chores()"""

        try:
            self.logger.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logger.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logger.logprint("Starting scrape")
            results = self.scrape()
            self.logger.logprint("Scrape complete")
            # 3--------------------------------------------
            self.logger.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logger.logprint("Brew complete")
            return results
        except Exception as e:
            print(e)
            self.logger.exception("Exception occurred during brew():")


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="Brewer", description="Invoke `Brewer` from the command line."
    )

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    parser.add_argument(
        "-l",
        "--log_dir",
        type=str,
        default=None,
        help=""" The directory to save the brew log to.""",
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    if not args:
        args = get_args()
    finder = GruelFinder(
        args.subgruel_classes,
        args.excludes,
        args.path,
        args.includes,
        args.recursive,
        args.log_dir,
    )
    brewer = Brewer(
        finder.find(),
        log_dir=args.log_dir,
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
class GruelFinder:
Find and load classes that subclass `Gruel`.
def __init__(
    self,
    subgruel_classes: list[str] = ["*"],
    file_exclude_patterns: list[str] = [],
    scan_path: Pathier | None = None,
    file_include_patterns: list[str] = ["*.py"],
    recursive: bool = True,
    log_dir: Pathish | None = None,
)
:params:

`subgruel_classes`: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.
Can use wildcard ('*') patterns for matching.

`file_exclude_patterns`: Files that match these patterns will not be scanned.

`scan_path`: The path to scan for scraper classes.

`file_include_patterns`: Files that match these patterns will be scanned.

`recursive`: Whether the scan should be recursive or not.

`log_dir`: The directory this instance's log should be saved to.
If `None`, it will be saved to the current working directory.

Will find and load all classes in the "scrapers" directory that inherit from `Gruel`
and start with "MySubGruel", but don't contain "Scratch" in the name:
>>> finder = GruelFinder(["MySubGruel*"], ["*Scratch*"], Pathier("scrapers"))
>>> gruels = finder.find()
def get_bases(self, object: Any) -> list[Any]
Returns a recursive list of all the classes `object` inherits from.
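For single inheritance this matches `inspect.getmro(object)` minus the class itself; under multiple inheritance the list may contain duplicates, which is harmless for the membership test in `is_subgruel`. A minimal sketch with hypothetical classes (reprs abbreviated):

>>> class Base: ...
>>> class Child(Base): ...
>>> GruelFinder().get_bases(Child)
[<class 'Base'>, <class 'object'>]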
def is_subgruel(self, object: Any) -> bool
Returns whether `object` inherits from `Gruel` somewhere in its ancestry.
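Because the check walks `__bases__` rather than calling `issubclass`, `Gruel` itself does not count as a subgruel; only strict subclasses do. A quick sketch with a hypothetical scraper class:

>>> class JobGruel(Gruel): ...
>>> finder = GruelFinder()
>>> finder.is_subgruel(JobGruel)
True
>>> finder.is_subgruel(Gruel)
False
>>> finder.is_subgruel("not a class")
False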
def glob_files(self) -> list[Pathier]
Search `self.scan_path` for files according to `self.file_include_patterns` and `self.file_exclude_patterns`.

Returns the file list.
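The include patterns drive the glob itself; the exclude patterns are applied afterwards as a string filter via `younotyou`. A sketch against a hypothetical directory layout (output reprs illustrative):

>>> finder = GruelFinder(
>>>     file_exclude_patterns=["*_draft*"],
>>>     scan_path=Pathier("scrapers"),
>>> )
>>> # given scrapers/jobs.py, scrapers/news.py, and scrapers/news_draft.py,
>>> # glob_files() keeps only the first two:
>>> finder.glob_files()
[Pathier('scrapers/jobs.py'), Pathier('scrapers/news.py')]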
def load_module_from_file(self, file: Pathier) -> ModuleType | None
Attempts to load and return a module from `file`; returns `None` if the import fails (the failure is logged).
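`SourceFileLoader.load_module` is deprecated in current Python; a sketch of the equivalent load using the `importlib.util` spec API (already imported by this module), where `file` is the `Pathier` argument:

>>> spec = importlib.util.spec_from_file_location(file.stem, file)
>>> if spec and spec.loader:
>>>     module = importlib.util.module_from_spec(spec)
>>>     spec.loader.exec_module(module)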
def strain_for_gruel(self, modules: list[ModuleType]) -> list[Type[Gruel]]
Searches `modules` for classes that inherit from `Gruel` and whose names match `self.subgruel_classes`.

Returns the list of classes.
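Name matching goes through `younotyou.Matcher`, so the same wildcard syntax accepted by `subgruel_classes` applies here. E.g., for a hypothetical loaded module defining `JobGruel(Gruel)` alongside an unrelated `Helper` class:

>>> finder = GruelFinder(subgruel_classes=["Job*"])
>>> finder.strain_for_gruel([module])  # `module`: a hypothetical loaded ModuleType
[<class 'module.JobGruel'>]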
def find(self) -> list[Type[Gruel]]
Run the scan and return `Gruel` subclasses.
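A typical end-to-end run over a hypothetical `scrapers` directory (output illustrative):

>>> finder = GruelFinder(["*Gruel"], scan_path=Pathier("scrapers"))
>>> for scraper in finder.find():
>>>     print(scraper.__name__)
JobGruel
NewsGruel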
class Brewer:
Runs a list of scrapers using multithreaded execution.

Intended to be used with `Gruel` scrapers, but anything with a `scrape` method can be passed.

To run any `Gruel` scrapers from the current directory:
>>> Brewer(GruelFinder().find()).brew()

The `prescrape_chores` and `postscrape_chores` methods can be set/overridden like the same methods in `Gruel`.
When calling the `brew` method, they are executed once before and once after all the scrapers have run, i.e.
>>> brewer = Brewer(GruelFinder().find())
>>> brewer.prescrape_chores()
>>> results = brewer.scrape()
>>> brewer.postscrape_chores()

is equivalent to
>>> results = Brewer(GruelFinder().find()).brew()

except that `brew()` also logs its progress.
def __init__(
    self,
    scrapers: Sequence[Any],
    scraper_args: Sequence[Sequence[Any]] = [],
    scraper_kwargs: Sequence[dict[str, Any]] = [],
    log_dir: Pathish | None = None,
)
:params:

`scrapers`: A list of scraper classes to initialize and execute.
A scraper should not be instantiated before being passed.
When `Brewer` runs a scraper, it will instantiate the object at execution time and call its `scrape` method.

`scraper_args`: A list where each element is a list of positional arguments to be passed to the corresponding scraper's `__init__` function.

`scraper_kwargs`: A list of dictionaries where each dictionary is a set of keyword arguments to be passed to the corresponding scraper's `__init__` function.

`log_dir`: The directory to store `Brewer` logs in. Defaults to the current working directory.

e.g.
>>> class MyGruel(Gruel):
>>>     def __init__(self, value: int):
>>>         super().__init__()
>>>         self.value = value
>>>
>>>     def scrape(self) -> int:
>>>         return self.value
>>>
>>> num_scrapers = 5
>>> values = list(range(5))
>>> brewer = Brewer(
>>>     [MyGruel] * num_scrapers,
>>>     [(val,) for val in values],
>>> )
>>> results = brewer.brew()
>>> print(results)
[0, 1, 2, 3, 4]
def prescrape_chores(self)
Override to add any tasks to be done before running the scrapers.
def postscrape_chores(self)
Override to add any tasks to be done after running the scrapers.
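For example, a subclass might set up and tear down shared resources around the whole batch (a sketch; the subclass name and chore bodies are hypothetical):

>>> class NightlyBrewer(Brewer):
>>>     def prescrape_chores(self):
>>>         self.logger.info("Clearing yesterday's output.")
>>>
>>>     def postscrape_chores(self):
>>>         self.logger.info("Compressing results.")
>>>
>>> results = NightlyBrewer(GruelFinder().find()).brew()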
def scrape(self) -> list[Any]
Run the `scrape()` method for each scraper in `scrapers`.

Execution is multithreaded.
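Behaviorally this is close to mapping over a standard-library thread pool; a rough equivalent sketch, assuming `quickpool.ThreadPool` returns results in submission order:

>>> from concurrent.futures import ThreadPoolExecutor
>>> def execute(scraper, args, kwargs):
>>>     return scraper(*args, **kwargs).scrape()
>>> with ThreadPoolExecutor() as pool:
>>>     results = list(pool.map(lambda job: execute(*job), brewer._prep_scrapers()))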
def brew(self) -> list[Any] | None
Execute the pipeline:

1. `self.prescrape_chores()`
2. `self.scrape()`
3. `self.postscrape_chores()`
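Note that `brew()` catches any exception, logs the traceback, and returns `None` rather than raising, so callers should check the result:

>>> results = brewer.brew()
>>> if results is None:
>>>     print("Brew failed; see the log for details.")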
def get_args() -> argparse.Namespace
def main(args: argparse.Namespace | None = None)
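Assuming the package makes this module runnable as `gruel.brewer`, a command-line run that recursively finds and brews every class starting with "MyGruel" under `scrapers/` might look like:

python -m gruel.brewer "MyGruel*" -p scrapers -r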