gruel.brewer
```python
import argparse
import importlib.util
import inspect
import sys
from typing import Any

import loggi
import quickpool
from pathier import Pathier, Pathish
from younotyou import younotyou

from gruel import Gruel


class Brewer:
    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] = [],
        scan_path: Pathish = Pathier.cwd(),
        file_include_patterns: list[str] = ["*.py"],
        recursive: bool = True,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned.

        `scan_path`: The path to scan for scraper classes.

        `file_include_patterns`: Files that match these patterns will be scanned.

        `recursive`: Whether the scan should be recursive or not.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger()
        self.subgruel_classes = subgruel_classes
        self.file_exclude_patterns = file_exclude_patterns
        self.file_include_patterns = file_include_patterns
        self.scan_path = Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self):
        # When Brewer is subclassed, use that file's stem instead of `brewer`.
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = loggi.getLogger(log_name)

    def load_scrapers(self) -> list[Gruel]:
        """Load scraper classes that inherit from `Gruel`.

        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

        The method takes no arguments; the scan is driven by the attributes set in `__init__`:
        `scan_path` is searched (recursively when `recursive` is `True`) for files matching
        `file_include_patterns` but not `file_exclude_patterns`, and every class whose name is in
        `subgruel_classes` and that has `Gruel` somewhere in its inheritance hierarchy is returned.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers")
        >>> scrapers = brewer.load_scrapers()"""
        globber = self.scan_path.glob
        if self.recursive:
            globber = self.scan_path.rglob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        modules = []
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                # `SourceFileLoader.load_module()` is deprecated; load via a spec so the
                # file imports without needing to be on `sys.path`.
                spec = importlib.util.spec_from_file_location(module_name, file)
                if spec is None or spec.loader is None:
                    raise ImportError(f"No loader for '{file}'.")
                module = importlib.util.module_from_spec(spec)
                # Register the module so `pop_modules()` can unload it later.
                sys.modules[module_name] = module
                spec.loader.exec_module(module)
            except Exception:
                sys.modules.pop(module_name, None)
                self.logger.exception(
                    f"Failed to load module '{module_name}' from '{file}'."
                )
            else:
                self._module_names.append(module_name)
                modules.append(module)
        gruels = [
            getattr(module, class_)
            for module in modules
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers: "]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload the modules imported by `load_scrapers()`."""
        for module in self._module_names:
            sys.modules.pop(module, None)
        self._module_names = []

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from."""
        parents = []
        bases = object.__bases__
        if not bases:
            return parents
        for base in bases:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` inherits from `Gruel` somewhere in its ancestry."""
        return inspect.isclass(object) and Gruel in self.get_bases(object)

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        self.pop_modules()

    def scrape(self, scrapers: list[Gruel]):
        """Run the `scrape()` method for each scraper in `scrapers`.

        Execution is multithreaded."""
        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
        pool.execute()

    def logprint(self, message: str):
        """Log and print `message`."""
        self.logger.info(message)
        print(message)

    def brew(self):
        """Execute pipeline.

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""
        try:
            self.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logprint("Loading scrapers")
            scrapers = self.load_scrapers()
            print(f"Loaded {len(scrapers)} scrapers")
            # 3--------------------------------------------
            self.logprint("Starting scrape")
            self.scrape(scrapers)
            self.logprint("Scrape complete")
            # 4--------------------------------------------
            self.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logprint("Brew complete")
        except Exception as e:
            print(e)
            self.logger.exception("Exception occurred during brew():")


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    if not args:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
```
### class Brewer

```python
Brewer(
    subgruel_classes: list[str],
    file_exclude_patterns: list[str] = [],
    scan_path: Pathish = Pathier.cwd(),
    file_include_patterns: list[str] = ["*.py"],
    recursive: bool = True,
)
```

Run `Gruel` scrapers.

#### :params:

`subgruel_classes`: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

`file_exclude_patterns`: Files that match these patterns will not be scanned.

`scan_path`: The path to scan for scraper classes.

`file_include_patterns`: Files that match these patterns will be scanned.

`recursive`: Whether the scan should be recursive or not.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
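
For a concrete picture of what gets discovered, here is a minimal sketch of a scraper file such a `Brewer` could find. The file name and class body are hypothetical; the only requirements visible in this module are that the class is named in `subgruel_classes` and inherits from `Gruel` (whose `scrape()` method is what `Brewer.scrape()` ultimately calls):

```python
# scrapers/venue_scraper.py -- hypothetical file matched by "*.py"
from gruel import Gruel


class VenueScraper(Gruel):
    """Named in `subgruel_classes` and inherits `Gruel`, so `load_scrapers()`
    will pick it up; `Brewer.scrape()` later instantiates it and calls `scrape()`."""

    # Scraper-specific logic would go here; what to override depends on the
    # `Gruel` base class.
    ...
```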
### Brewer.load_scrapers

`def load_scrapers(self) -> list[Gruel]`

Load scraper classes that inherit from `Gruel`.

NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

The method takes no arguments; the scan is driven by the attributes set in `__init__`: `scan_path` is searched (recursively when `recursive` is `True`) for files matching `file_include_patterns` but not `file_exclude_patterns`, and every class whose name is in `subgruel_classes` and that has `Gruel` somewhere in its inheritance hierarchy is returned.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers")
>>> scrapers = brewer.load_scrapers()
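
The include/exclude filtering is delegated to `younotyou`; a small sketch of that step in isolation, using the same keyword call shape as the source (file names are hypothetical):

```python
from younotyou import younotyou

files = [
    "scrapers/venue_scraper.py",
    "scrapers/venue_template.py",
    "scrapers/giggruel_base.py",
]
# Same call shape as in `load_scrapers()`: drop anything matching an exclude pattern.
kept = younotyou(files, exclude_patterns=["*template*", "*giggruel*"])
print(kept)  # expected to keep only 'scrapers/venue_scraper.py'
```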
### Brewer.pop_modules

`def pop_modules(self)`

Unload the modules imported by `load_scrapers()` from `sys.modules`.
### Brewer.get_bases

`def get_bases(self, object: Any) -> list[Any]`

Returns a recursive list of all the classes `object` inherits from.
### Brewer.is_subgruel

`def is_subgruel(self, object: Any) -> bool`

Returns whether `object` inherits from `Gruel` somewhere in its ancestry.
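
A quick illustration of both helpers (the scraper class is hypothetical):

```python
from gruel import Gruel
from gruel.brewer import Brewer


class VenueScraper(Gruel):  # hypothetical scraper
    ...


brewer = Brewer(["VenueScraper"])
# `get_bases()` walks `__bases__` recursively, so `Gruel` and `object` both appear.
print(Gruel in brewer.get_bases(VenueScraper))  # True
print(brewer.is_subgruel(VenueScraper))  # True
print(brewer.is_subgruel(Gruel))  # False: `Gruel` is not among its own bases
```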
### Brewer.prescrape_chores

`def prescrape_chores(self)`

Override to add any tasks to be done before running the scrapers.

### Brewer.postscrape_chores

`def postscrape_chores(self)`

Override to add any tasks to be done after running the scrapers. The base implementation unloads the scraper modules via `pop_modules()`.
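
A sketch of a subclass hooking both chores (the chore bodies are hypothetical); note that an override of `postscrape_chores()` should call `super().postscrape_chores()` to keep the module unloading:

```python
from gruel.brewer import Brewer


class GigBrewer(Brewer):
    def prescrape_chores(self):
        # Hypothetical setup step before any scraper runs.
        self.logprint("Backing up the database")

    def postscrape_chores(self):
        super().postscrape_chores()  # keeps the `pop_modules()` behavior
        self.logprint("Pruning stale events")
```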
### Brewer.scrape

`def scrape(self, scrapers: list[Gruel])`

Run the `scrape()` method for each scraper in `scrapers`.

Execution is multithreaded.
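
`brew()` normally drives this, but the steps can also be run by hand (class name and path are hypothetical):

```python
from gruel.brewer import Brewer

brewer = Brewer(["VenueScraper"], scan_path="scrapers")
scrapers = brewer.load_scrapers()
brewer.scrape(scrapers)  # instantiates each class; `scrape()` runs in a thread pool
brewer.pop_modules()  # cleanup normally handled by `postscrape_chores()`
```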
### Brewer.logprint

`def logprint(self, message: str)`

Log and print `message`.
### Brewer.brew

`def brew(self)`

Execute pipeline:

1. `self.prescrape_chores()`
2. `self.load_scrapers()`
3. `self.scrape()`
4. `self.postscrape_chores()`
### get_args

`def get_args() -> argparse.Namespace`

Parse the command line arguments for running `Brewer` as a script: positional `subgruel_classes`, plus `-e`/`--excludes`, `-i`/`--includes` (defaults to `"*.py"`), `-p`/`--path` (defaults to the current working directory), and `-r`/`--recursive`.
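
`main()` (shown in the module source above) builds a `Brewer` from these arguments, so running `python brewer.py VenueScraper -e "*template*" -p scrapers -r` from a shell is equivalent to this programmatic call (class name and path are hypothetical):

```python
import argparse

from pathier import Pathier

from gruel.brewer import main

args = argparse.Namespace(
    subgruel_classes=["VenueScraper"],
    excludes=["*template*"],
    includes=["*.py"],
    path=Pathier("scrapers"),
    recursive=True,
)
main(args)
```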