gruel.brewer
```python
import argparse
import importlib
import importlib.machinery
import importlib.util
import inspect
import logging
import sys
from typing import Any

import quickpool
from pathier import Pathier, Pathish
from younotyou import younotyou

from gruel import Gruel


class Brewer:
    def __init__(
        self,
        subgruel_classes: list[str],
        file_exclude_patterns: list[str] = [],
        scan_path: Pathish = Pathier.cwd(),
        file_include_patterns: list[str] = ["*.py"],
        recursive: bool = True,
    ):
        """Run `Gruel` scrapers.

        #### :params:

        `subgruel_classes`: A list of class names for scrapers that should be loaded.
        In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

        `file_exclude_patterns`: Files that match these patterns will not be scanned.

        `scan_path`: The path to scan for scraper classes.

        `file_include_patterns`: Files that match these patterns will be scanned.

        `recursive`: Whether the scan should be recursive or not.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
        >>> brewer.brew()"""
        self._init_logger()
        self.subgruel_classes = subgruel_classes
        self.file_exclude_patterns = file_exclude_patterns
        self.file_include_patterns = file_include_patterns
        self.scan_path = Pathier(scan_path)
        self.recursive = recursive

    def _init_logger(self):
        # When Brewer is subclassed, use that file's stem instead of `brewer`
        source_file = inspect.getsourcefile(type(self))
        if source_file:
            log_name = Pathier(source_file).stem
        else:
            log_name = Pathier(__file__).stem
        self.logger = logging.getLogger(log_name)
        if not self.logger.hasHandlers():
            handler = logging.FileHandler(log_name + ".log")
            handler.setFormatter(
                logging.Formatter(
                    "{levelname}|-|{asctime}|-|{message}",
                    style="{",
                    datefmt="%m/%d/%Y %I:%M:%S %p",
                )
            )
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    def load_scrapers(self) -> list[Gruel]:
        """Load scraper classes that inherit from `Gruel`.

        NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

        The method takes no arguments; the scan is driven by the `scan_path`, `subgruel_classes`,
        `file_include_patterns`, `file_exclude_patterns`, and `recursive` attributes set in `__init__`.

        >>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers")
        >>> brewer.load_scrapers()"""
        globber = self.scan_path.glob
        if self.recursive:
            globber = self.scan_path.rglob
        files = [
            str(file)
            for pattern in self.file_include_patterns
            for file in globber(pattern)
        ]
        files = younotyou(files, exclude_patterns=self.file_exclude_patterns)
        modules = []
        self._module_names = []
        for file in files:
            module_name = Pathier(file).stem
            try:
                module = importlib.machinery.SourceFileLoader(
                    module_name, file
                ).load_module()
            except Exception as e:
                self.logger.exception(
                    f"Failed to load module '{module_name}' from '{file}'."
                )
            else:
                self._module_names.append(module_name)
                modules.append(module)
        gruels = [
            getattr(module, class_)
            for module in modules
            for class_ in self.subgruel_classes
            if class_ in dir(module) and self.is_subgruel(getattr(module, class_))
        ]
        self.logger.info(
            "\n".join(
                [f"Imported {len(gruels)} scrapers: "]
                + [str(gruel) for gruel in gruels]
            )
        )
        return gruels

    def pop_modules(self):
        """Unload modules."""
        for module in self._module_names:
            sys.modules.pop(module)
        self._module_names = []

    def get_bases(self, object: Any) -> list[Any]:
        """Returns a recursive list of all the classes `object` inherits from."""
        parents = []
        bases = object.__bases__
        if not bases:
            return parents
        for base in bases:
            parents.append(base)
            parents.extend(self.get_bases(base))
        return parents

    def is_subgruel(self, object: Any) -> bool:
        """Returns whether `object` inherits from `Gruel` somewhere in its ancestry."""
        if not inspect.isclass(object) or Gruel not in self.get_bases(object):
            return False
        return True

    def prescrape_chores(self):
        """Override to add any tasks to be done before running the scrapers."""
        ...

    def postscrape_chores(self):
        """Override to add any tasks to be done after running the scrapers."""
        self.pop_modules()

    def scrape(self, scrapers: list[Gruel]):
        """Run the `scrape()` method for each scraper in `scrapers`.

        Execution is multithreaded."""
        pool = quickpool.ThreadPool([scraper().scrape for scraper in scrapers])  # type: ignore
        pool.execute()

    def logprint(self, message: str):
        """Log and print `message`."""
        self.logger.info(message)
        print(message)

    def brew(self):
        """Execute pipeline.

        1. self.prescrape_chores()
        2. self.load_scrapers()
        3. self.scrape()
        4. self.postscrape_chores()"""

        try:
            self.logprint("Beginning brew")
            # 1--------------------------------------------
            self.logprint("Executing prescrape chores")
            self.prescrape_chores()
            # 2--------------------------------------------
            self.logprint("Loading scrapers")
            scrapers = self.load_scrapers()
            print(f"Loaded {len(scrapers)} scrapers")
            # 3--------------------------------------------
            self.logprint("Starting scrape")
            self.scrape(scrapers)
            self.logprint("Scrape complete")
            # 4--------------------------------------------
            self.logprint("Executing postscrape chores")
            self.postscrape_chores()
            self.logprint("Brew complete")
        except Exception as e:
            print(e)
            self.logger.exception("Exception occurred during brew():")


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "subgruel_classes",
        type=str,
        nargs="*",
        help=""" A list of Gruel scraper class names to find and import. """,
    )
    parser.add_argument(
        "-e",
        "--excludes",
        type=str,
        nargs="*",
        default=[],
        help=""" A list of glob style file patterns to exclude from the scan. """,
    )
    parser.add_argument(
        "-i",
        "--includes",
        type=str,
        nargs="*",
        default=["*.py"],
        help=""" A list of glob style file patterns to include in the scan. Defaults to "*.py". """,
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default=Pathier.cwd(),
        help=""" The directory path to scan. Defaults to the current working directory. """,
    )
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help=""" Whether -p/--path should be scanned recursively or not. """,
    )
    args = parser.parse_args()
    args.path = Pathier(args.path)

    return args


def main(args: argparse.Namespace | None = None):
    if not args:
        args = get_args()
    brewer = Brewer(
        args.subgruel_classes, args.excludes, args.path, args.includes, args.recursive
    )
    brewer.brew()


if __name__ == "__main__":
    main(get_args())
```
`class Brewer`
`Brewer(subgruel_classes: list[str], file_exclude_patterns: list[str] = [], scan_path: Pathish = Pathier.cwd(), file_include_patterns: list[str] = ["*.py"], recursive: bool = True)`
Run `Gruel` scrapers.

#### :params:

`subgruel_classes`: A list of class names for scrapers that should be loaded.
In order to be loaded, a scraper class must have a name in this list and have `Gruel` somewhere in its inheritance hierarchy.

`file_exclude_patterns`: Files that match these patterns will not be scanned.

`scan_path`: The path to scan for scraper classes.

`file_include_patterns`: Files that match these patterns will be scanned.

`recursive`: Whether the scan should be recursive or not.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "scrapers")
>>> brewer.brew()
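For reference, a minimal scraper file that `Brewer` would discover might look like the following (the file and class names are hypothetical, and the real `Gruel` base class may require additional overrides):

```python
# scrapers/venue_scraper.py -- hypothetical example file
from gruel import Gruel


class VenueScraper(Gruel):
    """Discovered because its name is in `subgruel_classes` and `Gruel`
    appears in its inheritance hierarchy."""

    def scrape(self):
        # Fetch and parse pages here; Brewer calls this from its thread pool.
        ...
```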
`load_scrapers() -> list[Gruel]`
Load scraper classes that inherit from `Gruel`.

NOTE: Classes are loaded, but scraper objects are not instantiated until the `scrape()` method is called.

The method takes no arguments; the scan is driven by the `scan_path`, `subgruel_classes`, `file_include_patterns`, `file_exclude_patterns`, and `recursive` attributes set in `__init__`.

>>> brewer = Brewer(["VenueScraper"], ["*template*", "*giggruel*"], "getToTheGig/scrapers")
>>> brewer.load_scrapers()
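The loading step relies on `SourceFileLoader.load_module()`, which Python has deprecated in favor of spec-based loading. A rough spec-based equivalent (a sketch, not the library's actual code; `load_module_from_file` is a name invented here):

```python
import importlib.util
import sys


def load_module_from_file(module_name: str, file: str):
    """Load `file` as a module named `module_name` without the deprecated
    `load_module()` call."""
    spec = importlib.util.spec_from_file_location(module_name, file)
    if spec is None or spec.loader is None:
        raise ImportError(f"Could not create a spec for '{file}'.")
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module  # register so it can be unloaded later
    spec.loader.exec_module(module)
    return module
```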
`pop_modules()`
Unload the modules previously loaded by `load_scrapers()` from `sys.modules`.
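Note that `sys.modules.pop(name)` raises `KeyError` if a name is no longer registered; a defensive variant (an illustrative sketch, not the shipped behavior) supplies a default:

```python
import sys


def pop_modules(module_names: list[str]) -> None:
    # The `None` default makes removal idempotent instead of raising KeyError.
    for name in module_names:
        sys.modules.pop(name, None)
```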
`get_bases(object: Any) -> list[Any]`
Returns a recursive list of all the classes `object` inherits from.
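For linear (single-inheritance) hierarchies this walk yields the same classes as the method resolution order with the class itself dropped; diamond inheritance can produce duplicates, which the MRO would not. An illustrative comparison using made-up classes:

```python
import inspect


class A: ...
class B(A): ...
class C(B): ...

# get_bases(C) collects B, A, and object by walking `__bases__` recursively;
# for a linear hierarchy that matches the MRO minus the class itself:
assert list(inspect.getmro(C)[1:]) == [B, A, object]
```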
`is_subgruel(object: Any) -> bool`
Returns whether `object` inherits from `Gruel` somewhere in its ancestry.
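Ignoring ABC virtual subclasses, this amounts to the built-in subclass check with `Gruel` itself excluded (a class never appears in its own `__bases__` chain); a rough equivalent:

```python
import inspect

from gruel import Gruel


def is_subgruel(obj) -> bool:
    # Must be a class and a strict subclass of Gruel; `issubclass(Gruel, Gruel)`
    # is True, so Gruel itself has to be ruled out explicitly.
    return inspect.isclass(obj) and issubclass(obj, Gruel) and obj is not Gruel
```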
`prescrape_chores()`
Override to add any tasks to be done before running the scrapers.
`postscrape_chores()`
Override to add any tasks to be done after running the scrapers.
`scrape(scrapers: list[Gruel])`
Run the `scrape()` method for each scraper in `scrapers`.

Execution is multithreaded.
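For readers unfamiliar with `quickpool`, the behavior is roughly that of the standard library's thread pool; a sketch of the equivalent (not `quickpool`'s actual implementation):

```python
from concurrent.futures import ThreadPoolExecutor


def scrape(scrapers):
    # Instantiate each scraper and run its `scrape()` method across threads.
    with ThreadPoolExecutor() as pool:
        futures = [pool.submit(scraper().scrape) for scraper in scrapers]
        for future in futures:
            future.result()  # re-raises any exception from a worker thread
```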
`logprint(message: str)`
Log and print `message`.
`brew()`
Execute pipeline:

1. `self.prescrape_chores()`
2. `self.load_scrapers()`
3. `self.scrape()`
4. `self.postscrape_chores()`
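Putting the hooks together, a customized run might look like the following hypothetical subclass (the chore bodies are placeholders):

```python
from gruel.brewer import Brewer


class MyBrewer(Brewer):
    def prescrape_chores(self):
        self.logprint("Setting up")  # e.g. prepare output folders

    def postscrape_chores(self):
        self.logprint("Cleaning up")
        super().postscrape_chores()  # keep the default module unloading


MyBrewer(["VenueScraper"], scan_path="scrapers").brew()
```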
`get_args() -> argparse.Namespace`
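Since the module calls `main()` when executed directly, the same pipeline can be driven from the command line, e.g. `python -m gruel.brewer VenueScraper -p scrapers -r -e "*template*"` (assuming the `gruel` package is installed) to recursively scan `scrapers/` for a `VenueScraper` class while skipping template files.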