Source code for scrapple.commands.run

"""
scrapple.commands.run
~~~~~~~~~~~~~~~~~~~~~

"""

from __future__ import print_function
import os
from colorama import init, Fore, Back

from scrapple.commands import command
from scrapple.selectors import xpath, css
from scrapple.utils.config import traverse_next, extract_fieldnames

class RunCommand(command.Command):
    """
    Defines the execution of :ref:`run <command-run>`
    """

    def __init__(self, args):
        super(RunCommand, self).__init__(args)
        init()
    def execute_command(self):
        """
        The run command implements the web content extractor corresponding to the given \
        configuration file.

        The execute_command() method validates the input project name and opens the JSON \
        configuration file. The run() method handles the execution of the extractor run.

        The extractor implementation follows these primary steps:

        1. Select the appropriate :ref:`selector class <implementation-selectors>` through \
        a dynamic dispatch, with the selector_type argument from the CLI input.

        #. Iterate through the data section in level-0 of the configuration file. \
        On each data item, call the extract_content() method from the selector class to \
        extract the content according to the specified extractor rule.

        #. If there are multiple levels of the extractor, i.e., if there is a 'next' \
        attribute in the configuration file, call the traverse_next() \
        :ref:`utility function <implementation-utils>` and parse through successive levels \
        of the configuration file.

        #. According to the --output_type argument, save the result data in a JSON \
        document or a CSV document.

        """
        print(Back.GREEN + Fore.BLACK + "Scrapple Run")
        print(Back.RESET + Fore.RESET)
        try:
            import json
            with open(self.args['<projectname>'] + '.json', 'r') as f:
                self.config = json.load(f)
            self.run()
        except IOError:
            print(Back.WHITE + Fore.RED + self.args['<projectname>'], ".json does not ",
                  "exist. Use ``scrapple genconfig``." + Back.RESET + Fore.RESET, sep="")
    def run(self):
        # Dynamic dispatch: pick the XpathSelector or CssSelector class based on
        # the selector_type specified in the configuration file
        selectorClass = getattr(
            eval(self.config['selector_type']),
            self.config['selector_type'].title() + 'Selector'
        )
        results = dict()
        results['project'] = self.args['<projectname>']
        results['data'] = list()
        try:
            result = dict()
            print()
            print(Back.YELLOW + Fore.BLUE + "Loading page ",
                  self.config['scraping']['url'] + Back.RESET + Fore.RESET)
            selector = selectorClass(self.config['scraping']['url'])
            # Level-0 extraction: apply each extractor rule on the loaded page
            for attribute in self.config['scraping']['data']:
                if attribute['field'] != "":
                    print("\nExtracting", attribute['field'], "attribute", sep=' ')
                    result[attribute['field']] = selector.extract_content(
                        attribute['selector'], attribute['attr'], attribute['default'])
            if not self.config['scraping'].get('next'):
                results['data'].append(result)
            else:
                # Deeper levels: follow each 'next' section through traverse_next()
                for next_level in self.config['scraping']['next']:
                    for r in traverse_next(selector, next_level, result):
                        results['data'].append(r)
        except KeyboardInterrupt:
            pass
        except Exception as e:
            print(e)
        finally:
            # Write out whatever was collected, even after an interrupt or error
            if self.args['--output_type'] == 'json':
                import json
                with open(os.path.join(os.getcwd(),
                          self.args['<output_filename>'] + '.json'), 'w') as f:
                    json.dump(results, f)
            elif self.args['--output_type'] == 'csv':
                import csv
                with open(os.path.join(os.getcwd(),
                          self.args['<output_filename>'] + '.csv'), 'w') as f:
                    fields = extract_fieldnames(self.config)
                    writer = csv.DictWriter(f, fieldnames=fields)
                    writer.writeheader()
                    writer.writerows(results['data'])
            print()
            print(Back.WHITE + Fore.RED + self.args['<output_filename>'],
                  ".", self.args['--output_type'], " has been created"
                  + Back.RESET + Fore.RESET, sep="")
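
As a usage reference, the command can also be driven outside the CLI. The following is a
minimal sketch under stated assumptions: the target URL, the 'title' field, the //title
XPath rule and the 'text' attribute value are illustrative and not part of this module;
only the args keys (<projectname>, <output_filename>, --output_type) and the configuration
keys (selector_type, scraping.url, scraping.data, scraping.next) are taken from the source
above.

    # A minimal sketch of driving RunCommand programmatically. The URL, field
    # name, XPath rule and 'text' attribute below are illustrative assumptions.
    import json

    from scrapple.commands.run import RunCommand

    config = {
        'selector_type': 'xpath',          # dispatched to XpathSelector by run()
        'scraping': {
            'url': 'http://example.com',   # hypothetical target page
            'data': [
                {
                    'field': 'title',      # key in the output document
                    'selector': '//title', # extractor rule for this field
                    'attr': 'text',        # assumed: extract the text content
                    'default': '',         # fallback when nothing matches
                },
            ],
            # 'next': [...]                # optional deeper levels, see traverse_next()
        },
    }

    with open('example.json', 'w') as f:   # execute_command() opens <projectname>.json
        json.dump(config, f)

    args = {
        '<projectname>': 'example',
        '<output_filename>': 'example_output',
        '--output_type': 'json',
    }
    RunCommand(args).execute_command()     # writes example_output.json in os.getcwd()

Passing '--output_type': 'csv' would instead write example_output.csv, with the header row
built from extract_fieldnames(self.config).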