Source code for scrapple.utils.config
"""
scrapple.utils.config
~~~~~~~~~~~~~~~~~~~~~
Functions related to traversing the configuration file
"""
from __future__ import print_function
from colorama import init, Fore, Back
init()
def traverse_next(page, next, results):
    """
    Recursive generator to traverse through the next attribute and \
    crawl through the links to be followed.

    :param page: The current page being parsed
    :param next: The next attribute of the current scraping dict
    :param results: The current extracted content, stored in a dict
    :return: The extracted content, through a generator

    """
    for link in page.extract_links(next['follow_link']):
        print(Back.YELLOW + Fore.BLUE + "Loading page ", link.url + Back.RESET + Fore.RESET)
        # Work on a copy of the results gathered so far, so that sibling links
        # do not overwrite each other's extracted content
        r = results.copy()
        for attribute in next['scraping'].get('data'):
            if attribute['field'] != "":
                print("\nExtracting", attribute['field'], "attribute", sep=' ')
                r[attribute['field']] = link.extract_content(attribute['selector'], attribute['attr'], attribute['default'])
        if not next['scraping'].get('next'):
            # Leaf level of the crawl: yield the accumulated record
            yield r
        else:
            # Recurse into each nested 'next' specification
            for next2 in next['scraping'].get('next'):
                for result in traverse_next(link, next2, r):
                    yield result
def get_fields(config):
    """
    Recursive generator that yields the field names in the config file

    :param config: The configuration file that contains the specification of the extractor
    :return: The field names in the config file, through a generator

    """
    for data in config['scraping']['data']:
        if data['field'] != '':
            yield data['field']
    if 'next' in config['scraping']:
        # Recurse into nested 'next' levels to collect their field names too
        for n in config['scraping']['next']:
            for f in get_fields(n):
                yield f
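
Similarly, get_fields can be exercised with a small hand-written configuration dict (hypothetical, shown only to illustrate how empty field names are skipped and nested 'next' levels are traversed):

    from scrapple.utils.config import get_fields

    config = {
        'scraping': {
            'data': [
                {'field': 'title'},
                {'field': ''}          # skipped: empty field names are ignored
            ],
            'next': [
                {
                    'follow_link': '//a/@href',
                    'scraping': {
                        'data': [{'field': 'author'}]
                    }
                }
            ]
        }
    }

    print(list(get_fields(config)))    # ['title', 'author']
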