# Source code for goldendoodle.pipelines

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# ItemAdapter (imported below) is useful for handling different item types with a single interface

# Typical uses of item pipelines are:
#     cleansing HTML data
#     validating scraped data (checking that the items contain certain fields)
#     checking for duplicates (and dropping them)
#     storing the scraped item in a database


from datetime import datetime

from pickle import NONE

import regex
from genericpath import exists
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

from goldendoodle.spiders import items
from goldendoodle.spiders.utils.gldnddl_utils import Gldnddl
import logging

logger = logging.getLogger(__name__)

class GoldendoodlePipeline:
    """Item pipeline that keeps items carrying a finding and drops the rest."""

    # Item fields inspected by process_item; the literal string 'None' in a
    # field is treated as "no finding".
    _FINDING_FIELDS = (
        'regex_finding',
        'webdriver_finding',
        'cleared_webdriver_finding',
    )

    def process_item(self, item, spider):
        """The final touches are made here.

        The item is kept when at least one of ``regex_finding``,
        ``webdriver_finding`` or ``cleared_webdriver_finding`` differs from
        the string ``'None'``; otherwise it is dropped.

        Args:
            item (Any): the harvest
            spider (Any): the spider (not used by this method)

        Raises:
            DropItem: When the harvest is poor.

        Returns:
            Any: the harvest item

        Example of option=email without gldnddlSearchString:
        ====================================================

        >>> import os
        >>> file = 'reports/findings_50.json'
        >>> if os.path.exists(file):
        ...     os.remove(file)
        >>> url = 'https://model-enact-analyze-manage.de/souverain/index.php/2022/10/29/bang-now-goldendoodle-supports-regular-expressions/'
        >>> os.system(f'''scrapy crawl recherchedechaîne_gldnddl -O {file} -a option=email -a start_urls={url}''')
        0
        >>> import json
        >>> with open(file) as findingsreport:
        ...     for finding in json.load(findingsreport):
        ...         if finding["currentURL"] == url:
        ...             print(str(str(finding['regex_finding']).split(',')[2]).split("'")[1])
        ...             break
        Golden[dot]Doodle[at]chien[dot]fr
        """
        adapter = ItemAdapter(item)

        # Read the URL through the adapter (not item[...]) so that logging
        # works for every item type ItemAdapter supports and a missing
        # 'currentURL' does not abort processing with a KeyError.
        current_url = adapter.get('currentURL')
        findings = [adapter.get(field) for field in self._FINDING_FIELDS]

        # FUTUREFEATURE remove duplicate records (https://doc.scrapy.org/en/latest/topics/item-pipeline.html#duplicates-filter)

        # NOTE(review): the comparison is against the *string* 'None', so a
        # field that is absent (adapter.get() returns the None object) counts
        # as a finding and the item is kept -- confirm this matches what the
        # spider stores.
        if not any(finding != 'None' for finding in findings):
            logger.info(
                "%s does not include search string:%s:",
                current_url,
                Gldnddl.search_expression,
            )
            raise DropItem(str(datetime.now()) + "I gldnddl ")

        # Lazy %-style arguments: the message is only built when DEBUG is on.
        logger.debug(
            "item['currentURL']:%s:"
            "adapter.get('regex_finding'):%s:"
            "adapter.get('webdriver_finding'):%s:"
            "adapter.get('cleared_webdriver_finding'):%s:",
            current_url,
            *findings,
        )
        return item