Source code for orangecontrib.text.guardian

""" This module fetches data from The Guardian API.

To use it, first create a :class:`TheGuardianCredentials` instance:

    >>> from orangecontrib.text.guardian import TheGuardianCredentials
    >>> credentials = TheGuardianCredentials('<your-api-key>')

Then create a :class:`TheGuardianAPI` object and use it for searching:

    >>> from orangecontrib.text.guardian import TheGuardianAPI
    >>> api = TheGuardianAPI(credentials)
    >>> corpus = api.search('Slovenia', max_documents=10)
    >>> len(corpus)
    10

"""
import math
import json
import os
from functools import partial

import requests
from Orange.data import (
    StringVariable,
    DiscreteVariable,
    ContinuousVariable,
    TimeVariable,
)
from dateutil.parser import isoparse

from orangecontrib.text.util import create_corpus

# Search endpoint of The Guardian content API. HTTPS is used so that the
# API key, which is sent as a query parameter, is not transmitted in clear text.
BASE_URL = 'https://content.guardianapis.com/search'
# Number of articles assumed per result page; used for page-count and
# progress arithmetic. NOTE(review): no page size is sent with the request,
# so this presumably has to match the API's default page size - confirm.
ARTICLES_PER_PAGE = 10


class APILimitError(Exception):
    """Raised when The Guardian API answers with HTTP 429 (limit exceeded)."""


class TheGuardianCredentials:
    """ The Guardian API credentials. """

    def __init__(self, key):
        """
        Args:
            key (str): The Guardian API key. Use `test` for testing purposes.
        """
        self.key = key

    @property
    def valid(self):
        """ Check if given API key is valid.

        Performs a GET request against ``BASE_URL`` with the key as the
        ``api-key`` parameter.

        Returns:
            bool: True if the response status code is 200, False otherwise.
        """
        response = requests.get(BASE_URL, params={'api-key': self.key})
        return response.status_code == 200

    def __eq__(self, other):
        """ Two credentials are equal when they hold the same key.

        Returns ``NotImplemented`` for objects of other types, so that
        comparisons such as ``credentials == None`` evaluate to False
        instead of raising ``AttributeError``.
        """
        if not isinstance(other, TheGuardianCredentials):
            return NotImplemented
        return self.key == other.key
class TheGuardianAPI:
    """ Search The Guardian API and turn the returned articles into a corpus.

    ``class_vars`` and ``metas`` are lists of ``(variable factory, extractor)``
    pairs: the factory is a partially applied Orange variable constructor and
    the extractor pulls the matching value out of one article dictionary of
    the API response. Both lists are handed to ``create_corpus`` unchanged.
    """

    class_vars = [
        (partial(DiscreteVariable, "Section"), lambda doc: doc["sectionName"]),
    ]

    metas = [
        (partial(StringVariable, "Headline"), lambda doc: doc["fields"]["headline"]),
        (partial(StringVariable, "Content"), lambda doc: doc["fields"]["bodyText"]),
        (partial(StringVariable, "Trail Text"), lambda doc: doc["fields"]["trailText"]),
        (partial(StringVariable, "HTML"), lambda doc: doc["fields"]["body"]),
        (
            partial(TimeVariable, "Publication Date", have_time=1, have_date=1),
            lambda doc: isoparse(doc["webPublicationDate"]).timestamp(),
        ),
        (partial(DiscreteVariable, "Type"), lambda doc: doc["type"]),
        (partial(DiscreteVariable, "Language"), lambda doc: doc["fields"]["lang"]),
        (
            partial(StringVariable, "Tags"),
            lambda doc: ", ".join(tag["webTitle"] for tag in doc["tags"]),
        ),
        (partial(StringVariable, "URL"), lambda doc: doc["webUrl"]),
        (
            partial(ContinuousVariable, "Word Count", number_of_decimals=0),
            lambda doc: doc["fields"]["wordcount"],
        ),
    ]

    text_features = ["Headline", "Content"]
    # `search` passes `self.title_indices` to `create_corpus`, so this has to
    # be a real attribute rather than a comment.
    title_indices = [-1]  # Headline

    def __init__(self, credentials, on_progress=None, should_break=None):
        """
        Args:
            credentials (:class:`TheGuardianCredentials`): The Guardian
                Credentials.
            on_progress (callable): Function for progress reporting. It is
                called with two positional arguments (documents processed so
                far, expected total).
            should_break (callable): Function for early stopping. It is called
                without arguments before every additional page request; a
                truthy return value stops the retrieval.
        """
        self.per_page = ARTICLES_PER_PAGE
        self.pages = 0
        self.credentials = credentials
        self.on_progress = on_progress or (lambda x, y: None)
        self.should_break = should_break or (lambda: False)

        self.results = []

    def _search(self, query, from_date, to_date, page=1):
        """ Fetch one page of results and append its articles to ``self.results``.

        When the first page is fetched, the total number of pages reported by
        the API is stored in ``self.pages``.

        Raises:
            APILimitError: If the API responds with status code 429.
        """
        data = self._build_query(query, from_date, to_date, page)

        response = requests.get(BASE_URL, data)
        response.encoding = "UTF-8"
        if response.status_code == 429:
            raise APILimitError("API limit exceeded")
        parsed = json.loads(response.text)

        if page == 1:  # store number of pages
            self.pages = parsed['response']['pages']

        self.results.extend(parsed['response']['results'])

    def _build_query(self, query, from_date=None, to_date=None, page=1):
        """ Build the request parameters for one page of a search.

        The date bounds are included only when they are not None.

        Returns:
            dict: Query parameters for the search request.
        """
        data = {
            'q': query,
            'api-key': self.credentials.key,
            'page': str(page),
            'show-fields': 'headline,trailText,body,bodyText,lang,wordcount',
            'show-tags': 'all',
        }
        if from_date is not None:
            data['from-date'] = from_date
        if to_date is not None:
            data['to-date'] = to_date

        return data

    def search(self, query, from_date=None, to_date=None, max_documents=None,
               accumulate=False):
        """
        Search The Guardian API for articles.

        Args:
            query (str): A query for searching the articles by
            from_date (str): Search only articles newer than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            to_date (str): Search only articles older than the date provided.
                Date should be in ISO format; e.g. '2016-12-31'.
            max_documents (int): Maximum number of documents to retrieve.
                When not given, retrieve all documents.
            accumulate (bool): A flag indicating whether to accumulate results
                of multiple consecutive search calls.

        Returns:
            :ref:`Corpus`

        Raises:
            APILimitError: If the API responds with status code 429.
        """
        if not accumulate:
            self.results = []
        # Index of the first article contributed by this call; earlier
        # (accumulated) results must not count against `max_documents`.
        first_new = len(self.results)

        self._search(query, from_date, to_date)

        if max_documents:
            wanted = math.ceil(max_documents / self.per_page)
            # Never ask for pages the API did not report. At least one page
            # (the one already fetched) is kept so the progress total stays
            # positive even when the query has no results.
            pages = min(wanted, max(self.pages, 1))
        else:
            pages = self.pages
        self.on_progress(self.per_page, pages * self.per_page)

        for p in range(2, pages + 1):  # to one based
            if self.should_break():
                break
            self._search(query, from_date, to_date, p)
            self.on_progress(p * self.per_page, pages * self.per_page)

        if max_documents:
            # Whole pages are fetched; drop the surplus of the last page so
            # that at most `max_documents` new articles are returned.
            del self.results[first_new + max_documents:]

        return create_corpus(
            self.results,
            [],
            self.class_vars,
            self.metas,
            self.title_indices,
            self.text_features,
            "The Guardian",
            "Language",
        )
if __name__ == '__main__':
    # Manual smoke test: read the API key from the environment (falling back
    # to 'test'), report whether it is valid, then run a small search.
    api_key = os.getenv('THE_GUARDIAN_API_KEY', 'test')
    creds = TheGuardianCredentials(api_key)
    print(creds.valid)

    client = TheGuardianAPI(credentials=creds)
    corpus = client.search('refugees', max_documents=10)
    print(corpus)