# -*- coding: utf-8 -*-
"""
"""
import asyncio
import json
import logging
from datetime import datetime, timedelta
from itertools import chain
from pathlib import Path
from .api import CityIq
from .exceptions import CityIqError
logger = logging.getLogger(__name__)
class EventScraper(object):
event_locations_dir = 'event-locations'
def __init__(self, config, start_time, event_types):
self.config = config
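        # Snap the start time to the top of the hour (microseconds are left unchanged)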
self.start_time = start_time.replace(minute=0, second=0)
self.event_types = event_types
self.cache = Path(self.config.events_cache)
if not self.cache.exists() or not self.cache.is_dir():
            raise CityIqError("The cache dir ('{}') must exist and be a directory".format(self.cache))
    def get_type_events(self, start_time, span, event_type):
"""
Get the events of one type
:param start_time:
:param span: time span in seconds
:param event_type:
:param tz_name:
:return:
"""
assert span <= 15 * 60, span
c = CityIq(self.config)
r = list(c.events(start_time=int(start_time), span=span, event_type=event_type))
return r
async def _get_events(self, start_time, span):
"""
Async get of all types of events.
"""
ts = start_time
max_span = 15 * 60
q, rem = divmod(span, max_span)
spans = [max_span] * q
if rem:
spans += [rem]
loop = asyncio.get_event_loop()
futures = []
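        # Schedule one blocking request per (time chunk, event type) pair in the
        # default executor; asyncio.gather returns the results in submission order.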
for span in spans:
d = timedelta(seconds=span)
for event_type in self.event_types:
futures.append(loop.run_in_executor(None, self.get_type_events, ts.timestamp(), span, event_type))
ts = ts + d
group = asyncio.gather(*futures)
return await group
    def get_events(self, start_time, span):
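        """Fetch events of all configured types over the span and return them as one flat list."""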
loop = asyncio.get_event_loop()
results = loop.run_until_complete(self._get_events(start_time, span))
return list(chain(*results))
    def try_get_events(self, start_time, span):
"""Run get_events and try to be resilient to some errors"""
from requests.exceptions import HTTPError
import requests
from time import sleep
sleep_time = 20
last_e = None
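        # Up to four attempts; on a 503 the sleep doubles after each failure,
        # starting at 20 seconds. Other HTTP errors are re-raised immediately.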
for i in range(4):
try:
return self.get_events(start_time, span)
except HTTPError as e:
if e.response.status_code == requests.codes.SERVICE_UNAVAILABLE: # 503
logger.debug(f"ERROR {e}: will try again after {sleep_time} seconds")
sleep(sleep_time)
sleep_time *= 2
last_e = e
continue
else:
raise
else:
raise last_e
def _make_filename(self, st):
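        # e.g. '2019-06-01T09_PKIN_PKOUT.json' (illustrative): the span's date and
        # hour followed by the sorted event types.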
fn_base, _ = str(st.replace(tzinfo=None).isoformat()).split(':', 1)
fn = f'{fn_base}_{"_".join(sorted(self.event_types))}.json'
return self.cache.joinpath(fn)
    def yield_file_names(self, start=None, end=None):
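        """Yield (hour_start, cache_file_path, exists) tuples for every hour from start to end."""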
d = timedelta(hours=1)
if start is None:
start = self.start_time
if end is None:
end = datetime.now().astimezone(self.start_time.tzinfo)
while start < end:
fn_path = self._make_filename(start)
yield start, fn_path, fn_path.exists()
start += d
    def yield_months(self):
        '''Yield (month_start, records) pairs, where records is the list produced by
        yield_file_names() for that month, from start_time through the current month.'''
from dateutil.relativedelta import relativedelta
this_month = self.start_time.replace(day=1)
while this_month < datetime.now().astimezone(self.start_time.tzinfo):
next_month = this_month + relativedelta(months=1)
yield this_month, list(self.yield_file_names(this_month, next_month))
this_month = next_month
    def scrape_events(self):
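        """Fetch events for every hour that is not already cached and write each hour to a JSON file."""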
logger.debug("scrape: Starting at {} for events {}".format(self.start_time, self.event_types))
for st, fn_path, exists in self.yield_file_names():
if not exists:
logger.debug(f"{fn_path}: fetching")
r = self.try_get_events(st, 1 * 60 * 60)
with fn_path.open('w') as f:
json.dump(r, f)
logger.debug(f"{fn_path}: wrote")
else:
logger.debug(f"{fn_path}: exists")
    def iterate_records(self, records=None):
"""For a set of file name records ( from yield_file_names), yield event objects """
if records is None:
records = self.yield_file_names()
for st, fn_path, exists in records:
try:
if exists:
with fn_path.open() as f:
o = json.load(f)
for e in o:
yield e
except Exception as e:
raise CityIqError("Failed to load scraped file {} : {}".format(str(fn_path), e))
    def split_locations(self, use_tqdm=False):
"""Split the scraped event files into sperate files per month and location,
which are required for later stages of processing. """
from operator import itemgetter
import pandas as pd
keys = ['timestamp', 'locationUid', 'eventType']
ig = itemgetter(*keys)
if use_tqdm:
from tqdm.auto import tqdm
else:
def tqdm(g, *args, **kwargs):
yield from g
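        # Split CSVs are written under <cache_dir>/event-locations/<locationUid>/<YYYY-MM-01>.csv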
cache = Path(self.config.cache_dir).joinpath(self.event_locations_dir)
locations = set()
for m in tqdm(list(self.yield_months()), desc='Months'):
recs = []
for r in self.iterate_records(tqdm(m[1], desc='Build dataframe')):
recs.append(ig(r))
if not recs:
continue
df = pd.DataFrame(recs, columns=keys).sort_values('timestamp')
grp = df.groupby('locationUid')
for name, frame in tqdm(grp, desc='Iterate groups'):
locations.add(name)
cache.joinpath(name).mkdir(parents=True, exist_ok=True)
fn = cache.joinpath('{}/{}.csv'.format(name, m[0].date().isoformat()))
frame.to_csv(fn)
    def iterate_splits(self, use_tqdm=False, locations=None):
"""Iterate over the splits produced by split_locations"""
cache = Path(self.config.cache_dir).joinpath(self.event_locations_dir)
if not locations:
locations = [e.name for e in cache.glob('*')]
if use_tqdm:
from tqdm.auto import tqdm
locations = tqdm(locations)
for location in locations:
locations_dir = cache.joinpath(location)
if locations_dir.is_dir():
                yield location, list(locations_dir.glob('*.csv'))
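
# A sketch of consuming the splits produced by split_locations() (pandas and the
# names below are illustrative assumptions, not part of this module):
#
#     import pandas as pd
#
#     for location, csv_files in scraper.iterate_splits():
#         frames = [pd.read_csv(f, index_col=0) for f in sorted(csv_files)]
#         if frames:
#             events = pd.concat(frames)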