Source code for sportsbet.datasets._soccer._fte

"""
Download and transform historical and fixtures data
for various leagues from FiveThirtyEight.

FiveThirtyEight: https://github.com/fivethirtyeight/data/tree/master/soccer-spi
"""

# Author: Georgios Douzas <gdouzas@icloud.com>
# License: MIT

from functools import lru_cache
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid

from . import OUTCOMES
from .._base import _BaseDataLoader, _read_csv

# Location of the FiveThirtyEight club matches CSV that `_extract_data` reads.
URL = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv'

# Lookup table from the `league_id` column of the CSV to a
# `(league, division)` pair; `_extract_data` unpacks each pair into the
# `league` and `division` columns.
#
# Several ids intentionally share the same pair (4582 and 9541 -> NWSL,
# 1952 and 1975 -> Mexico). The lookup is a plain `LEAGUES_MAPPING[lid]`,
# so a `league_id` that is missing from this table raises `KeyError`.
LEAGUES_MAPPING = {
    7921: ('FAWSL', 1),
    10281: ('Europa', 1),
    4582: ('NWSL', 1),
    9541: ('NWSL', 1),
    2160: ('United-Soccer-League', 1),
    1818: ('Champions-League', 1),
    1820: ('Europa-League', 1),
    1843: ('France', 1),
    2411: ('England', 1),
    1869: ('Spain', 1),
    1854: ('Italy', 1),
    1845: ('Germany', 1),
    1951: ('USA', 1),
    1874: ('Sweden', 1),
    1859: ('Norway', 1),
    2105: ('Brazil', 1),
    1866: ('Russia', 1),
    1952: ('Mexico', 1),
    1975: ('Mexico', 1),
    1827: ('Austria', 1),
    1879: ('Switzerland', 1),
    1844: ('France', 2),
    1846: ('Germany', 2),
    2412: ('England', 2),
    2417: ('Scotland', 1),
    1864: ('Portugal', 1),
    1849: ('Netherlands', 1),
    1882: ('Turkey', 1),
    1871: ('Spain', 2),
    1856: ('Italy', 2),
    5641: ('Argentina', 1),
    1837: ('Denmark', 1),
    1832: ('Belgium', 1),
    1947: ('Japan', 1),
    1979: ('China', 1),
    2413: ('England', 3),
    1983: ('South-Africa', 1),
    2414: ('England', 4),
    1884: ('Greece', 1),
    1948: ('Australia', 1),
}


def _extract_data():
    """Load the FiveThirtyEight matches table and add league/year columns.

    The CSV at ``URL`` is read through ``_read_csv`` and copied before any
    column is added, so the frame returned by the helper is not modified.

    Returns
    -------
    data : pandas.DataFrame
        The raw matches with three extra columns: ``league`` and
        ``division`` (looked up from ``league_id`` in ``LEAGUES_MAPPING``)
        and ``year`` (``season + 1``).

    Raises
    ------
    KeyError
        If a ``league_id`` value is not a key of ``LEAGUES_MAPPING``.
    """
    matches = _read_csv(URL, parse_dates='date').copy()

    # One (league, division) tuple per row, in row order.
    league_info = [LEAGUES_MAPPING[league_id] for league_id in matches['league_id']]
    matches[['league', 'division']] = pd.DataFrame(league_info)

    matches['year'] = matches['season'] + 1
    return matches


class FTESoccerDataLoader(_BaseDataLoader):
    """Dataloader for FiveThirtyEight data.

    Read more in the :ref:`user guide <user_guide>`.

    Parameters
    ----------
    param_grid : dict of str to sequence, or sequence of such parameter, default=None
        The parameter grid to explore, as a dictionary mapping data parameters
        to sequences of allowed values. An empty dict signifies default
        parameters. A sequence of dicts signifies a sequence of grids to
        search, and is useful to avoid exploring parameter combinations that
        do not exist. The default value corresponds to all parameters.

    Examples
    --------
    >>> from sportsbet.datasets import FTESoccerDataLoader
    >>> dataloader = FTESoccerDataLoader()
    >>> X_train, Y_train, O_train = dataloader.extract_train_data()
    >>> X_fix, Y_fix, O_fix = dataloader.extract_fixtures_data()
    """

    # Raw columns dropped by `_get_data` once `league`, `division` and
    # `year` have been derived from them.
    _removed_cols = ['season', 'league_id']

    # Raw CSV column name -> column name exposed by this loader.
    _cols_mapping = {
        'team1': 'home_team',
        'team2': 'away_team',
        'date': 'date',
        'spi1': 'home_team_soccer_power_index',
        'spi2': 'away_team_soccer_power_index',
        'prob1': 'home_team_probability_win',
        'prob2': 'away_team_probability_win',
        'probtie': 'probability_draw',
        'proj_score1': 'home_team_projected_score',
        'proj_score2': 'away_team_projected_score',
        'importance1': 'home_team_match_importance',
        'importance2': 'away_team_match_importance',
        'score1': 'home_team__full_time_goals',
        'score2': 'away_team__full_time_goals',
        'xg1': 'home_team__full_time_shot_expected_goals',
        'xg2': 'away_team__full_time_shot_expected_goals',
        'nsxg1': 'home_team__full_time_non_shot_expected_goals',
        'nsxg2': 'away_team__full_time_non_shot_expected_goals',
        'adj_score1': 'home_team__full_time_adjusted_goals',
        'adj_score2': 'away_team__full_time_adjusted_goals',
    }

    @classmethod
    def _get_schema(cls):
        """Return the ``(column name, type)`` pairs of the renamed data.

        A new list is built on every call.
        """
        return [
            # Parameters and match identification.
            ('year', int),
            ('division', int),
            ('match_quality', float),
            ('league', object),
            ('home_team', object),
            ('away_team', object),
            ('date', np.datetime64),
            # Pre-match ratings and projections.
            ('home_team_soccer_power_index', float),
            ('away_team_soccer_power_index', float),
            ('home_team_probability_win', float),
            ('away_team_probability_win', float),
            ('probability_draw', float),
            ('home_team_projected_score', float),
            ('away_team_projected_score', float),
            ('home_team_match_importance', float),
            ('away_team_match_importance', float),
            # Full-time results (double underscore in the name).
            ('home_team__full_time_goals', int),
            ('away_team__full_time_goals', int),
            ('home_team__full_time_shot_expected_goals', float),
            ('away_team__full_time_shot_expected_goals', float),
            ('home_team__full_time_non_shot_expected_goals', float),
            ('away_team__full_time_non_shot_expected_goals', float),
            ('home_team__full_time_adjusted_goals', float),
            ('away_team__full_time_adjusted_goals', float),
        ]

    @classmethod
    def _get_outcomes(cls):
        """Return the package-level ``OUTCOMES`` object."""
        return OUTCOMES

    @classmethod
    @lru_cache
    def _get_params(cls):
        """Return the grid of available ``league``/``division``/``year`` values.

        Each distinct combination found in the extracted data becomes its
        own single-point grid, so only combinations that actually occur are
        part of the returned ``ParameterGrid``. The result is memoized.
        """
        combinations = _extract_data()[['league', 'division', 'year']].drop_duplicates()
        grids = []
        for record in combinations.to_dict('records'):
            # ParameterGrid expects every value to be a sequence of options.
            grids.append({key: [value] for key, value in record.items()})
        return ParameterGrid(grids)

    # NOTE(review): `lru_cache` on an instance method keys the cache on
    # `self`, keeps every instance alive for the cache's lifetime and hands
    # the same DataFrame object back on repeated calls. Left unchanged here
    # because callers may rely on it -- confirm before replacing.
    @lru_cache
    def _get_data(self):
        """Return the extracted matches with derived and renamed columns.

        Adds ``match_quality`` (harmonic mean of ``spi1`` and ``spi2``) and
        ``fixtures`` (True when both ``score1`` and ``score2`` are missing),
        drops ``_removed_cols`` and renames columns via ``_cols_mapping``.
        """
        matches = _extract_data()

        inverse_spi_sum = 1 / matches['spi1'] + 1 / matches['spi2']
        matches['match_quality'] = 2 / inverse_spi_sum

        home_score_missing = matches['score1'].isna()
        away_score_missing = matches['score2'].isna()
        matches['fixtures'] = home_score_missing & away_score_missing

        trimmed = matches.drop(columns=self._removed_cols)
        return trimmed.rename(columns=self._cols_mapping)