"""
Dataloader for dummy data.
"""
# Author: Georgios Douzas <gdouzas@icloud.com>
# License: MIT
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from ._base import _BaseDataLoader
class DummySoccerDataLoader(_BaseDataLoader):
    """Dataloader for dummy data.

    The data are provided only for convenience, since they require
    no downloading, and to familiarize the user with the methods
    of the dataloader objects.

    Read more in the :ref:`user guide <user_guide>`.

    Parameters
    ----------
    param_grid : dict of str to sequence, or sequence of such parameter, default=None
        It selects the type of information that the data include. The keys of
        dictionaries might be parameters like ``'league'`` or ``'division'`` while
        the values are sequences of allowed values. It works in a similar way as the
        ``param_grid`` parameter of the :class:`~sklearn.model_selection.ParameterGrid`
        class. The default value ``None`` corresponds to all parameters.

    Examples
    --------
    >>> from sportsbet.datasets import DummySoccerDataLoader
    >>> import pandas as pd
    >>> # Get all available parameters to select the training data
    >>> DummySoccerDataLoader.get_all_params()
    [{'division': [1], 'league': ['France'], 'year': [2000]}, ...
    >>> # Select only the training data for the Spanish league
    >>> dataloader = DummySoccerDataLoader(param_grid={'league': ['Spain']})
    >>> # Get available odds types
    >>> dataloader.get_odds_types()
    ['interwetten', 'williamhill']
    >>> # Select the odds of Interwetten bookmaker
    >>> X_train, Y_train, O_train = dataloader.extract_train_data(
    ... odds_type='interwetten')
    >>> # Training input data
    >>> print(X_train) # doctest: +NORMALIZE_WHITESPACE
                division league  year    home_team ... williamhill__draw__odds
    date
    1997-05-04         1  Spain  1997  Real Madrid ...                     2.5
    1999-03-04         2  Spain  1999    Barcelona ...                     NaN
    >>> # Training output data
    >>> print(Y_train) # doctest: +NORMALIZE_WHITESPACE
       home_win__full_time_goals  draw__full_time_goals  away_win__full_time_goals
    0                       True                  False                      False
    1                      False                   True                      False
    >>> # Training odds data
    >>> print(O_train) # doctest: +NORMALIZE_WHITESPACE
       interwetten__home_win__odds  interwetten__draw__odds  interwetten__away_win__odds
    0                          1.5                      3.5                          2.5
    1                          2.5                      4.5                          2.0
    >>> # Extract the corresponding fixtures data
    >>> X_fix, Y_fix, O_fix = dataloader.extract_fixtures_data()
    >>> # Training and fixtures input and odds data have the same column names
    >>> pd.testing.assert_index_equal(X_train.columns, X_fix.columns)
    >>> pd.testing.assert_index_equal(O_train.columns, O_fix.columns)
    >>> # Fixtures data have always no output
    >>> Y_fix is None
    True
    """

    # Timestamp one day after the moment the class body is executed. It is
    # used below as the date (and year) of the rows flagged as fixtures.
    DATE = pd.Timestamp(datetime.now()) + pd.to_timedelta(1, 'd')

    # Grid of the selectable league / division / year combinations. The last
    # entry deliberately has no 'league' key.
    PARAMS = ParameterGrid(
        [
            {'league': ['Greece'], 'division': [1], 'year': [2017, 2019]},
            {'league': ['Spain'], 'division': [1], 'year': [1997]},
            {'league': ['Spain'], 'division': [2], 'year': [1999]},
            {'league': ['England'], 'division': [2], 'year': [1997]},
            {'league': ['England'], 'division': [3], 'year': [1998]},
            {'league': ['France'], 'division': [1], 'year': [2000]},
            {'league': ['France'], 'division': [1], 'year': [2001]},
            {'division': [1], 'year': [1998]},
        ]
    )

    # Pairs of (column name, data type).
    # NOTE(review): 'home_soccer_index', 'away_soccer_index' and
    # 'pinnacle__under_2.5__odds' are listed here but are not columns of DATA;
    # presumably the base class tolerates this - verify against _BaseDataLoader.
    SCHEMA = [
        ('division', int),
        ('league', object),
        ('date', np.datetime64),
        ('year', int),
        ('home_team', object),
        ('away_team', object),
        ('home_soccer_index', float),
        ('away_soccer_index', float),
        ('home_team__full_time_goals', int),
        ('away_team__full_time_goals', int),
        ('interwetten__home_win__odds', float),
        ('interwetten__draw__odds', float),
        ('interwetten__away_win__odds', float),
        ('williamhill__home_win__odds', float),
        ('williamhill__draw__odds', float),
        ('williamhill__away_win__odds', float),
        ('pinnacle__over_2.5__odds', float),
        ('pinnacle__under_2.5__odds', float),
    ]

    # Pairs of (outcome column name, function). Each function receives an
    # object indexable by the full time goals column names and compares or
    # sums the home and away goals.
    OUTCOMES = [
        (
            'home_win__full_time_goals',
            lambda outputs: outputs['home_team__full_time_goals']
            > outputs['away_team__full_time_goals'],
        ),
        (
            'away_win__full_time_goals',
            lambda outputs: outputs['home_team__full_time_goals']
            < outputs['away_team__full_time_goals'],
        ),
        (
            'draw__full_time_goals',
            lambda outputs: outputs['home_team__full_time_goals']
            == outputs['away_team__full_time_goals'],
        ),
        (
            'over_2.5__full_time_goals',
            lambda outputs: outputs['home_team__full_time_goals']
            + outputs['away_team__full_time_goals']
            > 2.5,
        ),
        (
            'under_2.5__full_time_goals',
            lambda outputs: outputs['home_team__full_time_goals']
            + outputs['away_team__full_time_goals']
            < 2.5,
        ),
    ]

    # The dummy data: twelve rows, two of which (the ones dated DATE) are
    # flagged as fixtures. Missing values are included on purpose in several
    # columns.
    DATA = pd.DataFrame(
        {
            'division': [1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 3.0, 1.0, 4.0, 3.0, 1.0, 1.0],
            'league': [
                'Greece',
                'Greece',
                'Greece',
                'Spain',
                'Spain',
                'England',
                'England',
                np.nan,
                np.nan,
                'France',
                'France',
                'France',
            ],
            'date': [
                pd.Timestamp('17/3/2017'),
                np.nan,
                pd.Timestamp('17/3/2019'),
                pd.Timestamp('5/4/1997'),
                pd.Timestamp('3/4/1999'),
                pd.Timestamp('5/7/1997'),
                pd.Timestamp('3/4/1998'),
                pd.Timestamp('3/4/1998'),
                DATE,
                DATE,
                pd.Timestamp('3/4/2000'),
                pd.Timestamp('6/4/2001'),
            ],
            'year': [
                2017,
                np.nan,
                2019,
                1997,
                1999,
                1997,
                1998,
                1998,
                DATE.year,
                DATE.year,
                2000,
                2001,
            ],
            'home_team': [
                'Olympiakos',
                np.nan,
                'Panathinaikos',
                'Real Madrid',
                'Barcelona',
                'Arsenal',
                'Liverpool',
                'Liverpool',
                'Barcelona',
                'Monaco',
                'Lens',
                'PSG',
            ],
            'away_team': [
                'Panathinaikos',
                np.nan,
                'AEK',
                'Barcelona',
                'Real Madrid',
                'Liverpool',
                'Arsenal',
                'Arsenal',
                'Real Madrid',
                'PSG',
                'Monaco',
                'Lens',
            ],
            'home_team__full_time_goals': [
                1,
                np.nan,
                1,
                2,
                2,
                np.nan,
                1,
                1,
                np.nan,
                np.nan,
                2,
                1,
            ],
            'away_team__full_time_goals': [
                1,
                np.nan,
                0,
                1,
                2,
                2,
                3,
                2,
                np.nan,
                np.nan,
                1,
                2,
            ],
            'interwetten__home_win__odds': [
                2.0,
                1.5,
                2,
                1.5,
                2.5,
                3,
                2,
                np.nan,
                3,
                1.5,
                2.0,
                3.0,
            ],
            'interwetten__draw__odds': [
                2,
                2,
                2,
                3.5,
                4.5,
                2.5,
                4.5,
                2.5,
                2.5,
                3.5,
                2.5,
                2.5,
            ],
            'interwetten__away_win__odds': [
                2,
                2,
                3,
                2.5,
                2,
                2,
                3.5,
                3.5,
                2,
                2.5,
                3.0,
                2.0,
            ],
            'williamhill__home_win__odds': [
                2,
                1.5,
                3.5,
                2.5,
                2.0,
                3.0,
                2.0,
                4.0,
                3.5,
                2.5,
                2.5,
                2.5,
            ],
            'williamhill__draw__odds': [
                2,
                np.nan,
                1.5,
                2.5,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                2.5,
                1.5,
                2.5,
                3.0,
            ],
            'williamhill__away_win__odds': [
                2,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                2.0,
                2.5,
                3.0,
                2.5,
            ],
            'pinnacle__over_2.5__odds': [
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
            ],
            'fixtures': [
                False,
                False,
                False,
                False,
                False,
                False,
                False,
                False,
                True,
                True,
                False,
                False,
            ],
        }
    )

    def _get_data(self):
        """Return the dummy data.

        Returns
        -------
        data : :class:`pandas.DataFrame`
            The class level ``DATA`` object itself, not a copy.
        """
        return self.DATA