"""
Download and transform historical and fixtures data
for various leagues from Football-Data.co.uk.
Football-Data.co.uk: http://www.football-data.co.uk/data.php
"""
# Author: Georgios Douzas <gdouzas@icloud.com>
# License: MIT
from urllib.request import urlopen, urljoin
from datetime import datetime
from os.path import join
from functools import lru_cache
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from rich.progress import track
from sklearn.model_selection import ParameterGrid
from ._utils import OUTPUTS, _read_csv
from .._base import _BaseDataLoader
# Root of the Football-Data.co.uk site. Page paths and CSV paths used in this
# module are resolved against it.
URL = 'https://www.football-data.co.uk'

# Country pages that ``_get_params`` scrapes for CSV links.
# - Entries starting with a lower-case letter (``<country>m.php``) are handled
#   as main leagues: their CSV links are split into season and division parts.
# - Entries starting with an upper-case letter are handled as extra leagues:
#   the available years are read from the ``Season`` column of the CSV itself.
BASE_URLS = [
    'englandm.php',
    'scotlandm.php',
    'germanym.php',
    'italym.php',
    'spainm.php',
    'francem.php',
    'netherlandsm.php',
    'belgiumm.php',
    'portugalm.php',
    'turkeym.php',
    'greecem.php',
    'Argentina.php',
    'Austria.php',
    'Brazil.php',
    'China.php',
    'Denmark.php',
    'Finland.php',
    'Ireland.php',
    'Japan.php',
    'Mexico.php',
    'Norway.php',
    'Poland.php',
    'Romania.php',
    'Russia.php',
    'Sweden.php',
    'Switzerland.php',
    'USA.php',
]
# League name -> (file code prefix, *division codes).
# The first element is the prefix used in CSV file names and in the ``Div``
# column of the fixtures file; the remaining elements are the division codes
# available for that league. For leagues whose codes start at '0', this module
# exposes divisions shifted by one (code '0' <-> division 1) and maps the
# code 'C' to division 5.
LEAGUES_MAPPING = {
    'England': ('E', '0', '1', '2', '3', 'C'),
    'Scotland': ('SC', '0', '1', '2', '3', 'C'),
    'Germany': ('D', '1', '2'),
    'Italy': ('I', '1', '2'),
    'Spain': ('SP', '1', '2'),
    'France': ('F', '1', '2'),
    'Netherlands': ('N', '1'),
    'Belgium': ('B', '1'),
    'Portugal': ('P', '1'),
    'Turkey': ('T', '1'),
    'Greece': ('G', '1'),
    'Argentina': ('ARG', '1'),
    'Austria': ('AUT', '1'),
    'Brazil': ('BRA', '1'),
    'China': ('CHN', '1'),
    'Denmark': ('DNK', '1'),
    'Finland': ('FIN', '1'),
    'Ireland': ('IRL', '1'),
    'Japan': ('JPN', '1'),
    'Mexico': ('MEX', '1'),
    'Norway': ('NOR', '1'),
    'Poland': ('POL', '1'),
    'Romania': ('ROU', '1'),
    'Russia': ('RUS', '1'),
    'Sweden': ('SWE', '1'),
    'Switzerland': ('SWZ', '1'),
    'USA': ('USA', '1'),
}
# Raw column names that ``FDSoccerDataLoader._get_data`` drops from every
# downloaded table before renaming the remaining columns. ``'divisions'`` is a
# temporary helper column created while processing the fixtures file.
REMOVED_COLS = [
    'Div',
    'Country',
    'Season',
    'Time',
    'FTR',
    'Res',
    'Attendance',
    'Referee',
    'HTR',
    'BbAH',
    'Bb1X2',
    'BbOU',
    'League',
    'divisions',
]
# Raw CSV column name -> standardized column name, applied with
# ``DataFrame.rename`` in ``FDSoccerDataLoader._get_data``. Several raw names
# may map to the same standardized name (e.g. 'HT', 'Home' and 'HomeTeam' all
# become 'home_team'). The standardized names are the ones listed in
# ``FDSoccerDataLoader.SCHEMA``, so the two must be kept in sync.
COLS_MAPPING = {
    # Team names, date and alternative spellings of a few odds/goals columns
    'HT': 'home_team',
    'Home': 'home_team',
    'AT': 'away_team',
    'Away': 'away_team',
    'LB': 'odds__ladbrokes__home_win__full_time_goals',
    'LB.1': 'odds__ladbrokes__draw__full_time_goals',
    'LB.2': 'odds__ladbrokes__away_win__full_time_goals',
    'PH': 'odds__pinnacle__home_win__full_time_goals',
    'PD': 'odds__pinnacle__draw__full_time_goals',
    'PA': 'odds__pinnacle__away_win__full_time_goals',
    'HG': 'home_team__full_time_goals',
    'AG': 'away_team__full_time_goals',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'Date': 'date',
    # Handicap size columns
    'B365AH': 'odds__bet365__size_of_asian_handicap_home_team__full_time_goals',
    'LBAH': 'odds__ladbrokes__size_of_asian_handicap_home_team__full_time_goals',
    'BbAHh': 'odds__betbrain__size_of_asian_handicap_home_team__full_time_goals',
    'GBAH': 'odds__gamebookers__size_of_handicap_home_team__full_time_goals',
    'AHh': 'odds__market_average__size_of_handicap_home_team__full_time_goals',
    'AHCh': 'odds__market_average_closing__size_of_asian_handicap_home_team__full_time_goals',
    # Odds columns, grouped by provider prefix
    'B365H': 'odds__bet365__home_win__full_time_goals',
    'B365D': 'odds__bet365__draw__full_time_goals',
    'B365A': 'odds__bet365__away_win__full_time_goals',
    'B365>2.5': 'odds__bet365__over_2.5__full_time_goals',
    'B365<2.5': 'odds__bet365__under_2.5__full_time_goals',
    'B365AHH': 'odds__bet365__asian_handicap_home_team__full_time_goals',
    'B365AHA': 'odds__bet365__asian_handicap_away_team__full_time_goals',
    'B365CH': 'odds__bet365_closing__home_win__full_time_goals',
    'B365CD': 'odds__bet365_closing__draw__full_time_goals',
    'B365CA': 'odds__bet365_closing__away_win__full_time_goals',
    'B365C>2.5': 'odds__bet365_closing__over_2.5__full_time_goals',
    'B365C<2.5': 'odds__bet365_closing__under_2.5__full_time_goals',
    'B365CAHH': 'odds__bet365_closing__asian_handicap_home_team__full_time_goals',
    'B365CAHA': 'odds__bet365_closing__asian_handicap_away_team__full_time_goals',
    'BbMxH': 'odds__betbrain_maximum__home_win__full_time_goals',
    'BbMxD': 'odds__betbrain_maximum__draw__full_time_goals',
    'BbMxA': 'odds__betbrain_maximum__away_win__full_time_goals',
    'BbMx>2.5': 'odds__betbrain_maximum__over_2.5__full_time_goals',
    'BbMx<2.5': 'odds__betbrain_maximum__under_2.5__full_time_goals',
    'BbMxAHH': 'odds__betbrain_maximum__asian_handicap_home_team__full_time_goals',
    'BbMxAHA': 'odds__betbrain_maximum__asian_handicap_away_team__full_time_goals',
    'BbAvH': 'odds__betbrain_average__home_win__full_time_goals',
    # NOTE(review): 'draw_win' differs from the 'draw' used by the other
    # providers; SCHEMA uses the same spelling, so change both together.
    'BbAvD': 'odds__betbrain_average__draw_win__full_time_goals',
    'BbAvA': 'odds__betbrain_average__away_win__full_time_goals',
    'BbAv>2.5': 'odds__betbrain_average__over_2.5__full_time_goals',
    'BbAv<2.5': 'odds__betbrain_average__under_2.5__full_time_goals',
    'BbAvAHH': 'odds__betbrain_average__asian_handicap_home_team__full_time_goals',
    'BbAvAHA': 'odds__betbrain_average__asian_handicap_away_team__full_time_goals',
    'BWH': 'odds__betwin__home_win__full_time_goals',
    'BWD': 'odds__betwin__draw__full_time_goals',
    'BWA': 'odds__betwin__away_win__full_time_goals',
    'BWCH': 'odds__betwin_closing__home_win__full_time_goals',
    'BWCD': 'odds__betwin_closing__draw__full_time_goals',
    'BWCA': 'odds__betwin_closing__away_win__full_time_goals',
    'BSH': 'odds__bluesquare__home_win__full_time_goals',
    'BSD': 'odds__bluesquare__draw__full_time_goals',
    'BSA': 'odds__bluesquare__away_win__full_time_goals',
    'GBH': 'odds__gamebookers__home_win__full_time_goals',
    'GBD': 'odds__gamebookers__draw__full_time_goals',
    'GBA': 'odds__gamebookers__away_win__full_time_goals',
    'GB>2.5': 'odds__gamebookers__over_2.5__full_time_goals',
    'GB<2.5': 'odds__gamebookers__under_2.5__full_time_goals',
    'GBAHH': 'odds__gamebookers__asian_handicap_home_team__full_time_goals',
    'GBAHA': 'odds__gamebookers__asian_handicap_away_team__full_time_goals',
    'IWH': 'odds__interwetten__home_win__full_time_goals',
    'IWD': 'odds__interwetten__draw__full_time_goals',
    'IWA': 'odds__interwetten__away_win__full_time_goals',
    'IWCH': 'odds__interwetten_closing__home_win__full_time_goals',
    'IWCD': 'odds__interwetten_closing__draw__full_time_goals',
    'IWCA': 'odds__interwetten_closing__away_win__full_time_goals',
    'LBH': 'odds__ladbrokes__home_win__full_time_goals',
    'LBD': 'odds__ladbrokes__draw__full_time_goals',
    'LBA': 'odds__ladbrokes__away_win__full_time_goals',
    'LBAHH': 'odds__ladbrokes__asian_handicap_home_team__full_time_goals',
    'LBAHA': 'odds__ladbrokes__asian_handicap_away_team__full_time_goals',
    'PSH': 'odds__pinnacle__home_win__full_time_goals',
    'PSD': 'odds__pinnacle__draw__full_time_goals',
    'PSA': 'odds__pinnacle__away_win__full_time_goals',
    'P>2.5': 'odds__pinnacle__over_2.5__full_time_goals',
    'P<2.5': 'odds__pinnacle__under_2.5__full_time_goals',
    'PAHH': 'odds__pinnacle__asian_handicap_home_team__full_time_goals',
    'PAHA': 'odds__pinnacle__asian_handicap_away_team__full_time_goals',
    'PSCH': 'odds__pinnacle_closing__home_win__full_time_goals',
    'PSCD': 'odds__pinnacle_closing__draw__full_time_goals',
    'PSCA': 'odds__pinnacle_closing__away_win__full_time_goals',
    'PC>2.5': 'odds__pinnacle_closing__over_2.5__full_time_goals',
    'PC<2.5': 'odds__pinnacle_closing__under_2.5__full_time_goals',
    'PCAHH': 'odds__pinnacle_closing__asian_handicap_home_team__full_time_goals',
    'PCAHA': 'odds__pinnacle_closing__asian_handicap_away_team__full_time_goals',
    'SOH': 'odds__sporting__home_win__full_time_goals',
    'SOD': 'odds__sporting__draw__full_time_goals',
    'SOA': 'odds__sporting__away_win__full_time_goals',
    'SBH': 'odds__sportingbet__home_win__full_time_goals',
    'SBD': 'odds__sportingbet__draw__full_time_goals',
    'SBA': 'odds__sportingbet__away_win__full_time_goals',
    'SJH': 'odds__stanjames__home_win__full_time_goals',
    'SJD': 'odds__stanjames__draw__full_time_goals',
    'SJA': 'odds__stanjames__away_win__full_time_goals',
    'SYH': 'odds__stanleybet__home_win__full_time_goals',
    'SYD': 'odds__stanleybet__draw__full_time_goals',
    'SYA': 'odds__stanleybet__away_win__full_time_goals',
    'VCH': 'odds__vcbet__home_win__full_time_goals',
    'VCD': 'odds__vcbet__draw__full_time_goals',
    'VCA': 'odds__vcbet__away_win__full_time_goals',
    'VCCH': 'odds__vcbet_closing__home_win__full_time_goals',
    'VCCD': 'odds__vcbet_closing__draw__full_time_goals',
    'VCCA': 'odds__vcbet_closing__away_win__full_time_goals',
    'WHH': 'odds__williamhill__home_win__full_time_goals',
    'WHD': 'odds__williamhill__draw__full_time_goals',
    'WHA': 'odds__williamhill__away_win__full_time_goals',
    'WHCH': 'odds__williamhill_closing__home_win__full_time_goals',
    'WHCD': 'odds__williamhill_closing__draw__full_time_goals',
    'WHCA': 'odds__williamhill_closing__away_win__full_time_goals',
    'MaxH': 'odds__market_maximum__home_win__full_time_goals',
    'MaxD': 'odds__market_maximum__draw__full_time_goals',
    'MaxA': 'odds__market_maximum__away_win__full_time_goals',
    'Max>2.5': 'odds__market_maximum__over_2.5__full_time_goals',
    'Max<2.5': 'odds__market_maximum__under_2.5__full_time_goals',
    'MaxAHH': 'odds__market_maximum__asian_handicap_home_team__full_time_goals',
    'MaxAHA': 'odds__market_maximum__asian_handicap_away_team__full_time_goals',
    'MaxCH': 'odds__market_maximum_closing__home_win__full_time_goals',
    'MaxCD': 'odds__market_maximum_closing__draw__full_time_goals',
    'MaxCA': 'odds__market_maximum_closing__away_win__full_time_goals',
    'MaxC>2.5': 'odds__market_maximum_closing__over_2.5__full_time_goals',
    'MaxC<2.5': 'odds__market_maximum_closing__under_2.5__full_time_goals',
    'MaxCAHH': 'odds__market_maximum_closing__asian_handicap_home_team__full_time_goals',
    'MaxCAHA': 'odds__market_maximum_closing__asian_handicap_away_team__full_time_goals',
    'AvgH': 'odds__market_average__home_win__full_time_goals',
    'AvgD': 'odds__market_average__draw__full_time_goals',
    'AvgA': 'odds__market_average__away_win__full_time_goals',
    'Avg>2.5': 'odds__market_average__over_2.5__full_time_goals',
    'Avg<2.5': 'odds__market_average__under_2.5__full_time_goals',
    'AvgAHH': 'odds__market_average__asian_handicap_home_team__full_time_goals',
    'AvgAHA': 'odds__market_average__asian_handicap_away_team__full_time_goals',
    'AvgCH': 'odds__market_average_closing__home_win__full_time_goals',
    'AvgCD': 'odds__market_average_closing__draw__full_time_goals',
    'AvgCA': 'odds__market_average_closing__away_win__full_time_goals',
    'AvgC>2.5': 'odds__market_average_closing__over_2.5__full_time_goals',
    'AvgC<2.5': 'odds__market_average_closing__under_2.5__full_time_goals',
    'AvgCAHH': 'odds__market_average_closing__asian_handicap_home_team__full_time_goals',
    'AvgCAHA': 'odds__market_average_closing__asian_handicap_away_team__full_time_goals',
    # Match outcome columns (prefixed with 'target__')
    'FTHG': 'target__home_team__full_time_goals',
    'FTAG': 'target__away_team__full_time_goals',
    'HTHG': 'target__home_team__half_time_goals',
    'HTAG': 'target__away_team__half_time_goals',
    'HS': 'target__home_team__shots',
    'AS': 'target__away_team__shots',
    'HST': 'target__home_team__shots_on_target',
    'AST': 'target__away_team__shots_on_target',
    # NOTE(review): 'woodork' looks like a typo for 'woodwork'; SCHEMA uses the
    # same spelling, so change both together.
    'HHW': 'target__home_team__hit_woodork',
    'AHW': 'target__away_team__hit_woodork',
    'HC': 'target__home_team__corners',
    'AC': 'target__away_team__corners',
    'HF': 'target__home_team__fouls_committed',
    'AF': 'target__away_team__fouls_committed',
    'HFKC': 'target__home_team__free_kicks_conceded',
    'AFKC': 'target__away_team__free_kicks_conceded',
    'HO': 'target__home_team__offsides',
    'AO': 'target__away_team__offsides',
    'HY': 'target__home_team__yellow_cards',
    'AY': 'target__away_team__yellow_cards',
    'HR': 'target__home_team__red_cards',
    'AR': 'target__away_team__red_cards',
    'HBP': 'target__home_team__bookings_points',
    'ABP': 'target__away_team__bookings_points',
}
def _convert_base_url_to_league(base_url):
league = base_url.replace('.php', '')
if base_url[0].islower():
league = league[:-1].capitalize()
return league
def _extract_csv_urls(base_url):
    """Collect the CSV links found on a Football-Data.co.uk page.

    Parameters
    ----------
    base_url : str
        Page path resolved against ``URL``, e.g. ``'englandm.php'``.

    Returns
    -------
    csv_urls : set of str
        The ``href`` value, exactly as written in the page, of every anchor
        whose ``href`` ends with ``'csv'``.
    """
    # Close the HTTP response deterministically instead of leaking the socket.
    with urlopen(urljoin(URL, base_url)) as response:
        soup = BeautifulSoup(response.read(), features='html.parser')
    # ``href=True`` skips anchors without an ``href`` attribute; for those,
    # ``el.get('href')`` is ``None`` and calling ``.endswith`` would raise.
    return {
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith('csv')
    }
def _param_grid_to_csv_urls(param_grid):
    """Map every parameter combination to the URL of its CSV file.

    Parameters
    ----------
    param_grid : iterable of dict
        Each dict has the keys ``'league'``, ``'division'`` and ``'year'``.

    Returns
    -------
    urls : list of (dict, str)
        Pairs of the parameters and the corresponding CSV URL. Main leagues
        (those with a ``<league>m.php`` entry in ``BASE_URLS``) have one file
        per season and division under ``mmz4281``; every other league has a
        single file under ``new``.

    Raises
    ------
    KeyError
        If a league is not a key of ``LEAGUES_MAPPING``.
    """
    urls = []
    for params in param_grid:
        in_main_leagues = f'{params["league"].lower()}m.php' in BASE_URLS
        encoded_league, *divisions = LEAGUES_MAPPING[params['league']]
        if not in_main_leagues:
            # URLs are joined with '/' explicitly: ``os.path.join`` would
            # insert backslashes on Windows.
            urls.append((params, f'{URL}/new/{encoded_league}.csv'))
            continue
        # Season folder made of the last two digits of the two calendar
        # years, e.g. year 2021 -> '2021' and year 2000 -> '9900'.
        year = f'{str(params["year"] - 1)[2:]}{str(params["year"])[2:]}'
        if '0' in divisions:
            # Division codes start at '0' for these leagues and the fifth
            # division is coded as 'C'.
            division = str(params['division'] - 1) if params['division'] != 5 else 'C'
        else:
            division = str(params['division'])
        urls.append((params, f'{URL}/mmz4281/{year}/{encoded_league}{division}.csv'))
    return urls
@lru_cache
def _get_params():
    """Discover every available (league, division, year) combination.

    Each page in ``BASE_URLS`` is scraped for CSV links. For main leagues
    (page name starts with a lower-case letter) the year and division are
    parsed from the link itself; for the other leagues the CSV is downloaded
    and the years are derived from its ``Season`` column. The result is
    cached, so the site is only queried once per process.

    Returns
    -------
    params : ParameterGrid
        Grid built from one ``{'league', 'division', 'year'}`` dict per CSV.
    """
    full_param_grid = []
    for base_url in BASE_URLS:
        league = _convert_base_url_to_league(base_url)
        divisions = LEAGUES_MAPPING[league][1:]
        # Sorted so that the grid order does not depend on set iteration
        # order (string hashing is randomized per process).
        for url in sorted(_extract_csv_urls(base_url)):
            if base_url[0].islower():
                # Links look like '<folder>/<season>/<code><division>.csv'.
                _, year, division = url.split('/')
                # The season folder is e.g. '2021'; its last two digits are
                # the ending year of the season.
                year = datetime.strptime(year[2:], '%y').year
                division = division.replace('.csv', '')[-1]
                param_grid = {
                    'league': [league],
                    'division': [
                        # Codes starting at '0' are shifted to start at 1;
                        # the code 'C' is exposed as division 5.
                        int(division) + int('0' in divisions)
                        if division != 'C'
                        else 5
                    ],
                    'year': [year],
                }
            else:
                seasons = _read_csv(urljoin(URL, url))['Season']
                # A season is either a number (year = season + 1) or a
                # '<start>/<end>' string (year = end). ``isinstance`` also
                # accepts ``str`` subclasses such as ``numpy.str_``.
                years = sorted(
                    {
                        int(season.split('/')[-1])
                        if isinstance(season, str)
                        else season + 1
                        for season in seasons.unique()
                    }
                )
                param_grid = {'league': [league], 'division': [1], 'year': years}
            full_param_grid.append(param_grid)
    return ParameterGrid(full_param_grid)
class _ClassProperty:
    """Read-only descriptor exposing ``fget(cls)`` as a class-level attribute.

    It replaces the ``@classmethod`` + ``@property`` stacking, which was
    deprecated in Python 3.11 and no longer works from Python 3.13.
    """

    def __init__(self, fget):
        self.fget = fget

    def __get__(self, instance, owner=None):
        # Supports both ``Class.attr`` and ``instance.attr`` access.
        return self.fget(type(instance) if owner is None else owner)


class FDSoccerDataLoader(_BaseDataLoader):
    """Dataloader for Football-Data.co.uk soccer data.

    It downloads historical and fixtures data from
    `Football-Data.co.uk <http://www.football-data.co.uk/data.php>`_.

    Read more in the :ref:`user guide <user_guide>`.

    Parameters
    ----------
    param_grid : dict of str to sequence, or sequence of such parameter, default=None
        It selects the type of information that the data include. The keys of
        dictionaries might be parameters like ``'league'`` or ``'division'`` while
        the values are sequences of allowed values. It works in a similar way as the
        ``param_grid`` parameter of the :class:`~sklearn.model_selection.ParameterGrid`
        class. The default value ``None`` corresponds to all parameters.

    Examples
    --------
    >>> from sportsbet.datasets import FDSoccerDataLoader
    >>> import pandas as pd
    >>> # Get all available parameters to select the training data
    >>> FDSoccerDataLoader.get_all_params()
    [{'division': 1, 'league': 'Argentina', 'year': 2013}, ...
    >>> # Select only the training data for the English league and 2020, 2021 years
    >>> dataloader = FDSoccerDataLoader(
    ...     param_grid={'league': ['England'], 'year': [2020, 2021]})
    >>> # Get available odds types
    >>> dataloader.get_odds_types()
    Football-Data.co.uk...
    [..., 'market_average', ...]
    >>> # Select the market average odds and drop columns with missing values
    >>> X_train, Y_train, O_train = dataloader.extract_train_data(
    ...     odds_type='market_average', drop_na_thres=1.0)
    >>> # Odds data include the selected market average odds
    >>> O_train.columns
    Index(['odds__market_average__home_win__full_time_goals', ...
    >>> # Extract the corresponding fixtures data
    >>> X_fix, Y_fix, O_fix = dataloader.extract_fixtures_data()
    >>> # Training and fixtures input and odds data have the same column names
    >>> pd.testing.assert_index_equal(X_train.columns, X_fix.columns)
    >>> pd.testing.assert_index_equal(O_train.columns, O_fix.columns)
    >>> # Fixtures data have always no output
    >>> Y_fix is None
    True
    """

    # (column name, dtype) pairs. The column names are the standardized names
    # produced by ``COLS_MAPPING`` plus the 'league', 'division' and 'year'
    # columns added in ``_get_data``.
    SCHEMA = [
        ('league', object),
        ('division', int),
        ('year', int),
        ('home_team', object),
        ('away_team', object),
        ('date', np.datetime64),
        ('odds__bet365__home_win__full_time_goals', float),
        ('odds__bet365__draw__full_time_goals', float),
        ('odds__bet365__away_win__full_time_goals', float),
        ('odds__bet365__over_2.5__full_time_goals', float),
        ('odds__bet365__under_2.5__full_time_goals', float),
        ('odds__bet365__asian_handicap_home_team__full_time_goals', float),
        ('odds__bet365__asian_handicap_away_team__full_time_goals', float),
        ('odds__bet365_closing__home_win__full_time_goals', float),
        ('odds__bet365_closing__draw__full_time_goals', float),
        ('odds__bet365_closing__away_win__full_time_goals', float),
        ('odds__bet365_closing__over_2.5__full_time_goals', float),
        ('odds__bet365_closing__under_2.5__full_time_goals', float),
        ('odds__bet365_closing__asian_handicap_home_team__full_time_goals', float),
        ('odds__bet365_closing__asian_handicap_away_team__full_time_goals', float),
        ('odds__bet365__size_of_asian_handicap_home_team__full_time_goals', object),
        ('odds__betbrain_maximum__home_win__full_time_goals', float),
        ('odds__betbrain_maximum__draw__full_time_goals', float),
        ('odds__betbrain_maximum__away_win__full_time_goals', float),
        ('odds__betbrain_maximum__over_2.5__full_time_goals', float),
        ('odds__betbrain_maximum__under_2.5__full_time_goals', float),
        ('odds__betbrain_maximum__asian_handicap_home_team__full_time_goals', float),
        ('odds__betbrain_maximum__asian_handicap_away_team__full_time_goals', float),
        ('odds__betbrain_average__home_win__full_time_goals', float),
        ('odds__betbrain_average__draw_win__full_time_goals', float),
        ('odds__betbrain_average__away_win__full_time_goals', float),
        ('odds__betbrain_average__over_2.5__full_time_goals', float),
        ('odds__betbrain_average__under_2.5__full_time_goals', float),
        ('odds__betbrain_average__asian_handicap_home_team__full_time_goals', float),
        ('odds__betbrain_average__asian_handicap_away_team__full_time_goals', float),
        ('odds__betbrain__size_of_asian_handicap_home_team__full_time_goals', object),
        ('odds__betwin__home_win__full_time_goals', float),
        ('odds__betwin__draw__full_time_goals', float),
        ('odds__betwin__away_win__full_time_goals', float),
        ('odds__betwin_closing__home_win__full_time_goals', float),
        ('odds__betwin_closing__draw__full_time_goals', float),
        ('odds__betwin_closing__away_win__full_time_goals', float),
        ('odds__bluesquare__home_win__full_time_goals', float),
        ('odds__bluesquare__draw__full_time_goals', float),
        ('odds__bluesquare__away_win__full_time_goals', float),
        ('odds__gamebookers__home_win__full_time_goals', float),
        ('odds__gamebookers__draw__full_time_goals', float),
        ('odds__gamebookers__away_win__full_time_goals', float),
        ('odds__gamebookers__over_2.5__full_time_goals', float),
        ('odds__gamebookers__under_2.5__full_time_goals', float),
        ('odds__gamebookers__asian_handicap_home_team__full_time_goals', float),
        ('odds__gamebookers__asian_handicap_away_team__full_time_goals', float),
        ('odds__gamebookers__size_of_handicap_home_team__full_time_goals', object),
        ('odds__interwetten__home_win__full_time_goals', float),
        ('odds__interwetten__draw__full_time_goals', float),
        ('odds__interwetten__away_win__full_time_goals', float),
        ('odds__interwetten_closing__home_win__full_time_goals', float),
        ('odds__interwetten_closing__draw__full_time_goals', float),
        ('odds__interwetten_closing__away_win__full_time_goals', float),
        ('odds__ladbrokes__home_win__full_time_goals', float),
        ('odds__ladbrokes__draw__full_time_goals', float),
        ('odds__ladbrokes__away_win__full_time_goals', float),
        ('odds__ladbrokes__asian_handicap_home_team__full_time_goals', float),
        ('odds__ladbrokes__asian_handicap_away_team__full_time_goals', float),
        ('odds__ladbrokes__size_of_asian_handicap_home_team__full_time_goals', object),
        ('odds__pinnacle__home_win__full_time_goals', float),
        ('odds__pinnacle__draw__full_time_goals', float),
        ('odds__pinnacle__away_win__full_time_goals', float),
        ('odds__pinnacle__over_2.5__full_time_goals', float),
        ('odds__pinnacle__under_2.5__full_time_goals', float),
        ('odds__pinnacle__asian_handicap_home_team__full_time_goals', float),
        ('odds__pinnacle__asian_handicap_away_team__full_time_goals', float),
        ('odds__pinnacle_closing__home_win__full_time_goals', float),
        ('odds__pinnacle_closing__draw__full_time_goals', float),
        ('odds__pinnacle_closing__away_win__full_time_goals', float),
        ('odds__pinnacle_closing__over_2.5__full_time_goals', float),
        ('odds__pinnacle_closing__under_2.5__full_time_goals', float),
        ('odds__pinnacle_closing__asian_handicap_home_team__full_time_goals', float),
        ('odds__pinnacle_closing__asian_handicap_away_team__full_time_goals', float),
        ('odds__sporting__home_win__full_time_goals', float),
        ('odds__sporting__draw__full_time_goals', float),
        ('odds__sporting__away_win__full_time_goals', float),
        ('odds__sportingbet__home_win__full_time_goals', float),
        ('odds__sportingbet__draw__full_time_goals', float),
        ('odds__sportingbet__away_win__full_time_goals', float),
        ('odds__stanjames__home_win__full_time_goals', float),
        ('odds__stanjames__draw__full_time_goals', float),
        ('odds__stanjames__away_win__full_time_goals', float),
        ('odds__stanleybet__home_win__full_time_goals', float),
        ('odds__stanleybet__draw__full_time_goals', float),
        ('odds__stanleybet__away_win__full_time_goals', float),
        ('odds__vcbet__home_win__full_time_goals', float),
        ('odds__vcbet__draw__full_time_goals', float),
        ('odds__vcbet__away_win__full_time_goals', float),
        ('odds__vcbet_closing__home_win__full_time_goals', float),
        ('odds__vcbet_closing__draw__full_time_goals', float),
        ('odds__vcbet_closing__away_win__full_time_goals', float),
        ('odds__williamhill__home_win__full_time_goals', float),
        ('odds__williamhill__draw__full_time_goals', float),
        ('odds__williamhill__away_win__full_time_goals', float),
        ('odds__williamhill_closing__home_win__full_time_goals', float),
        ('odds__williamhill_closing__draw__full_time_goals', float),
        ('odds__williamhill_closing__away_win__full_time_goals', float),
        ('odds__market_maximum__home_win__full_time_goals', float),
        ('odds__market_maximum__draw__full_time_goals', float),
        ('odds__market_maximum__away_win__full_time_goals', float),
        ('odds__market_maximum__over_2.5__full_time_goals', float),
        ('odds__market_maximum__under_2.5__full_time_goals', float),
        ('odds__market_maximum__asian_handicap_home_team__full_time_goals', float),
        ('odds__market_maximum__asian_handicap_away_team__full_time_goals', float),
        ('odds__market_maximum_closing__home_win__full_time_goals', float),
        ('odds__market_maximum_closing__draw__full_time_goals', float),
        ('odds__market_maximum_closing__away_win__full_time_goals', float),
        ('odds__market_maximum_closing__over_2.5__full_time_goals', float),
        ('odds__market_maximum_closing__under_2.5__full_time_goals', float),
        (
            'odds__market_maximum_closing__asian_handicap_home_team__full_time_goals',
            float,
        ),
        (
            'odds__market_maximum_closing__asian_handicap_away_team__full_time_goals',
            float,
        ),
        ('odds__market_average__home_win__full_time_goals', float),
        ('odds__market_average__draw__full_time_goals', float),
        ('odds__market_average__away_win__full_time_goals', float),
        ('odds__market_average__over_2.5__full_time_goals', float),
        ('odds__market_average__under_2.5__full_time_goals', float),
        ('odds__market_average__asian_handicap_home_team__full_time_goals', float),
        ('odds__market_average__asian_handicap_away_team__full_time_goals', float),
        ('odds__market_average_closing__home_win__full_time_goals', float),
        ('odds__market_average_closing__draw__full_time_goals', float),
        ('odds__market_average_closing__away_win__full_time_goals', float),
        ('odds__market_average_closing__over_2.5__full_time_goals', float),
        ('odds__market_average_closing__under_2.5__full_time_goals', float),
        (
            'odds__market_average_closing__asian_handicap_home_team__full_time_goals',
            float,
        ),
        (
            'odds__market_average_closing__asian_handicap_away_team__full_time_goals',
            float,
        ),
        ('odds__market_average__size_of_handicap_home_team__full_time_goals', object),
        (
            'odds__market_average_closing__size_of_asian_handicap_home_team__full_time_goals',
            object,
        ),
        ('target__home_team__full_time_goals', int),
        ('target__away_team__full_time_goals', int),
        ('target__home_team__half_time_goals', int),
        ('target__away_team__half_time_goals', int),
        ('target__home_team__shots', int),
        ('target__away_team__shots', int),
        ('target__home_team__shots_on_target', int),
        ('target__away_team__shots_on_target', int),
        ('target__home_team__hit_woodork', int),
        ('target__away_team__hit_woodork', int),
        ('target__home_team__corners', int),
        ('target__away_team__corners', int),
        ('target__home_team__fouls_committed', int),
        ('target__away_team__fouls_committed', int),
        ('target__home_team__free_kicks_conceded', int),
        ('target__away_team__free_kicks_conceded', int),
        ('target__home_team__offsides', int),
        ('target__away_team__offsides', int),
        ('target__home_team__yellow_cards', int),
        ('target__away_team__yellow_cards', int),
        ('target__home_team__red_cards', int),
        ('target__away_team__red_cards', int),
        ('target__home_team__bookings_points', float),
        ('target__away_team__bookings_points', float),
    ]

    # Re-export the module-level ``OUTPUTS`` as a class attribute.
    OUTPUTS = OUTPUTS

    @_ClassProperty
    def PARAMS(cls):
        """Grid of all available parameters, as returned by ``_get_params``."""
        return _get_params()

    @staticmethod
    def _standardize_columns(data):
        """Drop the unused raw columns and rename the rest via ``COLS_MAPPING``."""
        dropped = [
            col for col in data.columns if 'Unnamed' in col or col in REMOVED_COLS
        ]
        return data.drop(columns=dropped).rename(columns=COLS_MAPPING)

    # NOTE(review): ``lru_cache`` on a method keys the cache on ``self`` and
    # keeps every instance (and its data) alive for the life of the process.
    # It is kept because repeated calls on the same instance are expected to
    # reuse the downloaded data.
    @lru_cache
    def _get_data(self):
        """Download, clean and combine the training and fixtures data.

        Returns
        -------
        data : DataFrame
            Training rows (``fixtures`` is ``False``) for every combination of
            the parameter grid, followed by the fixtures rows (``fixtures`` is
            ``True``), with standardized column names and sorted by league,
            division and year.
        """
        self._check_param_grid()

        # Training data
        data_container = []
        urls = _param_grid_to_csv_urls(self.param_grid_)
        for params, url in track(urls, description='Football-Data.co.uk:'):
            data = _read_csv(url).replace('#REF!', np.nan)
            try:
                data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
            except ValueError:
                # Some files do not use four-digit years. Parse them day-first
                # (matching the primary format) so that ambiguous dates such
                # as 01/02/05 are never read as month-first.
                data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
            if url.split('/')[-2] != 'new':
                # One file per season and division: the parameters apply to
                # every row.
                data = data.assign(
                    league=params['league'],
                    division=params['division'],
                    year=params['year'],
                    fixtures=False,
                )
            else:
                # One file for all seasons: derive the year of each row from
                # its season and keep only the requested year.
                data = data.assign(
                    league=params['league'], division=params['division'], fixtures=False
                )
                data['year'] = data['Season'].apply(
                    lambda season: int(season.split('/')[-1])
                    if isinstance(season, str)
                    else season + 1
                )
                data = data[data.year == params['year']]
            data_container.append(self._standardize_columns(data))

        # Fixtures data
        # The URL is joined with '/' explicitly: ``os.path.join`` would insert
        # a backslash on Windows.
        data = _read_csv(f'{URL}/fixtures.csv')
        data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y')
        data = data.dropna(axis=0, how='any', subset=['Div', 'HomeTeam', 'AwayTeam'])
        data['fixtures'] = True
        inv_leagues_mapping = {v[0]: k for k, v in LEAGUES_MAPPING.items()}
        data['league'] = data['Div'].apply(lambda div: inv_leagues_mapping[div[:-1]])
        # The last character of ``Div`` is the division code. It is converted
        # with the same rule used for the training data: codes of leagues that
        # start at '0' are shifted by one and the code 'C' becomes division 5.
        data['division'] = pd.Series(
            [
                5
                if div[-1] == 'C'
                else int(div[-1]) + int('0' in LEAGUES_MAPPING[league][1:])
                for div, league in zip(data['Div'], data['league'])
            ],
            index=data.index,
            dtype=int,
        )
        # Fixtures get the latest available year of their league and division.
        years = (
            pd.DataFrame(self.PARAMS).groupby(['league', 'division']).max()
        ).reset_index()
        data = pd.merge(data, years, how='left')
        data_container.append(self._standardize_columns(data))

        # Combine data
        data = pd.concat(data_container, ignore_index=True)
        return data.sort_values(['league', 'division', 'year'], ignore_index=True)