"""
Download and transform historical and fixtures data
for various leagues from Football-Data.co.uk and FiveThirtyEight.
Football-Data.co.uk: http://www.football-data.co.uk/data.php
FiveThirtyEight: https://github.com/fivethirtyeight/data/tree/master/soccer-spi
"""
# Author: Georgios Douzas <gdouzas@icloud.com>
# License: MIT
from functools import lru_cache
import pandas as pd
from sklearn.model_selection import ParameterGrid
from .._base import _BaseDataLoader
from ._utils import OUTCOMES
from ._fd import FDSoccerDataLoader
from ._fte import FTESoccerDataLoader
class SoccerDataLoader(_BaseDataLoader):
    """Dataloader for soccer data from combining all data sources.

    It downloads historical and fixtures data from
    `Football-Data.co.uk <http://www.football-data.co.uk/data.php>`_ and
    `FiveThirtyEight <https://github.com/fivethirtyeight/data/tree/master/soccer-spi>`_.
    The data are combined in a consistent way.

    Read more in the :ref:`user guide <user_guide>`.

    Parameters
    ----------
    param_grid : dict of str to sequence, or sequence of such parameter, default=None
        It selects the type of information that the data include. The keys of
        dictionaries might be parameters like ``'league'`` or ``'division'`` while
        the values are sequences of allowed values. It works in a similar way as the
        ``param_grid`` parameter of the :class:`~sklearn.model_selection.ParameterGrid`
        class. The default value ``None`` corresponds to all parameters.

    Examples
    --------
    >>> from sportsbet.datasets import SoccerDataLoader
    >>> import pandas as pd
    >>> # Get all available parameters to select the training data
    >>> pd.DataFrame(SoccerDataLoader.get_all_params()).sort_values(
    ... ['league', 'year', 'division']).reset_index(drop=True)
    division league year
    0 1 Argentina 2013
    1 1 Argentina 2014
    2 1 Argentina 2015
    3 1 Argentina 2016
    4 1 Argentina 2017
    ...
    >>> # Select only the training data for the English and Spanish leagues of year 2020
    >>> dataloader = SoccerDataLoader(
    ... param_grid={'league': ['England', 'Spain'], 'year':[2020]})
    >>> # Get available odds types
    >>> dataloader.get_odds_types()
    [..., 'market_average', ...]
    >>> # Select the market average odds and drop columns with missing values
    >>> X_train, Y_train, O_train = dataloader.extract_train_data(
    ... odds_type='market_average', drop_na_thres=1.0)
    Football-Data.co.uk...
    >>> # Extract the corresponding fixtures data
    >>> X_fix, Y_fix, O_fix = dataloader.extract_fixtures_data()
    >>> # Training and fixtures input and odds data have the same column names
    >>> pd.testing.assert_index_equal(X_train.columns, X_fix.columns)
    >>> pd.testing.assert_index_equal(O_train.columns, O_fix.columns)
    >>> # Fixtures data never include output
    >>> Y_fix is None
    True
    >>> # Odds data include the selected market average odds
    >>> O_train
    market_average__away_win__odds ... market_average__under_2.5__odds
    0 3.19 ... 1.76
    ...
    """

    # Team-name translation table applied to the FiveThirtyEight frame in
    # ``_get_data`` before it is merged with the Football-Data.co.uk frame.
    # Keys are looked up verbatim; names without an entry are left unchanged.
    # NOTE(review): a few keys ('AvaÃ\xad', 'BahÃ\xada', 'Ã\x96stersunds FK',
    # 'VÃ\xadtoria') look like mis-decoded UTF-8. They are kept as-is because
    # they presumably match the raw strings in the downloaded feed -- confirm
    # before "repairing" them.
    _names_mapping = {
        'SK Austria Klagenfurt': 'A. Klagenfurt',
        'AEK Athens': 'AEK',
        'AaB': 'Aalborg',
        'AGF Aarhus': 'Aarhus',
        'Accrington Stanley': 'Accrington',
        'Adana Demirspor': 'Ad. Demirspor',
        'FC Trenkwalder Admira': 'Admira',
        'AC Ajaccio': 'Ajaccio',
        'GFC Ajaccio': 'Ajaccio GFCO',
        'Akhisar Belediye': 'Akhisar Belediyespor',
        'Terek Grozny': 'Akhmat Grozny',
        'Alavés': 'Alaves',
        'AD Alcorcon': 'Alcorcon',
        'Cashpoint SC Rheindorf Altach': 'Altach',
        'América Mineiro': 'America MG',
        'Amkar Perm': 'Amkar',
        'Apollon Smyrni': 'Apollon',
        'Argentinos Juniors': 'Argentinos Jrs',
        'Aris Salonika': 'Aris',
        'FC Arouca': 'Arouca',
        'FC Arsenal Tula': 'Arsenal Tula',
        'Athletic Bilbao': 'Ath Bilbao',
        'Atletico Madrid': 'Ath Madrid',
        'Atlético San Luis': 'Atl. San Luis',
        'Atlético Tucumán': 'Atl. Tucuman',
        'Atlanta United FC': 'Atlanta United',
        'Atletico Mineiro': 'Atletico-MG',
        'Atlético Paranaense': 'Atletico-PR',
        'FC Augsburg': 'Augsburg',
        'FK Austria Vienna': 'Austria Vienna',
        'AvaÃ\xad': 'Avai',
        'BahÃ\xada': 'Bahia',
        'FC Barcelona II': 'Barcelona B',
        'KFCO Beerschot-Wilrijk': 'Beerschot VA',
        'Guizhou Renhe': 'Beijing Renhe',
        'Belgrano Cordoba': 'Belgrano',
        'Real Betis': 'Betis',
        'Beziers AS': 'Beziers',
        'Arminia Bielefeld': 'Bielefeld',
        'VfL Bochum': 'Bochum',
        'Botafogo': 'Botafogo RJ',
        'Bourg-Peronnas': 'Bourg Peronnas',
        'AFC Bournemouth': 'Bournemouth',
        'Bradford City': 'Bradford',
        'SK Brann': 'Brann',
        'Eintracht Braunschweig': 'Braunschweig',
        'Brighton and Hove Albion': 'Brighton',
        'Bristol Rovers': 'Bristol Rvs',
        'Istanbul Basaksehir': 'Buyuksehyr',
        'Cambridge United': 'Cambridge',
        'Cambuur Leeuwarden': 'Cambuur',
        'Cardiff City': 'Cardiff',
        'Carlisle United': 'Carlisle',
        'FC Cartagena': 'Cartagena',
        'Ceará': 'Ceara',
        'Celta Vigo': 'Celta',
        'Central Córdoba Santiago del Estero': 'Central Cordoba',
        'Chambly Thelle FC': 'Chambly',
        'Chapecoense AF': 'Chapecoense-SC',
        'Sporting de Charleroi': 'Charleroi',
        'Charlton Athletic': 'Charlton',
        'Cheltenham Town': 'Cheltenham',
        'Chiapas FC': 'Chiapas',
        'Chievo Verona': 'Chievo',
        'Chongqing Lifan': 'Chongqing Liangjiang Athletic',
        'Clermont Foot': 'Clermont',
        'Club América': 'Club America',
        'Tijuana': 'Club Tijuana',
        'Montreal Impact': 'Club de Foot Montreal',
        'Colchester United': 'Colchester',
        'Colon Santa Fe': 'Colon Santa FE',
        'Coventry City': 'Coventry',
        'Crewe Alexandra': 'Crewe',
        'Dalian Aerbin': 'Dalian Yifang F.C.',
        'Dalkurd FF': 'Dalkurd',
        'SV Darmstadt 98': 'Darmstadt',
        'Degerfors IF': 'Degerfors',
        'ADO Den Haag': 'Den Haag',
        'Derby County': 'Derby',
        'Dijon FCO': 'Dijon',
        'Djurgardens IF': 'Djurgarden',
        'Doncaster Rovers': 'Doncaster',
        'Borussia Dortmund': 'Dortmund',
        'Dynamo Dresden': 'Dresden',
        'MSV Duisburg': 'Duisburg',
        'Dundee Utd': 'Dundee United',
        'Dinamo Moscow': 'Dynamo Moscow',
        'Eintracht Frankfurt': 'Ein Frankfurt',
        'IF Elfsborg': 'Elfsborg',
        'Erzurumspor': 'Erzurum BB',
        'Espanyol': 'Espanol',
        'Estoril Praia': 'Estoril',
        'Estudiantes': 'Estudiantes L.P.',
        'Exeter City': 'Exeter',
        'Emmen': 'FC Emmen',
        'FC Cologne': 'FC Koln',
        'Anzhi Makhachkala': 'FK Anzi Makhackala',
        'Krylia Sovetov': 'FK Krylya Sovetov Samara',
        'Rostov': 'FK Rostov',
        'Falkenbergs FF': 'Falkenbergs',
        'SC Farense': 'Farense',
        'Flamengo': 'Flamengo RJ',
        'Fortuna Sittard': 'For Sittard',
        'Forest Green Rovers': 'Forest Green',
        'Fortuna Düsseldorf': 'Fortuna Dusseldorf',
        'SC Freiburg': 'Freiburg',
        'Gazisehir Gaziantep': 'Gaziantep',
        'KAA Gent': 'Gent',
        'Gimnasia La Plata': 'Gimnasia L.P.',
        'Gimnástic Tarragona': 'Gimnastic',
        'Girona FC': 'Girona',
        'Goiás': 'Goias',
        'IFK Goteborg': 'Goteborg',
        'Goztepe': 'Goztep',
        'De Graafschap': 'Graafschap',
        'Grasshoppers Zürich': 'Grasshoppers',
        'Grêmio': 'Gremio',
        'SpVgg Greuther Fürth': 'Greuther Furth',
        'Grimsby Town': 'Grimsby',
        'FC Groningen': 'Groningen',
        'Guadalajara': 'Guadalajara Chivas',
        'Guangzhou RF': 'Guangzhou R&F',
        'Guizhou Hengfeng Zhicheng': 'Guizhou Zhicheng',
        'BK Hacken': 'Hacken',
        'Halmstads BK': 'Halmstad',
        'Hamburg SV': 'Hamburg',
        'Hamilton Academical': 'Hamilton',
        'Hannover 96': 'Hannover',
        'Harrogate Town': 'Harrogate',
        'Hebei China Fortune FC': 'Hebei',
        '1. FC Heidenheim 1846': 'Heidenheim',
        'Helsingborgs IF': 'Helsingborg',
        'Henan Jianye': 'Henan Songshan Longmen',
        'Hertha Berlin': 'Hertha',
        'Hobro IK': 'Hobro',
        'TSG Hoffenheim': 'Hoffenheim',
        'Consadole Sapporo': 'Hokkaido Consadole Sapporo',
        'AC Horsens': 'Horsens',
        'Huddersfield Town': 'Huddersfield',
        'SD Huesca': 'Huesca',
        'Hull City': 'Hull',
        'Huracán': 'Huracan',
        'UD Ibiza': 'Ibiza',
        'CA Independiente': 'Independiente',
        'FC Ingolstadt 04': 'Ingolstadt',
        'Internazionale': 'Inter',
        'Inter Miami CF': 'Inter Miami',
        'Ionikos FC': 'Ionikos',
        'Ipswich Town': 'Ipswich',
        'Jubilo Iwata': 'Iwata',
        'Jiangsu Suning FC': 'Jiangsu Suning',
        'Jonkopings Sodra IF': 'Jonkopings',
        'FC Juárez': 'Juarez',
        '1. FC Kaiserslautern': 'Kaiserslautern',
        'Kalmar FF': 'Kalmar',
        'Karabükspor': 'Karabukspor',
        'Fatih Karagümrük': 'Karagumruk',
        'Karlsruher SC': 'Karlsruhe',
        'FC Khimki': 'Khimki',
        'KV Kortrijk': 'Kortrijk',
        'FC Krasnodar': 'Krasnodar',
        'Kristiansund BK': 'Kristiansund',
        'LASK Linz': 'LASK',
        'Deportivo La Coruña': 'La Coruna',
        'Larissa': 'Larisa',
        'Lausanne Sports': 'Lausanne',
        'Leeds United': 'Leeds',
        'Leicester City': 'Leicester',
        'Cultural Leonesa': 'Leonesa',
        'Levadiakos': 'Levadeiakos',
        'Bayer Leverkusen': 'Leverkusen',
        'Lincoln City': 'Lincoln',
        'Lobos de la BUAP': 'Lobos BUAP',
        'KSC Lokeren': 'Lokeren',
        'La Hoya Lorca': 'Lorca',
        'FC Lugano': 'Lugano',
        'Luton Town': 'Luton',
        'FC Luzern': 'Luzern',
        'Borussia Monchengladbach': "M'gladbach",
        '1. FC Magdeburg': 'Magdeburg',
        'Málaga': 'Malaga',
        'Manchester City': 'Man City',
        'Manchester United': 'Man United',
        'Mansfield Town': 'Mansfield',
        'SV Mattersburg': 'Mattersburg',
        'Mazatlán FC': 'Mazatlan FC',
        'KV Mechelen': 'Mechelen',
        'FC Midtjylland': 'Midtjylland',
        'AC Milan': 'Milan',
        'Minnesota United FC': 'Minnesota United',
        'AS Monaco': 'Monaco',
        'Morelia': 'Monarcas',
        'Mouscron-Peruwelz': 'Mouscron',
        'NAC': 'NAC Breda',
        'C.D. Nacional': 'Nacional',
        'Nagoya Grampus Eight': 'Nagoya Grampus',
        'AS Nancy Lorraine': 'Nancy',
        'New York City FC': 'New York City',
        "Newell's Old Boys": 'Newells Old Boys',
        'NEC': 'Nijmegen',
        'FK Nizhny Novgorod': 'Nizhny Novgorod',
        'FC Nordsjaelland': 'Nordsjaelland',
        'IFK Norrkoping': 'Norrkoping',
        'Northampton Town': 'Northampton',
        'Norwich City': 'Norwich',
        'Nottingham Forest': "Nott'm Forest",
        '1. FC Nürnberg': 'Nurnberg',
        'Odd BK': 'Odd',
        'Odense BK': 'Odense',
        'Oldham Athletic': 'Oldham',
        'Olimpo': 'Olimpo Bahia Blanca',
        'Olympiacos': 'Olympiakos',
        'KV Oostende': 'Oostende',
        'Orebro SK': 'Orebro',
        'Gazovik Orenburg': 'Orenburg',
        'Orlando City SC': 'Orlando City',
        'Orléans': 'Orleans',
        'VfL Osnabruck': 'Osnabruck',
        'Ã\x96stersunds FK': 'Ostersunds',
        'OH Leuven': 'Oud-Heverlee Leuven',
        'Real Oviedo': 'Oviedo',
        'Oxford United': 'Oxford',
        'PAOK Salonika': 'PAOK',
        'SC Paderborn': 'Paderborn',
        'Paraná': 'Parana',
        'Paris Saint-Germain': 'Paris SG',
        'Partick Thistle': 'Partick',
        'Pau': 'Pau FC',
        'US Pescara': 'Pescara',
        'Peterborough United': 'Peterboro',
        'Plymouth Argyle': 'Plymouth',
        'SD Ponferradina': 'Ponferradina',
        'Pordenone Calcio': 'Pordenone',
        'FC Porto': 'Porto',
        'Preston North End': 'Preston',
        'Querétaro': 'Queretaro',
        'US Quevilly': 'Quevilly Rouen',
        'FK Volgograd': 'R. Volgograd',
        'Red Star FC 93': 'Red Star',
        'Jahn Regensburg': 'Regensburg',
        'Stade Rennes': 'Rennes',
        'SV Ried': 'Ried',
        'Caykur Rizespor': 'Rizespor',
        'Roda JC': 'Roda',
        'AS Roma': 'Roma',
        'Rotherham United': 'Rotherham',
        'Energiya Khabarovsk': 'SKA Khabarovsk',
        'CD Sabadell': 'Sabadell',
        'Salford City': 'Salford',
        'FC Salzburg': 'Salzburg',
        'San Martin San Juan': 'San Martin S.J.',
        'San Martin de Tucuman': 'San Martin T.',
        'SV Sandhausen': 'Sandhausen',
        'Racing Santander': 'Santander',
        'São Paulo': 'Sao Paulo',
        'Sarpsborg': 'Sarpsborg 08',
        'Seattle Sounders FC': 'Seattle Sounders',
        'RFC Seraing': 'Seraing',
        'Vitoria Setubal': 'Setubal',
        'Sevilla FC': 'Sevilla',
        'Sevilla Atletico': 'Sevilla B',
        'Shanghai Greenland': 'Shanghai Shenhua',
        'Sheffield Wednesday': 'Sheffield Weds',
        'Shenzhen FC': 'Shenzhen',
        'Shrewsbury Town': 'Shrewsbury',
        'FC Sion': 'Sion',
        'IK Sirius': 'Sirius',
        'Real Sociedad': 'Sociedad',
        'Real Sociedad II': 'Sociedad B',
        'Southend United': 'Southend',
        'Braga': 'Sp Braga',
        'Sporting Gijón': 'Sp Gijon',
        'Sparta': 'Sparta Rotterdam',
        'FC St. Pauli': 'St Pauli',
        'St. Truidense': 'St Truiden',
        'St Gallen': 'St. Gallen',
        'Union Saint Gilloise': 'St. Gilloise',
        'St. Pölten': 'St. Polten',
        'Standard Liege': 'Standard',
        'IK Start': 'Start',
        'Stoke City': 'Stoke',
        'SK Sturm Graz': 'Sturm Graz',
        'VfB Stuttgart': 'Stuttgart',
        'GIF Sundsvall': 'Sundsvall',
        'Sutton United': 'Sutton',
        'Swansea City': 'Swansea',
        'Swindon Town': 'Swindon',
        'Talleres de Córdoba': 'Talleres Cordoba',
        'FC Tambov': 'Tambov',
        'Tianjin Quanujian': 'Tianjin Quanjian',
        'Tianjin Teda': 'Tianjin Tianhai',
        'WSG Swarovski Wattens': 'Tirol',
        'Tokushima Vortis': 'Tokushima',
        'FC Tosno': 'Tosno',
        'Tottenham Hotspur': 'Tottenham',
        'Tranmere Rovers': 'Tranmere',
        'Trelleborgs FF': 'Trelleborgs',
        'FC Twente': 'Twente',
        'Tigres UANL': 'U.A.N.L.- Tigres',
        'FC Ufa': 'Ufa',
        '1. FC Union Berlin': 'Union Berlin',
        'Union Santa Fe': 'Union de Santa Fe',
        'Urawa Red Diamonds': 'Urawa Reds',
        'FC Utrecht': 'Utrecht',
        'FC Vaduz': 'Vaduz',
        'Real Valladolid': 'Valladolid',
        'Rayo Vallecano': 'Vallecano',
        'Varbergs BoIS FC': 'Varbergs',
        'Vasco da Gama': 'Vasco',
        'F.B.C Unione Venezia': 'Venezia',
        'Viking FK': 'Viking',
        'VÃ\xadtoria': 'Vitoria',
        'FC Wacker Innsbruck': 'Wacker Innsbruck',
        'SV Zulte Waregem': 'Waregem',
        'SV Wehen Wiesbaden': 'Wehen',
        'West Bromwich Albion': 'West Brom',
        'West Ham United': 'West Ham',
        # NOTE(review): this shares its target with 'VfL Wolfsburg' below, yet
        # Wolfsberger AC appears to be a different (Austrian) club. Looks like a
        # fuzzy-match slip that would stop its fixtures from merging -- confirm
        # the Football-Data.co.uk spelling before changing the value.
        'Wolfsberger AC': 'Wolfsburg',
        'VfL Wolfsburg': 'Wolfsburg',
        'Wuhan Zall': 'Wuhan FC',
        'Würzburger Kickers': 'Wurzburger Kickers',
        'Wycombe Wanderers': 'Wycombe',
        'Neuchatel Xamax': 'Xamax',
        'FC Xanthi': 'Xanthi',
        'Matsumoto Yamaga FC': 'Yamaga',
        'Yeovil Town': 'Yeovil',
        'Real Zaragoza': 'Zaragoza',
        'FC Zurich': 'Zurich',
        'PEC Zwolle': 'Zwolle',
    }

    @classmethod
    def _get_schema(cls):
        """Return the de-duplicated union of both source dataloaders' schemas.

        Returns
        -------
        schema : list
            Every entry of ``FDSoccerDataLoader._get_schema()`` followed by the
            entries of ``FTESoccerDataLoader._get_schema()`` that were not
            already present, in first-seen order.
        """
        combined = FDSoccerDataLoader._get_schema() + FTESoccerDataLoader._get_schema()
        # dict.fromkeys removes duplicates like set() does but keeps insertion
        # order, so the result is identical from one interpreter run to the next.
        return list(dict.fromkeys(combined))

    @classmethod
    def _get_outcomes(cls):
        """Return the ``OUTCOMES`` object imported from ``._utils``."""
        return OUTCOMES

    @classmethod
    @lru_cache
    def _get_params(cls):
        """Return the combined parameter grid of both data sources.

        The parameter grids reported by the Football-Data.co.uk and the
        FiveThirtyEight dataloaders are concatenated, expanded to individual
        parameter combinations and de-duplicated. The result is memoised per
        class through ``lru_cache``.

        Returns
        -------
        params : :class:`~sklearn.model_selection.ParameterGrid`
            A grid made of one single-valued sub-grid per distinct combination.
        """
        merged_grid = ParameterGrid(
            FDSoccerDataLoader.get_all_params().param_grid
            + FTESoccerDataLoader.get_all_params().param_grid
        )
        # Expanding to a frame lets pandas discard combinations that both
        # sources report.
        unique_records = (
            pd.DataFrame(merged_grid).drop_duplicates().to_dict('records')
        )
        # Wrap every value in a list so each record is itself a valid grid.
        return ParameterGrid(
            [
                {name: [val] for name, val in record.items()}
                for record in unique_records
            ]
        )

    @staticmethod
    def _check(dataloader):
        """Run the validation steps of ``dataloader`` and return it.

        The checks are called in a fixed order, with ``None`` passed as both the
        ``drop_na_thres`` and the ``odds_type`` argument. The checks themselves
        are defined outside this class; ``_get_data`` only relies on the
        ``data_`` attribute being available on the returned object.

        Parameters
        ----------
        dataloader : object
            A dataloader instance exposing the ``_check_*`` methods used below.

        Returns
        -------
        dataloader : object
            The same instance that was passed in.
        """
        dataloader._check_param_grid()
        dataloader._check_schema()
        dataloader._check_outcomes()
        dataloader._check_drop_na_thres(None)
        dataloader._check_odds_type(None)
        dataloader._check_data()
        return dataloader

    def _get_data(self):
        """Return the merged Football-Data.co.uk and FiveThirtyEight data.

        Both source dataloaders are created with this instance's ``param_grid``
        and validated through ``_check``. The FiveThirtyEight team names are
        translated with ``_names_mapping`` and the two frames are merged.

        The result is memoised on the instance. A module-level ``lru_cache``
        keyed on ``self`` would keep every dataloader, and its data, alive for
        the lifetime of the cache.

        Returns
        -------
        data : :class:`pandas.DataFrame`
            The merged data; repeated calls return the same object.
        """
        try:
            return self._merged_data_cache
        except AttributeError:
            pass
        fd_data = self._check(FDSoccerDataLoader(self.param_grid)).data_
        fte_data = self._check(FTESoccerDataLoader(self.param_grid)).data_
        names_mapping = self._names_mapping
        for col in ('home_team', 'away_team'):
            # Names without a mapping entry are kept unchanged.
            fte_data[col] = fte_data[col].apply(
                lambda name: names_mapping.get(name, name)
            )
        # With no ``on``/``how`` arguments pandas performs an inner join on all
        # the columns the two frames have in common.
        data = pd.merge(fd_data.reset_index(), fte_data.reset_index())
        self._merged_data_cache = data
        return data