# coding: utf-8
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed") # suppress a spurious numpy warning, see https://github.com/numpy/numpy/pull/432
import pandas as pd
import numpy as np
import IPython.display as dsp
import matplotlib.dates as mdates
from pyqstrat.pq_utils import *
from pyqstrat.plot import *
def _sort_ohlcv(a):
l = ['o', 'h', 'l', 'c', 'v']
if a in l:
return l.index(a)
else:
return -1
class MarketData:
    '''Used to store OHLCV bars. You must supply at least dates and close prices; all other fields are optional.
Attributes:
dates: A numpy datetime array with the datetime for each bar. Must be monotonically increasing.
c: A numpy float array with close prices for the bar.
        o: A numpy float array with open prices for the bar
        h: A numpy float array with high prices for the bar
        l: A numpy float array with low prices for the bar
v: A numpy integer array with volume for the bar
'''
    def __init__(self, dates, c, o = None, h = None, l = None, v = None):
'''Zeroes in o, h, l, c are set to nan'''
assert(len(dates) > 1)
assert(len(c) == len(dates))
assert(o is None or len(o) == len(dates))
assert(h is None or len(h) == len(dates))
assert(l is None or len(l) == len(dates))
assert(v is None or len(v) == len(dates))
        if not np.all(np.diff(dates).astype(np.float64) > 0): # check that dates are strictly monotonically increasing
            raise Exception('marketdata dates must be unique and monotonically increasing')
self.dates = dates
self.o = zero_to_nan(o)
self.h = zero_to_nan(h)
self.l = zero_to_nan(l)
self.c = zero_to_nan(c)
self.v = v
self._set_valid_rows()
    def _set_valid_rows(self):
        # o, h and l may be None since only dates and closes are required, so only check the columns we have
        col_list = [col for col in [self.o, self.h, self.l, self.c] if col is not None]
        nans = np.any(np.isnan(col_list), axis = 0)
        self.valid_rows = ~nans
    def valid_row(self, i):
'''Return True if the row with index i has no nans in it.'''
return self.valid_rows[i]
    def resample(self, sampling_frequency, inplace = False):
'''
Downsample the OHLCV data into a new bar frequency
Args:
            sampling_frequency: A pandas frequency string, e.g. "5min" or "1D". See the pandas resampling documentation.
            inplace: If False (the default), return a new object instead of modifying this one.
'''
if sampling_frequency is None: return self
df = self.df()
orig_columns = df.columns
        # build the aggregation map from the columns actually present; o, h, l and v are optional
        agg_funcs = {'o': 'first', 'h': 'max', 'l': 'min', 'c': 'last', 'v': 'sum'}
        df = df.resample(sampling_frequency).agg({col: agg_funcs[col] for col in orig_columns if col in agg_funcs}).dropna(how = 'all')
if not inplace:
md = MarketData(self.dates, self.c, self.o, self.h, self.l, self.v)
else:
md = self
for col in ['o', 'h', 'l', 'c', 'v']:
if col in orig_columns: setattr(md, col, df[col].values)
md.dates = df.index.values
md._set_valid_rows()
return md
    def errors(self, display = True):
        '''Returns a dataframe of rows with bad data: highs that are lower than the open or close,
        lows that are higher than the open or close, plus any negative ohlcv values
        '''
df = self.df()
errors_list = []
if 'h' in df.columns:
            bad_highs = df[(df.h < df.c) | (df.h < df.o)].copy()
if len(bad_highs):
bad_highs.insert(len(df.columns), 'error', 'bad high')
errors_list.append(bad_highs)
if 'l' in df.columns:
            bad_lows = df[(df.l > df.c) | (df.l > df.o)].copy()
if len(bad_lows):
bad_lows.insert(len(df.columns), 'error', 'bad low')
errors_list.append(bad_lows)
        # flag negative values in any of the ohlcv columns present
        neg_values_mask = pd.Series(False, index = df.index)
        for col in ['o', 'h', 'l', 'c', 'v']:
            if col in df.columns:
                neg_values_mask |= (df[col] < 0)
        neg_values = df[neg_values_mask].copy()
if len(neg_values):
neg_values.insert(len(df.columns), 'error', 'negative values')
errors_list.append(neg_values)
if not len(errors_list): return None
df = pd.concat(errors_list)
if display: dsp.display(df)
return df
    def warnings(self, warn_std = 10, display = True):
'''Returns a dataframe indicating any values where the bar over bar change is more than warn_std standard deviations.
Args:
warn_std: Number of standard deviations to use as a threshold (default 10)
display: Whether to print out the warning dataframe as well as returning it
'''
df = self.df()
warnings_list = []
for col in ['o', 'h', 'l', 'c']:
if col in df.columns:
                ret = np.abs(df[col].pct_change())
                std = ret.std()
                mask = ret > warn_std * std
                if mask.any():
                    # include the row before each flagged row so both values behind the return are visible
                    double_mask = mask | mask.shift(-1, fill_value = False)
                    df_tmp = df[double_mask].copy()
                    df_tmp.insert(len(df_tmp.columns), 'ret', ret[mask])
                    df_tmp.insert(len(df_tmp.columns), 'warning', '{} ret > {} std: {}'.format(col, warn_std, round(std, 6)))
                    warnings_list.append(df_tmp)
if not len(warnings_list): return None
df = pd.concat(warnings_list)
if display: dsp.display(df)
return df
    def overview(self, display = True):
'''Returns a dataframe showing basic information about the data, including count, number and percent missing, min, max
Args:
            display: Whether to print out the overview dataframe as well as returning it
'''
df = self.df().reset_index()
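        # reset_index so the date column is included in the missing value and min / max stats below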
df_overview = pd.DataFrame({'count': len(df), 'num_missing' : df.isnull().sum(), 'pct_missing': df.isnull().sum() / len(df), 'min' : df.min(), 'max' : df.max()})
df_overview = df_overview.T
columns = sorted(list(df_overview.columns), key = _sort_ohlcv)
df_overview = df_overview[columns]
if display: dsp.display(df_overview)
return df_overview
    def time_distribution(self, frequency = '15 minutes', display = True, plot = True, figsize = None):
'''
Return a dataframe with the time distribution of the bars
Args:
frequency: The width of each bin (default "15 minutes"). You can use hours or days as well.
display: Whether to display the data in addition to returning it.
plot: Whether to plot the data in addition to returning it.
figsize: If plot is set, optional figure size for the plot (default (20,8))
'''
group_col = None
n = int(frequency.split(' ')[0])
freq = frequency.split(' ')[1]
df = self.df().reset_index()
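        # integer divide then multiply to floor each timestamp into the start of its n unit bin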
        if freq in ('minutes', 'mins', 'min'):
            group_col = [df.date.dt.hour, df.date.dt.minute // n * n]
            names = ['hour', 'minute']
        elif freq in ('hours', 'hrs', 'hr'):
            group_col = [df.date.dt.day_name(), df.date.dt.hour // n * n]
            names = ['weekday', 'hour']
        elif freq in ('weekdays', 'days', 'day'):
            group_col = df.date.dt.day_name()
            names = ['weekday']
else:
raise Exception(f'unknown time freq: {freq}')
count = df.groupby(group_col)['c'].count()
tdf = pd.DataFrame({'close_count': count, 'count_pct' : count / df.c.count()})[['close_count', 'count_pct']]
if 'v' in df.columns:
vsum = df.groupby(group_col)['v'].sum()
vdf = pd.DataFrame({'volume' : vsum, 'volume_pct' : vsum / df.v.sum()})[['volume', 'volume_pct']]
tdf = pd.concat([vdf, tdf], axis = 1)
tdf.index.names = names
if display:
dsp.display(tdf)
if plot:
if not figsize: figsize = (20, 8)
cols = ['close_count', 'volume'] if 'v' in df.columns else ['close_count']
if not has_display():
print('no display found, cannot plot time distribution')
return tdf
tdf[cols].plot(figsize = figsize, kind = 'bar', subplots = True, title = 'Time Distribution')
return tdf
    def freq_str(self):
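        '''Return a human readable string describing the inferred bar frequency, e.g. "5.0 minutes"'''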
freq = infer_frequency(self.dates)
if freq < 1:
freq_str = f'{round(freq * 24. * 60, 2)} minutes'
else:
freq_str = f'{freq} days'
return freq_str
    def describe(self, warn_std = 10, time_distribution_frequency = '15 min', print_time_distribution = False):
'''
Describe the bars. Shows an overview, errors and warnings for the bar data. This is a good function to use
before running any backtests on a set of bar data.
Args:
            warn_std: See the warnings function
            time_distribution_frequency: See the time_distribution function
print_time_distribution: Whether to print the time distribution in addition to plotting it.
'''
print(f'Inferred Frequency: {self.freq_str()}')
self.overview()
print('Errors:')
self.errors()
print('Warnings:')
self.warnings(warn_std = warn_std)
print('Time distribution:')
self.time_distribution(display = print_time_distribution, frequency = time_distribution_frequency)
    def is_ohlc(self):
'''
        Returns True if we have all of the open, high, low and close columns
'''
return not (self.o is None or self.h is None or self.l is None or self.c is None)
    def plot(self, figsize = (15,8), date_range = None, sampling_frequency = None, title = 'Price / Volume'):
'''
Plot a candlestick or line plot depending on whether we have ohlc data or just close prices
Args:
figsize: Size of the figure (default (15,8))
date_range: A tuple of strings or numpy datetimes for plotting a smaller sample of the data, e.g. ("2018-01-01", "2018-01-06")
sampling_frequency: Downsample before plotting. See pandas frequency strings for possible values.
title: Title of the graph, default "Price / Volume"
'''
date_range = strtup2date(date_range)
if self.is_ohlc():
data = OHLC('price', self.dates, self.o, self.h, self.l, self.c, self.v)
else:
data = TimeSeries('price', self.dates, self.c)
subplot = Subplot(data)
plot = Plot([subplot], figsize = figsize, date_range = date_range, sampling_frequency = sampling_frequency, title = title)
plot.draw()
    def df(self, start_date = None, end_date = None):
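        '''Return a pandas DataFrame of the bars indexed by date, optionally restricted to start_date and end_date (inclusive)'''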
df = pd.DataFrame({'date' : self.dates, 'c' : self.c}).set_index('date')
for tup in [('o', self.o), ('h', self.h), ('l', self.l), ('v', self.v)]:
if tup[1] is not None: df.insert(0, tup[0], tup[1])
if start_date: df = df[df.index.values >= start_date]
if end_date: df = df[df.index.values <= end_date]
return df
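
# A minimal usage sketch (illustration only, not part of the library API): build a MarketData
# object from close prices alone and downsample it. The 5 minute spacing and the '60min'
# sampling frequency below are arbitrary choices for this example.
def _example_marketdata_usage():
    dates = np.arange(np.datetime64('2018-01-02 09:00'), np.datetime64('2018-01-02 16:00'), np.timedelta64(5, 'm'))
    c = np.round(100 * np.cumprod(1 + np.random.normal(size = len(dates)) / 1000), 2)
    md = MarketData(dates, c) # o, h, l, v are optional; only dates and close prices are required
    return md.resample('60min') # aggregate the 5 minute bars into hourly bars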
def roll_futures(md, date_func, condition_func, expiries = None, return_full_df = False):
'''Construct a continuous futures dataframe with one row per datetime given rolling logic
Args:
md: A dataframe containing the columns 'date', 'series', and any other market data, for example, ohlcv data. Date can contain time for sub-daily bars.
The series column must contain a different string name for each futures series, e.g. SEP2018, DEC2018, etc.
date_func: A function that takes the market data object as an input and returns a numpy array of booleans
True indicates that the future should be rolled on this date if the condition specified in condition_func is met.
This function can assume that we have all the columns in the original market data object plus the same columns suffixed with _next for the potential series
to roll over to.
condition_func: A function that takes the market data object as input and returns a numpy array of booleans.
True indicates that we should try to roll the future at that row.
        expiries: An optional pandas Series mapping each series name (the index) to that future's expiry date.
            If you don't pass this in, the function will assume that the expiry column is present in the original dataframe.
        return_full_df: If set, will return the dataframe without removing extra dates so you can use your own logic for rolling,
            including the _next columns and the roll flag
Returns:
A pandas DataFrame with one row per date, which contains the columns in the original md DataFrame and the same columns suffixed with _next
representing the series we want to roll to. There is also a column called roll_flag which is set to True whenever
the date and roll condition functions are met.
>>> md = pd.DataFrame({'date' : np.concatenate((np.arange(np.datetime64('2018-03-11'), np.datetime64('2018-03-16')),
... np.arange(np.datetime64('2018-03-11'), np.datetime64('2018-03-16')))),
... 'c' : [10, 10.1, 10.2, 10.3, 10.4] + [10.35, 10.45, 10.55, 10.65, 10.75],
... 'v' : [200, 200, 150, 100, 100] + [100, 50, 200, 250, 300],
... 'series' : ['MAR2018'] * 5 + ['JUN2018'] * 5})[['date','series', 'c', 'v']]
>>> expiries = pd.Series(np.array(['2018-03-15', '2018-06-15'], dtype = 'M8[D]'), index = ['MAR2018', 'JUN2018'], name = "expiry")
>>> date_func = lambda md : md.expiry - md.date <= np.timedelta64(3, 'D')
>>> condition_func = lambda md : md.v_next > md.v
>>> df = roll_futures(md, date_func, condition_func, expiries)
>>> df[df.series == 'MAR2018'].date.max() == np.datetime64('2018-03-14')
True
>>> df[df.series == 'JUN2018'].date.max() == np.datetime64('2018-03-15')
True
'''
if 'date' not in md.columns or 'series' not in md.columns:
        raise Exception(f'date or series not found in columns: {md.columns}')
    if expiries is not None:
        # sort by expiry so the shift below pairs each series with the next one to expire
        expiries = expiries.to_frame(name = 'expiry').sort_values(by = 'expiry')
        md = pd.merge(md, expiries, left_on = ['series'], right_index = True, how = 'left')
    else:
        if 'expiry' not in md.columns: raise Exception('expiry column must be present in market data if expiries argument is not specified')
        expiries = md[['series', 'expiry']].drop_duplicates().sort_values(by = 'expiry').set_index('series')
expiries = pd.merge(expiries, expiries.shift(-1), left_index = True, right_index = True, how = 'left', suffixes = ['', '_next'])
orig_cols = [col for col in md.columns if col not in ['date']]
md1 = pd.merge(md, expiries[['expiry', 'expiry_next']], on = ['expiry'], how = 'left')
md = pd.merge(md1, md, left_on = ['date', 'expiry_next'], right_on = ['date', 'expiry'], how = 'left', suffixes = ['', '_next'])
md.sort_values(by = ['expiry', 'date'], inplace = True)
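    # a roll happens only on rows where both the date window (date_func) and the user supplied condition (condition_func) hold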
roll_flag = date_func(md) & condition_func(md)
df_roll = pd.DataFrame({'series' : md.series, 'date' : md.date, 'roll_flag' : roll_flag})
df_roll = df_roll[df_roll.roll_flag].groupby('series', as_index = False).first()
md = pd.merge(md, df_roll, on = ['series', 'date'], how = 'left')
md.roll_flag = md.roll_flag.fillna(False)
cols = ['date'] + orig_cols + [col + '_next' for col in orig_cols] + ['roll_flag']
md = md[cols]
if return_full_df: return md
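    # For each series, keep rows up to just past its roll row so the continuous frame switches
    # to the next series; overlapping dates are resolved by the drop_duplicates call below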
df_list = []
    for _, g in md.groupby('expiry'):
roll_flag = g.roll_flag
true_values = roll_flag[roll_flag]
if len(true_values):
first_true_index = true_values.index[0]
roll_flag = roll_flag[first_true_index:]
false_after_true_values = roll_flag[~roll_flag]
if len(false_after_true_values):
first_false_after_true_idx = false_after_true_values.index[0]
g = g.loc[:first_false_after_true_idx]
df_list.append(g)
full_df = pd.concat(df_list)
full_df = full_df.sort_values(by = ['expiry', 'date']).drop_duplicates(subset=['date'])
return full_df
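
# A hedged sketch of typical callbacks for roll_futures (illustration only): try to roll when
# we are within 5 days of expiry, once the next series trades more volume than the current one.
# The 5 day window is an arbitrary choice; date, expiry, v and v_next are columns roll_futures
# makes available to these callbacks.
def _example_roll_funcs():
    date_func = lambda md: (md.expiry - md.date) <= np.timedelta64(5, 'D')
    condition_func = lambda md: md.v_next > md.v
    return date_func, condition_func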
def test_marketdata():
from datetime import datetime, timedelta
np.random.seed(0)
dates = np.arange(datetime(2018, 1, 1, 9, 0, 0), datetime(2018, 3, 1, 16, 0, 0), timedelta(minutes = 5))
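    # keep only bars whose hour falls between 9 and 16 to mimic regular trading hours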
dates = np.array([dt for dt in dates.astype(object) if dt.hour >= 9 and dt.hour <= 16]).astype('M8[m]')
rets = np.random.normal(size = len(dates)) / 1000
c_0 = 100
c = np.round(c_0 * np.cumprod(1 + rets), 2)
l = np.round(c * (1. - np.abs(np.random.random(size = len(dates)) / 1000.)), 2)
h = np.round(c * (1. + np.abs(np.random.random(size = len(dates)) / 1000.)), 2)
o = np.round(l + (h - l) * np.random.random(size = len(dates)), 2)
v = np.abs(np.round(np.random.normal(size = len(dates)) * 1000))
c[18] = np.nan
l[85] = 1000
md = MarketData(dates, c, o, h, l, v)
md.describe()
md.plot(date_range = ('2018-01-02', '2018-01-02 12:00'))
if __name__ == "__main__":
test_marketdata()