# $$_ Lines starting with # $$_* autogenerated by jup_mini. Do not modify these
# $$_code
# $$_ %%checkall
import h5py
import string
import os
import numpy as np
import pandas as pd
import datetime
from typing import List, Dict, Tuple, Any
from pyqstrat.pq_utils import get_temp_dir
def np_arrays_to_hdf5(data: List[Tuple[str, np.ndarray]],
                      filename: str,
                      key: str,
                      dtypes: Dict[str, str] = None,
                      optimize_vlen_str: bool = True,
                      compression_args: Dict[Any, Any] = None) -> None:
    '''
    Write a list of numpy arrays to hdf5

    Args:
        data: List of numpy one dimensional arrays along with the name of the array
        filename: filename of the hdf5 file
        key: group and or / subgroups to write to. For example, "g1/g2" will write to the subgrp g2 within the grp g1
        dtypes: dict used to override datatype for a column. For example, {"col1": "f4"} will write a 4 byte float array for col1
        optimize_vlen_str: if set, for every variable length string array, i.e dtype = 'O', we try to find the maximum string length
            and if it is < 100, we write out fixed length strings instead of variable length. This is much faster to read and process
        compression_args: if you want to compress the hdf5 file. You can use the hdf5plugin module and arguments such as hdf5plugin.Blosc()
    '''
    if not len(data): return
    # Write into a temporary group first, then atomically move it over the real key,
    # so a failure part-way through does not clobber previously written data.
    tmp_key = key + '_tmp'
    if compression_args is None:
        compression_args = {}
    with h5py.File(filename, 'a') as f:
        if tmp_key in f: del f[tmp_key]
        grp = f.create_group(tmp_key)
        for colname, array in data:
            if dtypes is not None and colname in dtypes:
                _dtype = dtypes[colname]
                dtype = np.dtype(_dtype)
                if dtype.kind == 'M':  # datetime: hdf5 has no native datetime64, store as opaque bytes
                    dtype = h5py.opaque_dtype(dtype)
                array = array.astype(dtype)
            else:
                dtype = array.dtype
                if dtype.kind == 'O':  # strings
                    if optimize_vlen_str:
                        # default=0 guards against an empty array (max() of an empty sequence raises)
                        max_len = max((len(s) for s in array), default=0)
                        if 0 < max_len < 100:
                            # BUGFIX: was np.dtype(f'S9,930') — a corrupt literal that builds a
                            # two-field compound dtype. Use fixed-length bytes sized to the
                            # longest string, which is what the docstring promises.
                            dtype = np.dtype(f'S{max_len}')
                            array = array.astype(dtype)
                        else:
                            dtype = h5py.string_dtype(encoding='utf-8')
                    else:
                        dtype = h5py.string_dtype(encoding='utf-8')
                elif dtype.kind == 'M':  # datetime
                    dtype = h5py.opaque_dtype(dtype)
                    array = array.astype(dtype)
            if colname in grp:
                del grp[colname]
            grp.create_dataset(name=colname, data=array, shape=[len(array)], dtype=dtype, **compression_args)
        grp.attrs['type'] = 'dataframe'
        grp.attrs['timestamp'] = str(datetime.datetime.now())
        grp.attrs['rows'] = len(array)  # NOTE: length of the last column; assumes all columns have equal length
        grp.attrs['columns'] = ','.join([tup[0] for tup in data])
        if key in f:
            del f[key]
        f.move(tmp_key, key)
        f.flush()
def hdf5_to_np_arrays(filename: str, key: str) -> List[Tuple[str, np.ndarray]]:
    '''
    Read a list of numpy arrays previously written out by np_arrays_to_hdf5

    Args:
        filename: path of the hdf5 file to read
        key: group and or / subgroups to read from. For example, "g1/g2" will read from the subgrp g2 within the grp g1
    Return:
        a list of numpy arrays along with their names
    '''
    ret: List[Tuple[str, np.ndarray]] = []
    with h5py.File(filename, 'r') as f:
        # BUGFIX: message previously read "not found in (unknown)" — interpolate the
        # actual filename so the error is actionable.
        assert key in f, f'{key} not found in {filename}'
        grp = f[key]
        assert 'type' in grp.attrs and grp.attrs['type'] == 'dataframe', f'{key} not a dataframe'
        # column order was stored by the writer as a comma separated attr
        columns = grp.attrs['columns'].split(',')
        for col in columns:
            array = grp[col][:]
            if array.dtype.kind == 'S':
                # fixed-length bytes written by np_arrays_to_hdf5: decode back to numpy unicode
                dtype = f'U{array.dtype.itemsize}'
                array = array.astype(dtype)
            ret.append((col, array))
    return ret
def df_to_hdf5(df: pd.DataFrame, filename: str, key: str, dtypes: Dict[str, str] = None, optimize_vlen_str=True) -> None:
    '''
    Write out a pandas dataframe to hdf5 using the np_arrays_to_hdf5 function
    '''
    # convert each column to a (name, ndarray) pair and delegate the actual writing
    arrays = [(column, df[column].values) for column in df.columns]
    np_arrays_to_hdf5(arrays, filename, key, dtypes, optimize_vlen_str)
def hdf5_to_df(filename: str, key: str) -> pd.DataFrame:
    '''
    Read a pandas dataframe previously written out using df_to_hdf5 or np_arrays_to_hdf5
    '''
    # hdf5_to_np_arrays returns (name, array) pairs, which dict() maps straight
    # into DataFrame columns in stored order
    return pd.DataFrame(dict(hdf5_to_np_arrays(filename, key)))
def test_hdf5_to_df():
    '''
    Round-trip a dataframe with int, float, object-string and datetime columns
    through hdf5 and check it comes back equal.
    '''
    size = int(100)
    a = np.random.randint(0, 10000, size)
    b = a * 1.1
    letters = np.random.choice(list(string.ascii_letters), (size, 5))
    c = np.empty(size, dtype='O')  # variable-length strings must live in an object array
    for i, row in enumerate(letters):
        c[i] = ''.join(row)
    d = (a * 1000).astype('M8[m]')
    temp_dir = get_temp_dir()
    path = f'{temp_dir}/test.hdf5'
    np_arrays_to_hdf5([("b", b), ("a", a), ("c", c), ("d", d)], path, 'key1/key2')
    file_size = os.path.getsize(path)
    print(f"file size: {file_size / 1e3:.0f} KB")

    def read():
        '''
        for performance testing using timeit
        '''
        # BUGFIX: previously opened 'test.hdf5' relative to the current working
        # directory instead of the file just written under temp_dir.
        with h5py.File(path, 'r') as f:
            _ = f['key1/key2/a'][:]
            _ = f['key1/key2/b'][:]
            _ = f['key1/key2/c'][:]
            _ = f['key1/key2/d'][:]

    df_in = pd.DataFrame(dict(a=a, b=b, c=c, d=d))
    df_to_hdf5(df_in, path, 'key1/key2', dtypes={'d': 'M8[m]'})
    df_out = hdf5_to_df(path, 'key1/key2')
    from pandas.testing import assert_frame_equal
    assert_frame_equal(df_in, df_out)


if __name__ == '__main__':
    test_hdf5_to_df()
# $$_end_code