Source code for pyqstrat.pq_io

# $$_ Lines starting with # $$_* autogenerated by jup_mini. Do not modify these
# $$_code
# $$_ %%checkall
import h5py
import string
import os
import numpy as np
import pandas as pd
import datetime
from typing import List, Dict, Tuple, Any
from pyqstrat.pq_utils import get_temp_dir


def np_arrays_to_hdf5(data: List[Tuple[str, np.ndarray]],
                      filename: str,
                      key: str,
                      dtypes: Dict[str, str] = None,
                      optimize_vlen_str: bool = True,
                      compression_args: Dict[Any, Any] = None) -> None:
    '''
    Write a list of numpy arrays to hdf5

    The arrays are first written to a temporary group named key + "_tmp" and that group is
    then moved over key, so an existing group at key is only replaced once all columns were written.

    Args:
        data: List of numpy one dimensional arrays along with the name of the array
        filename: filename of the hdf5 file
        key: group and or / subgroups to write to.  For example, "g1/g2" will write to the subgrp g2 within the grp g1
        dtypes: dict used to override datatype for a column.  For example, {"col1": "f4"} will write a 4 byte float array for col1
        optimize_vlen_str: if set, for every variable length string array, i.e dtype = 'O', we try to find the maximum string length
            and if it is < 100, we write out fixed length strings instead of variable length.  This is much faster to read and process.
            Columns that are empty or contain non-ASCII text are always written as variable length utf-8 strings.
        compression_args: if you want to compress the hdf5 file. You can use the hdf5plugin module and arguments such as hdf5plugin.Blosc()
    '''
    if not len(data):
        return
    tmp_key = key + '_tmp'
    if compression_args is None:
        compression_args = {}
    with h5py.File(filename, 'a') as f:
        # Remove leftovers from a previous write that did not complete
        if tmp_key in f:
            del f[tmp_key]
        grp = f.create_group(tmp_key)
        for colname, array in data:
            if dtypes is not None and colname in dtypes:
                dtype = np.dtype(dtypes[colname])
                if dtype.kind == 'M':  # datetime
                    dtype = h5py.opaque_dtype(dtype)
                array = array.astype(dtype)
            else:
                dtype = array.dtype
                if dtype.kind == 'O':  # strings
                    # Default to variable length utf-8; switched to fixed length below when that is possible
                    dtype = h5py.string_dtype(encoding='utf-8')
                    # max() raises ValueError on an empty sequence, so empty columns skip the optimization
                    if optimize_vlen_str and len(array):
                        max_len = len(max(array, key=len))
                        if max_len < 100:
                            try:
                                # Use a width of at least 1 so a column of empty strings
                                # does not produce a zero width bytes dtype
                                fixed = array.astype(f'S{max(max_len, 1)}')
                            except UnicodeEncodeError:
                                # Non-ASCII text cannot be converted to fixed length bytes by numpy,
                                # keep the variable length utf-8 dtype chosen above
                                pass
                            else:
                                array = fixed
                                dtype = fixed.dtype
                elif dtype.kind == 'M':  # datetime
                    dtype = h5py.opaque_dtype(dtype)
                    array = array.astype(dtype)
            if colname in grp:
                del grp[colname]
            grp.create_dataset(name=colname, data=array, shape=[len(array)], dtype=dtype, **compression_args)
        # Metadata used by hdf5_to_np_arrays to recognize the group and restore column order
        grp.attrs['type'] = 'dataframe'
        grp.attrs['timestamp'] = str(datetime.datetime.now())
        # array is the last column written; data is non-empty here so it is always bound
        grp.attrs['rows'] = len(array)
        grp.attrs['columns'] = ','.join([tup[0] for tup in data])
        # Replace any existing group with the freshly written one
        if key in f:
            del f[key]
        f.move(tmp_key, key)
        f.flush()
def hdf5_to_np_arrays(filename: str, key: str) -> List[Tuple[str, np.ndarray]]:
    '''
    Read a list of numpy arrays previously written out by np_arrays_to_hdf5

    Fixed length byte strings are converted to numpy unicode arrays.  Variable length strings
    that come back as bytes objects are decoded as utf-8 so that the result contains str values.

    Args:
        filename: path of the hdf5 file to read
        key: group and or / subgroups to read from.  For example, "g1/g2" will read from the subgrp g2 within the grp g1

    Return:
        a list of numpy arrays along with their names
    '''
    ret: List[Tuple[str, np.ndarray]] = []
    with h5py.File(filename, 'r') as f:
        assert key in f, f'{key} not found in {filename}'
        grp = f[key]
        assert 'type' in grp.attrs and grp.attrs['type'] == 'dataframe', f'{key} not a dataframe'
        # Column names are stored as a comma separated attribute, in the order they were written
        columns = grp.attrs['columns'].split(',')
        for col in columns:
            array = grp[col][:]
            if array.dtype.kind == 'S':  # decode bytes to numpy unicode
                array = array.astype(f'U{array.dtype.itemsize}')
            elif array.dtype.kind == 'O':
                # Variable length strings can be returned as bytes objects (h5py 3 and later);
                # decode those and leave values that are already str untouched
                decoded = np.empty(len(array), dtype='O')
                for i, value in enumerate(array):
                    decoded[i] = value.decode('utf-8') if isinstance(value, bytes) else value
                array = decoded
            ret.append((col, array))
    return ret
def df_to_hdf5(df: pd.DataFrame,
               filename: str,
               key: str,
               dtypes: Dict[str, str] = None,
               optimize_vlen_str=True,
               compression_args: Dict[Any, Any] = None) -> None:
    '''
    Write out a pandas dataframe to hdf5 using the np_arrays_to_hdf5 function

    Each column is written as a separate array named after the column.  The dataframe index is not written.

    Args:
        df: the dataframe to write
        filename: filename of the hdf5 file
        key: group and or / subgroups to write to.  For example, "g1/g2"
        dtypes: dict used to override datatype for a column, passed through to np_arrays_to_hdf5
        optimize_vlen_str: passed through to np_arrays_to_hdf5
        compression_args: passed through to np_arrays_to_hdf5.  None means no compression arguments
    '''
    arrays = [(column, df[column].values) for column in df.columns]
    np_arrays_to_hdf5(arrays, filename, key, dtypes, optimize_vlen_str, compression_args)
def hdf5_to_df(filename: str, key: str) -> pd.DataFrame:
    '''
    Load the group at key in an hdf5 file into a pandas dataframe

    The group must have been written by df_to_hdf5 or np_arrays_to_hdf5.  Each stored
    array becomes one column, in the order returned by hdf5_to_np_arrays.

    Args:
        filename: path of the hdf5 file to read
        key: group and or / subgroups to read from.  For example, "g1/g2"

    Return:
        a dataframe with one column per stored array
    '''
    named_arrays = hdf5_to_np_arrays(filename, key)
    # A list of (name, array) pairs maps directly to a column dict
    columns_by_name = dict(named_arrays)
    return pd.DataFrame(columns_by_name)
def test_hdf5_to_df():
    '''
    Round trip test: write random int, float, string and datetime columns to hdf5 and
    check that the dataframe read back equals the one written
    '''
    size = 100
    a = np.random.randint(0, 10000, size)
    b = a * 1.1
    # Build an object array of random 5 letter strings
    letters = np.random.choice(list(string.ascii_letters), (size, 5))
    c = np.empty(size, dtype='O')
    for i, row in enumerate(letters):
        c[i] = ''.join(row)
    d = (a * 1000).astype('M8[m]')

    temp_dir = get_temp_dir()
    test_file = f'{temp_dir}/test.hdf5'
    np_arrays_to_hdf5([("b", b), ("a", a), ("c", c), ("d", d)], test_file, 'key1/key2')
    file_size = os.path.getsize(test_file)
    print(f"file size: {file_size / 1e3:.0f} KB")

    def read():
        ''' for performance testing using timeit '''
        # Reads the file written above rather than a file in the current working directory
        with h5py.File(test_file, 'r') as f:
            _ = f['key1/key2/a'][:]
            _ = f['key1/key2/b'][:]
            _ = f['key1/key2/c'][:]
            _ = f['key1/key2/d'][:]

    df_in = pd.DataFrame(dict(a=a, b=b, c=c, d=d))
    df_to_hdf5(df_in, test_file, 'key1/key2', dtypes={'d': 'M8[m]'})
    df_out = hdf5_to_df(test_file, 'key1/key2')
    from pandas.testing import assert_frame_equal
    assert_frame_equal(df_in, df_out)
# Run the hdf5 round trip test when this module is executed as a script
if __name__ == '__main__':
    test_hdf5_to_df()
# $$_end_code