Source code for utool.util_io

# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
import six
from six.moves import cPickle as pickle
from utool import util_path
from utool import util_inject
from os.path import splitext, basename, exists
try:
    import lockfile
    HAVE_LOCKFILE = True
except ImportError:
    HAVE_LOCKFILE = False
try:
    import numpy as np
    HAS_NUMPY = True
except ImportError:
    HAS_NUMPY = False
try:
    import h5py
    HAS_H5PY = True
except ImportError:
    HAS_H5PY = False

print, rrr, profile = util_inject.inject2(__name__, '[io]')


__PRINT_IO__ = True
__PRINT_WRITES__ = __PRINT_IO__
__PRINT_READS__  =  __PRINT_IO__
__FORCE_PRINT_READS__ = False
__FORCE_PRINT_WRITES__ = False
__READ_TAIL_N__ = 3
#__FORCE_PRINT_READS__ = True
#__FORCE_PRINT_WRITES__ = True


def load_data(fpath, **kwargs):
    """ More generic interface to load data """
    ext = splitext(fpath)[1]
    if ext in ['.pickle', '.cPkl', '.pkl']:
        return load_cPkl(fpath, **kwargs)
    elif ext in ['.hdf5']:
        return load_hdf5(fpath, **kwargs)
    elif ext in ['.txt']:
        return load_text(fpath, **kwargs)
    elif HAS_NUMPY and ext in ['.npz', '.npy']:
        return load_numpy(fpath, **kwargs)
    else:
        assert False, 'unknown ext=%r for fpath=%r' % (ext, fpath)


def save_data(fpath, data, **kwargs):
    """ More generic interface to write data """
    ext = splitext(fpath)[1]
    if ext in ['.pickle', '.cPkl', '.pkl']:
        return save_cPkl(fpath, data, **kwargs)
    elif ext in ['.hdf5']:
        return save_hdf5(fpath, data, **kwargs)
    elif ext in ['.txt']:
        # pass data through to save_text (the original dropped the data arg)
        return save_text(fpath, data, **kwargs)
    elif HAS_NUMPY and ext in ['.npz', '.npy']:
        # routes through save_numpy, resolving the original TODO
        return save_numpy(fpath, data, **kwargs)
    else:
        assert False, 'unknown ext=%r for fpath=%r' % (ext, fpath)


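# Illustrative sketch, not part of the module: the generic interface picks a
# backend from the file extension, so callers can stay format-agnostic. This
# assumes the usual utool namespace injection and a writable directory.
#
#   >>> import utool as ut
#   >>> data = {'key': [1, 2, 3]}
#   >>> ut.save_data('cache.pkl', data)          # '.pkl' dispatches to save_cPkl
#   >>> assert ut.load_data('cache.pkl') == data

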
def write_to(fpath, to_write, aslines=False, verbose=None,
             onlyifdiff=False, mode='w', n=2):
    """ Writes text to a file. Automatically encodes text as utf8.

    Args:
        fpath (str): file path
        to_write (str): text to write (must be unicode text)
        aslines (bool): if True, to_write is assumed to be a list of lines
        verbose (bool): verbosity flag
        onlyifdiff (bool): only writes if needed!
            checks hash of to_write vs the hash of the contents of fpath
        mode (unicode): (default = u'w')
        n (int): (default = 2)

    CommandLine:
        python -m utool.util_io --exec-write_to --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from utool.util_io import *  # NOQA
        >>> import utool as ut
        >>> fpath = ut.unixjoin(ut.get_app_resource_dir('utool'), 'testwrite.txt')
        >>> ut.delete(fpath)
        >>> to_write = 'utf-8 symbols Δ, Й, ק, م, ๗, あ, 叶, 葉, and 말.'
        >>> aslines = False
        >>> verbose = True
        >>> onlyifdiff = False
        >>> mode = u'w'
        >>> n = 2
        >>> write_to(fpath, to_write, aslines, verbose, onlyifdiff, mode, n)
        >>> read_ = ut.read_from(fpath)
        >>> print('read_ = ' + read_)
        >>> print('to_write = ' + to_write)
        >>> assert read_ == to_write
    """
    if onlyifdiff:
        import utool as ut
        if ut.hashstr(read_from(fpath)) == ut.hashstr(to_write):
            print('[util_io] * no difference')
            return
    if verbose or (verbose is None and __PRINT_WRITES__) or __FORCE_PRINT_WRITES__:
        print('[util_io] * Writing to text file: %r ' % util_path.tail(fpath, n=n))
    # NOTE: the backup branch is intentionally disabled (False and ...)
    backup = False and exists(fpath)
    if backup:
        util_path.copy(fpath, fpath + '.backup')
    with open(fpath, mode) as file_:
        if aslines:
            file_.writelines(to_write)
        else:
            if six.PY2 and isinstance(to_write, unicode):  # NOQA
                to_write = to_write.encode('utf8')
            try:
                file_.write(to_write)
            except UnicodeEncodeError as ex:
                # show the characters surrounding the offending position
                start = max(ex.args[2] - 10, 0)
                end = ex.args[3] + 10
                context = to_write[start:end]
                print(repr(context))
                print(context)
                from utool import util_dbg
                util_dbg.printex(ex, keys=[(type, 'to_write')])
                # close before restoring the backup, then re-raise
                file_.close()
                if backup:
                    util_path.copy(fpath + '.backup', fpath)
                raise


def read_from(fpath, verbose=None, aslines=False, strict=True, n=None,
              errors='replace'):
    """ Reads text from a file. Automatically returns utf8.

    Args:
        fpath (str): file path
        verbose (bool): verbosity flag
        aslines (bool): if True, returns a list of lines

    Returns:
        str: text from fpath (this is unicode)
    """
    if n is None:
        n = __READ_TAIL_N__
    if verbose or (verbose is None and __PRINT_READS__) or __FORCE_PRINT_READS__:
        print('[util_io] * Reading text file: %r ' % util_path.tail(fpath, n=n))
    try:
        if not util_path.checkpath(fpath, verbose=verbose, n=n):
            raise IOError('[io] * FILE DOES NOT EXIST!')
        with open(fpath, 'rb') as file_:
            # decode the raw bytes so both Python 2 and 3 return unicode
            # text, as the docstring promises (the original returned bytes
            # on Python 3)
            if aslines:
                text = [line.decode('utf8', errors=errors)
                        for line in file_.readlines()]
            else:
                text = file_.read().decode('utf8', errors=errors)
        return text
    except IOError as ex:
        from utool import util_dbg
        if verbose or strict:
            util_dbg.printex(ex, ' * Error reading fpath=%r' %
                             util_path.tail(fpath, n=n), '[io]')
        if strict:
            raise


def read_lines_from(fpath, num_lines=None, verbose=None, n=3):
    with open(fpath, 'r') as file_:
        line_list = []
        for lineno, line in enumerate(file_):
            # stop once num_lines lines have been collected
            # (the original check overshot by two lines)
            if num_lines is not None and lineno >= num_lines:
                break
            line_list.append(line)
    return line_list


# aliases
readfrom = read_from
writeto = write_to
save_text = write_to
load_text = read_from


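# Illustrative round trip through the text helpers, not part of the module.
# write_to encodes unicode as utf8 and read_from decodes it back; this assumes
# the aliases are exposed on the utool namespace as usual.
#
#   >>> import utool as ut
#   >>> ut.writeto('notes.txt', 'Δ and 말')            # alias for write_to
#   >>> assert ut.readfrom('notes.txt') == 'Δ and 말'  # alias for read_from

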
def save_cPkl(fpath, data, verbose=None, n=2):
    """ Saves data to a pickled file with optional verbosity """
    if verbose or (verbose is None and __PRINT_WRITES__) or __FORCE_PRINT_WRITES__:
        print('[util_io] * save_cPkl(%r, data)' % (util_path.tail(fpath, n=n),))
    with open(fpath, 'wb') as file_:
        pickle.dump(data, file_, pickle.HIGHEST_PROTOCOL)


def load_cPkl(fpath, verbose=None, n=2):
    """ Loads a pickled file with optional verbosity """
    if verbose or (verbose is None and __PRINT_READS__) or __FORCE_PRINT_READS__:
        print('[util_io] * load_cPkl(%r)' % (util_path.tail(fpath, n=n),))
    with open(fpath, 'rb') as file_:
        data = pickle.load(file_)
    return data


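# Illustrative sketch, not part of the module: save_cPkl / load_cPkl round
# trip arbitrary picklable objects using the highest pickle protocol.
#
#   >>> import utool as ut
#   >>> obj = {'arr': list(range(10)), 'name': 'demo'}
#   >>> ut.save_cPkl('obj.cPkl', obj)
#   >>> assert ut.load_cPkl('obj.cPkl') == obj

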
def lock_and_load_cPkl(fpath, verbose=False):
    # requires the optional lockfile dependency (see HAVE_LOCKFILE above)
    with lockfile.LockFile(fpath + '.lock'):
        return load_cPkl(fpath, verbose)


def lock_and_save_cPkl(fpath, data, verbose=False):
    # requires the optional lockfile dependency (see HAVE_LOCKFILE above)
    with lockfile.LockFile(fpath + '.lock'):
        return save_cPkl(fpath, data, verbose)


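# Illustrative sketch, not part of the module: the lock_and_* variants guard
# the pickle with a sibling '.lock' file so concurrent processes never read a
# half-written file. Assumes the optional lockfile package is installed.
#
#   >>> import utool as ut
#   >>> ut.lock_and_save_cPkl('shared.cPkl', {'count': 1})
#   >>> state = ut.lock_and_load_cPkl('shared.cPkl')

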
def save_hdf5(fpath, data, verbose=None, compression='lzf'):
    r"""
    Restricted save of data using hdf5. Can only save ndarrays and dicts of
    ndarrays.

    Args:
        fpath (str):
        data (ndarray):
        compression (str):
            DEFLATE/GZIP - standard
            LZF - fast
            SHUFFLE - compression ratio
            FLETCHER32 - error detection
            Scale-offset - integer / float scaling and truncation
            SZIP - fast and patented

    CommandLine:
        python -m utool.util_io --test-save_hdf5

    References:
        http://docs.h5py.org/en/latest/quick.html
        http://docs.h5py.org/en/latest/mpi.html

    Example:
        >>> # ENABLE_IF HAS_H5PY
        >>> from utool.util_io import *  # NOQA
        >>> import numpy as np
        >>> import utool as ut
        >>> # build test data
        >>> rng = np.random.RandomState(0)
        >>> data = (rng.rand(100000, 128) * 255).astype(np.uint8).copy()
        >>> verbose = True
        >>> fpath = ut.unixjoin(ut.ensure_app_resource_dir('utool'), 'myfile.hdf5')
        >>> compression = 'lzf'
        >>> # execute function
        >>> ut.delete(fpath)
        >>> save_hdf5(fpath, data, verbose, compression)
        >>> data2 = load_hdf5(fpath, verbose)
        >>> assert data is not data2
        >>> assert np.all(data == data2)
        >>> assert ut.delete(fpath)

    Timeit:
        >>> # cPkl / numpy seems to be faster with this initial implementation
        >>> import utool as ut
        >>> data = (rng.rand(1000000, 128) * 255).astype(np.uint8).copy()
        >>> print(ut.get_object_size_str(data))
        >>> del data
        >>> setup = ut.codeblock(
        >>>     '''
                import numpy as np
                import utool as ut
                rng = np.random.RandomState(0)
                fpath = ut.unixjoin(ut.ensure_app_resource_dir('utool'), 'io_test_data')
                data = (rng.rand(1000000, 128) * 255).astype(np.uint8).copy()
                #print(ut.get_object_size_str(data))
                ''')
        >>> # Test save time
        >>> stmt_list1 = ut.codeblock(
        >>>     '''
                ut.save_hdf5(fpath + '.hdf5', data, verbose=False, compression='gzip')
                ut.save_hdf5(fpath + '.hdf5', data, verbose=False, compression='lzf')
                ut.save_cPkl(fpath + '.cPkl', data, verbose=False)
                ut.save_numpy(fpath + '.npy', data, verbose=False)
                ut.save_pytables(fpath + '.tables', data, verbose=False)
                ''').split('\n')
        >>> ut.util_dev.timeit_compare(stmt_list1, setup, int(10))
        >>> # Test load time
        >>> stmt_list2 = ut.codeblock(
        >>>     '''
                ut.load_hdf5(fpath + '.hdf5', verbose=False)
                ut.load_cPkl(fpath + '.cPkl', verbose=False)
                ut.load_numpy(fpath + '.npy', verbose=False)
                ut.load_pytables(fpath + '.tables', verbose=False)
                ''').split('\n')
        >>> ut.util_dev.timeit_compare(stmt_list2, setup, int(10))
        >>> print('finished timeing')

        +----------------
        | TIMEIT COMPARE
        +----------------
        | iterations = 10
        | Input:
        |     | num | stmt
        |     |   0 | u"ut.save_hdf5(fpath + '.hdf5', data, verbose=False, compression='gzip')"
        |     |   1 | u"ut.save_hdf5(fpath + '.hdf5', data, verbose=False, compression='lzf')"
        |     |   2 | u"ut.save_cPkl(fpath + '.cPkl', data, verbose=False)"
        |     |   3 | u"ut.save_numpy(fpath + '.npy', data, verbose=False)"
        |     |   4 | u"ut.save_pytables(fpath + '.tables', data, verbose=False)"
        ...
        | Output:
        |    * PASSED: each statement produced the same result
        |     | num | total time | per loop | stmt
        |     |   0 |    0.03 ks |   3.15 s | ut.save_hdf5(fpath + '.hdf5', data, verbose=False, compression='gzip')
        |     |   1 |    0.01 ks |   1.25 s | ut.save_hdf5(fpath + '.hdf5', data, verbose=False, compression='lzf')
        |     |   2 |     5.30 s |   0.53 s | ut.save_cPkl(fpath + '.cPkl', data, verbose=False)
        |     |   3 |     4.97 s |   0.50 s | ut.save_numpy(fpath + '.npy', data, verbose=False)
        |     |   4 |     9.23 s |   0.92 s | ut.save_pytables(fpath + '.tables', data, verbose=False)
        L_________________

        +----------------
        | TIMEIT COMPARE
        +----------------
        | iterations = 10
        | Input:
        |     | num | stmt
        |     |   0 | u"ut.load_hdf5(fpath + '.hdf5', verbose=False)"
        |     |   1 | u"ut.load_cPkl(fpath + '.cPkl', verbose=False)"
        |     |   2 | u"ut.load_numpy(fpath + '.npy', verbose=False)"
        |     |   3 | u"ut.load_pytables(fpath + '.tables', verbose=False)"
        ...
        | Output:
        |    * PASSED: each statement produced the same result
        |     | num | total time | per loop | stmt
        |     |   0 |     2.39 s |   0.24 s | ut.load_hdf5(fpath + '.hdf5', verbose=False)
        |     |   1 |     0.39 s |   0.04 s | ut.load_cPkl(fpath + '.cPkl', verbose=False)
        |     |   2 |     0.19 s |   0.02 s | ut.load_numpy(fpath + '.npy', verbose=False)
        |     |   3 |     0.33 s |   0.03 s | ut.load_pytables(fpath + '.tables', verbose=False)
        L_________________

    Ignore:
        %timeit save_hdf5(fpath, data, verbose=False, compression='gzip')
        %timeit save_hdf5(fpath, data, verbose=False, compression='lzf')
        %timeit save_cPkl(fpath + '.cPkl', data, verbose=False)
        %timeit save_pytables(fpath + '.tables', data, verbose=False)
        1 loops, best of 3: 258 ms per loop
        10 loops, best of 3: 111 ms per loop
        10 loops, best of 3: 53.1 ms per loop
        10 loops, best of 3: 96.5 ms per loop

        save_hdf5(fpath, data, verbose=False, compression='gzip')
        %timeit load_hdf5(fpath, verbose=False)
        save_hdf5(fpath, data, verbose=False, compression='lzf')
        %timeit load_hdf5(fpath, verbose=False)
        %timeit load_cPkl(fpath + '.cPkl', verbose=False)
        %timeit load_pytables(fpath + '.tables', verbose=False)
        100 loops, best of 3: 19.4 ms per loop
        100 loops, best of 3: 14.4 ms per loop
        100 loops, best of 3: 3.92 ms per loop
        100 loops, best of 3: 6.22 ms per loop

    Notes:
        pip install mpi4py
    """
    if verbose or (verbose is None and __PRINT_WRITES__) or __FORCE_PRINT_WRITES__:
        print('[util_io] * save_hdf5(%r, data)' % (util_path.tail(fpath),))
        if verbose is not None and verbose > 1:
            if isinstance(data, dict):
                print('[util_io] ... shapes=%r' %
                      ([val.shape for val in data.values()],))
            else:
                print('[util_io] ... shape=%r' % (data.shape,))
    chunks = True  # True enables auto-chunking
    fname = basename(fpath)
    # check for parallel hdf5
    #have_mpi = h5py.h5.get_config().mpi
    #if have_mpi:
    #    import mpi4py
    #    h5kw = dict(driver='mpio', comm=mpi4py.MPI.COMM_WORLD)
    #    # cant use compression with mpi
    #    # ValueError: Unable to create dataset
    #    # (Parallel i/o does not support filters yet)
    #else:
    h5kw = {}
    if isinstance(data, dict):
        assert all([isinstance(vals, np.ndarray)
                    for vals in six.itervalues(data)]), (
            'can only save dicts as ndarrays')
        with h5py.File(fpath, mode='w', **h5kw) as file_:
            grp = file_.create_group(fname)
            for key, val in six.iteritems(data):
                dset = grp.create_dataset(
                    key, val.shape, val.dtype, chunks=chunks,
                    compression=compression)
                dset[...] = val
    else:
        assert isinstance(data, np.ndarray)
        shape = data.shape
        dtype = data.dtype
        with h5py.File(fpath, mode='w', **h5kw) as file_:
            dset = file_.create_dataset(
                fname, shape, dtype, chunks=chunks, compression=compression)
            dset[...] = data


def load_hdf5(fpath, verbose=None):
    fname = basename(fpath)
    if verbose or (verbose is None and __PRINT_READS__) or __FORCE_PRINT_READS__:
        print('[util_io] * load_hdf5(%r)' % (util_path.tail(fpath),))
    with h5py.File(fpath, 'r') as file_:
        value = file_[fname]
        if isinstance(value, h5py.Group):
            # a dict of ndarrays was saved; read each dataset back
            grp = value
            data = {}
            for key, dset in six.iteritems(grp):
                shape = dset.shape
                dtype = dset.dtype
                subdata = np.empty(shape, dtype=dtype)
                dset.read_direct(subdata)
                data[key] = subdata
        elif isinstance(value, h5py.Dataset):
            # a single ndarray was saved
            dset = value
            shape = dset.shape
            dtype = dset.dtype
            data = np.empty(shape, dtype=dtype)
            dset.read_direct(data)
        else:
            assert False, 'unknown h5py value type=%r' % (type(value),)
    return data


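# Illustrative sketch, not part of the module: save_hdf5 / load_hdf5 round
# trip either a single ndarray or a dict of ndarrays (stored one dataset per
# key under a group named after the file). Assumes h5py is installed.
#
#   >>> import numpy as np
#   >>> import utool as ut
#   >>> data = {'feat': np.arange(12).reshape(3, 4), 'ids': np.arange(3)}
#   >>> ut.save_hdf5('feats.hdf5', data)
#   >>> data2 = ut.load_hdf5('feats.hdf5')
#   >>> assert np.all(data['feat'] == data2['feat'])

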
def save_pytables(fpath, data, verbose=False):
    """
    sudo pip install numexpr
    sudo pip install tables

    References:
        https://pytables.github.io/cookbook/py2exe_howto.html
        https://gist.github.com/andrewgiessel/7515520
        http://stackoverflow.com/questions/8843062/python-how-to-store-a-numpy-multidimensional-array-in-pytables
        http://pytables.github.io/usersguide/tutorials.html#creating-new-array-objects

    Example:
        >>> # DISABLE_DOCTEST
        >>> from utool.util_io import *  # NOQA
        >>> import numpy as np
        >>> import utool as ut
        >>> # build test data
        >>> verbose = True
        >>> fpath = 'myfile.pytables.hdf5'
        >>> np.random.seed(0)
        >>> compression = 'gzip'
        >>> data = (np.random.rand(100000, 128) * 255).astype(np.uint8).copy()
        >>> # execute function
        >>> ut.delete(fpath)
        >>> save_pytables(fpath, data, verbose)
        >>> data2 = load_pytables(fpath, verbose)
        >>> assert data is not data2
        >>> assert np.all(data == data2)
        >>> assert ut.delete(fpath)
    """
    import tables
    if verbose or (verbose is None and __PRINT_WRITES__) or __FORCE_PRINT_WRITES__:
        print('[util_io] * save_pytables(%r, data)' % (util_path.tail(fpath),))
    with tables.open_file(fpath, 'w') as file_:
        atom = tables.Atom.from_dtype(data.dtype)
        filters = tables.Filters(complib='blosc', complevel=5)
        # create_carray replaces the deprecated createCArray camelCase name
        dset = file_.create_carray(file_.root, 'data', atom, data.shape,
                                   filters=filters)
        dset[:] = data


def load_pytables(fpath, verbose=False):
    import tables
    if verbose or (verbose is None and __PRINT_READS__) or __FORCE_PRINT_READS__:
        print('[util_io] * load_pytables(%r)' % (util_path.tail(fpath),))
    with tables.open_file(fpath, 'r') as file_:
        data = file_.root.data.read()
    return data


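# Illustrative sketch, not part of the module: save_pytables stores a single
# ndarray as a blosc-compressed CArray named 'data' and load_pytables reads it
# back whole. Assumes the tables (PyTables) package is installed.
#
#   >>> import numpy as np
#   >>> import utool as ut
#   >>> arr = np.eye(4, dtype=np.float32)
#   >>> ut.save_pytables('arr.tables', arr)
#   >>> assert np.all(ut.load_pytables('arr.tables') == arr)

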
def load_numpy(fpath, mmap_mode=None, verbose=None):
    if verbose or (verbose is None and __PRINT_READS__) or __FORCE_PRINT_READS__:
        print('[util_io] * load_numpy(%r)' % util_path.tail(fpath))
    return np.load(fpath, mmap_mode=mmap_mode)


def save_numpy(fpath, data, verbose=None):
    if verbose or (verbose is None and __PRINT_WRITES__) or __FORCE_PRINT_WRITES__:
        print('[util_io] * save_numpy(%r, data)' % util_path.tail(fpath))
    return np.save(fpath, data)


#def save_capnp(fpath, data, verbose=False):
#    r"""
#    References:
#        http://jparyani.github.io/pycapnp/quickstart.html#dictionaries
#    """
#    import capnp
#    if verbose or __PRINT_WRITES__:
#        print('[util_io] * save_capnp(%r, data)' % (util_path.tail(fpath),))


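# Illustrative sketch, not part of the module: the numpy helpers are thin
# wrappers over np.save / np.load; passing mmap_mode='r' memory-maps a large
# array instead of reading it eagerly.
#
#   >>> import numpy as np
#   >>> import utool as ut
#   >>> arr = np.zeros((1000, 128), dtype=np.uint8)
#   >>> ut.save_numpy('arr.npy', arr)
#   >>> view = ut.load_numpy('arr.npy', mmap_mode='r')
#   >>> assert view.shape == (1000, 128)

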
def try_decode(x):
    # All python encoding formats
    codec_list = [
        'ascii', 'big5', 'big5hkscs', 'cp037', 'cp424', 'cp437', 'cp500',
        'cp720', 'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856',
        'cp857', 'cp858', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864',
        'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 'cp932', 'cp949',
        'cp950', 'cp1006', 'cp1026', 'cp1140', 'cp1250', 'cp1251', 'cp1252',
        'cp1253', 'cp1254', 'cp1255', 'cp1256', 'cp1257', 'cp1258', 'euc_jp',
        'euc_jis_2004', 'euc_jisx0213', 'euc_kr', 'gb2312', 'gbk', 'gb18030',
        'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
        'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'latin_1', 'iso8859_2',
        'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 'iso8859_7',
        'iso8859_8', 'iso8859_9', 'iso8859_10', 'iso8859_13', 'iso8859_14',
        'iso8859_15', 'iso8859_16', 'johab', 'koi8_r', 'koi8_u',
        'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman',
        'mac_turkish', 'ptcp154', 'shift_jis', 'shift_jis_2004',
        'shift_jisx0213', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16',
        'utf_16_be', 'utf_16_le', 'utf_7', 'utf_8', 'utf_8_sig',
    ]
    for codec in codec_list:
        try:
            # repr() keeps this from concatenating str with bytes on Python 3
            print(('%20s: ' % (codec,)) + repr(x.encode(codec)))
        except Exception:
            print(('%20s: ' % (codec,)) + 'FAILED')
    for codec in codec_list:
        try:
            print(('%20s: ' % (codec,)) + x.decode(codec))
        except Exception:
            print(('%20s: ' % (codec,)) + 'FAILED')


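# Illustrative sketch, not part of the module: try_decode is a debugging aid
# that prints how a string fares under every standard codec, one line per
# codec, with 'FAILED' where the conversion breaks.
#
#   >>> from utool.util_io import try_decode
#   >>> try_decode('Δ')

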
if __name__ == '__main__':
    """
    CommandLine:
        python -m utool.util_io
        python -m utool.util_io --allexamples
        python -m utool.util_io --allexamples --noface --nosrc
    """
    import multiprocessing
    multiprocessing.freeze_support()  # for win32
    import utool as ut  # NOQA
    ut.doctest_funcs()