# Source code for utool.util_hash

# -*- coding: utf-8 -*-
"""
Hashing convenience functions

You should opt to use a hash*27 function over a hash* function.


TODO: the same hashing algorithm should be used everywhere
Currently there is a mix of sha1, sha256, and sha512 in different places.

"""
from __future__ import absolute_import, division, print_function, unicode_literals
import hashlib
import copy
import six
import uuid
import random
from utool import util_inject
(print, rrr, profile) = util_inject.inject2(__name__, '[hash]')

# Number of characters kept in a hash code unless the caller says otherwise
HASH_LEN = 16

# Hexadecimal digits
ALPHABET_16 = list('0123456789abcdef')

# Large base-54 alphabet (every symbol is valid in a filename, but the
# result is not pretty)
ALPHABET_54 = list('0123456789abcdefghijklmnopqrstuvwxyz;=@[]^_`{}~!#$%&+,')


# Large base-41 alphabet (the prettier subset of the base-54 symbols)
ALPHABET_41 = list('0123456789abcdefghijklmnopqrstuvwxyz@!%&+')

# Lowercase letters only. NOTE: despite the name this list has 26 symbols.
ALPHABET_27 = list('abcdefghijklmnopqrstuvwxyz')


# Alphabet (and its base) used when a caller does not pick one
ALPHABET = ALPHABET_41
BIGBASE = len(ALPHABET)


# Type of a class ``__dict__`` (dictproxy on Python 2, mappingproxy on 3)
DictProxyType = type(object.__dict__)


def make_hash(o):
    r"""
    Makes a hash from a dictionary, list, tuple or set to any level, that
    contains only other hashable types (including any lists, tuples, sets, and
    dictionaries). In the case where other kinds of objects (like classes) need
    to be hashed, pass in a collection of object attributes that are pertinent.
    For example, a class can be hashed in this fashion:

        make_hash([cls.__dict__, cls.__name__])

    A function can be hashed like so:

        make_hash([fn.__dict__, fn.__code__])

    Args:
        o (object): a hashable value, or an arbitrarily nested combination of
            dict / list / tuple / set whose leaves are hashable.

    Returns:
        int or tuple: ``hash(o)`` for leaves, an int for dictionaries and a
            tuple of the element hashes for lists, tuples and sets.

    References:
        http://stackoverflow.com/questions/5884066/hashing-a-python-dictionary
    """
    if isinstance(o, DictProxyType):
        # A class __dict__ proxy: only the non-dunder attributes take part
        o = {k: v for k, v in o.items() if not k.startswith('__')}
    if isinstance(o, set):
        # Equal sets may iterate in different orders, so put the element
        # hashes into a canonical order.  They are ints or tuples, which do
        # not compare with each other on Python 3, hence the repr key.
        return tuple(sorted((make_hash(e) for e in o), key=repr))
    if isinstance(o, (tuple, list)):
        return tuple(make_hash(e) for e in o)
    if not isinstance(o, dict):
        return hash(o)
    # A frozenset hash is independent of item order, so the keys never need
    # to be sorted (sorting fails on Python 3 when key types are mixed) and
    # the input never needs to be copied.
    return hash(frozenset((k, make_hash(v)) for k, v in o.items()))
def hashstr_arr27(arr, lbl, alphabet=ALPHABET_27, **kwargs):
    """
    Same as `hashstr_arr`, but the alphabet defaults to `ALPHABET_27`.

    Args:
        arr (ndarray): passed through to `hashstr_arr`
        lbl (str): passed through to `hashstr_arr`
        alphabet (list): list of characters (default = ALPHABET_27)
        **kwargs: passed through to `hashstr_arr`

    Returns:
        str: arr_hashstr
    """
    kwargs['alphabet'] = alphabet
    return hashstr_arr(arr, lbl, **kwargs)
def hashstr27(data, alphabet=ALPHABET_27, **kwargs):
    """
    Same as `hashstr`, but the alphabet defaults to `ALPHABET_27`.

    Args:
        data (hashable): passed through to `hashstr`
        alphabet (list): list of characters (default = ALPHABET_27)
        **kwargs: passed through to `hashstr`

    Returns:
        str: hashstr
    """
    kwargs['alphabet'] = alphabet
    return hashstr(data, **kwargs)
def hashstr_arr(arr, lbl='arr', pathsafe=False, **kwargs):
    r"""
    Builds a labeled hash string for an array: the label, then the array
    shape (or length) and its `hashstr`, wrapped in brace characters.

    Args:
        arr (ndarray): array to hash; lists and tuples are accepted too
        lbl (str): prefix of the result (default = 'arr')
        pathsafe (bool): use '_' and '-' instead of parentheses as the
            brace characters (default = False)
        **kwargs: passed through to `hashstr`

    Returns:
        str: arr_hashstr

    CommandLine:
        python -m utool.util_hash --exec-hashstr_arr
        python -m utool.util_hash --test-hashstr_arr

    Example:
        >>> # ENABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> import numpy as np
        >>> arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
        >>> lbl = 'arr'
        >>> kwargs = {}
        >>> pathsafe = False
        >>> arr_hashstr = hashstr_arr(arr, lbl, pathsafe, alphabet=ALPHABET_27)
        >>> result = ('arr_hashstr = %s' % (str(arr_hashstr),))
        >>> print(result)
        arr_hashstr = arr((2,3)daukyreqnhfejkfs)

    Example2:
        >>> # ENABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> import numpy as np
        >>> arr = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
        >>> kwargs = {}
        >>> lbl = 'arr'
        >>> pathsafe = True
        >>> arr_hashstr = hashstr_arr(arr, lbl, pathsafe, alphabet=ALPHABET_27)
        >>> result = ('arr_hashstr = %s' % (str(arr_hashstr),))
        >>> print(result)
        arr_hashstr = arr-_2,3_daukyreqnhfejkfs-
    """
    if isinstance(arr, list):
        # lists become tuples so that they can be hashed
        # TODO: maybe convert to a numpy array instead? tuples might have
        # problems
        arr = tuple(arr)
    # (shape-open, shape-close, outer-open, outer-close)
    braces = ('_', '_', '-', '-') if pathsafe else ('(', ')', '(', ')')
    shape_open, shape_close, outer_open, outer_close = braces
    if isinstance(arr, tuple):
        shape_text = str(len(arr))
    else:
        # Anything else should be an ndarray; describe it by its shape
        shape_text = ','.join([str(dim) for dim in arr.shape])
    shape_part = shape_open + shape_text + shape_close
    hash_part = hashstr(arr, **kwargs)
    return ''.join([lbl, outer_open, shape_part, hash_part, outer_close])
# Types treated as "string-like" when testing for empty input:
# (basestring, bytes) on Python 2 and (str, bytes) on Python 3
stringlike = six.string_types + (bytes,)


@profile
def hashstr(data, hashlen=HASH_LEN, alphabet=ALPHABET):
    """
    Hashes ``data`` with sha512, rewrites the digest using ``alphabet`` and
    keeps the first ``hashlen`` characters.

    python -c "import utool as ut; print(ut.hashstr('abcd'))"

    Args:
        data (hashable): bytes, text (encoded as utf-8), a tuple (hashed via
            its repr), or any object hashlib accepts
        hashlen (int): (default = 16)
        alphabet (list): list of characters:

    Returns:
        str: hashstr

    CommandLine:
        python -m utool.util_hash --test-hashstr
        python3 -m utool.util_hash --test-hashstr

    Example:
        >>> # ENABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> data = 'foobar'
        >>> hashlen = 16
        >>> alphabet = ALPHABET
        >>> hashstr = hashstr(data, hashlen, alphabet)
        >>> result = ('hashstr = %s' % (str(hashstr),))
        >>> print(result)
        hashstr = mi5yum60mbxhyp+x

    Example:
        >>> # ENABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> data = ''
        >>> hashlen = 16
        >>> alphabet = ALPHABET
        >>> hashstr = hashstr(data, hashlen, alphabet)
        >>> result = ('hashstr = %s' % (str(hashstr),))
        >>> print(result)
        hashstr = 0000000000000000
    """
    if isinstance(data, tuple):
        data = repr(data)  # Hack?
    if isinstance(data, six.text_type):
        # hashlib needs bytes.  Encoding explicitly also covers Python 2
        # unicode, which would otherwise go through an implicit ASCII encode
        # and fail on non-ASCII text.
        data = data.encode('utf-8')
    if isinstance(data, stringlike) and len(data) == 0:
        # Make a special hash for empty data
        return alphabet[0] * hashlen
    # Get a 128 character hex string
    hexdigest = hashlib.sha512(data).hexdigest()
    # Shorten length of string (by increasing base)
    bigbase_str = convert_hexstr_to_bigbase(hexdigest, alphabet,
                                            bigbase=len(alphabet))
    # Truncate
    return bigbase_str[:hashlen]


# Inert reference snippet (a bare string, never executed); it looks like the
# Python 2 code used to list filename-safe characters -- unverified.  The
# literal is raw so the backslash in the NTFS list is not an invalid escape.
r"""
def valid_filename_ascii_chars():
    # Find invalid chars
    ntfs_inval = '< > : " / \ | ? *'.split(' ')
    other_inval = [' ', '\'', '.']
    #case_inval = map(chr, range(97, 123))
    case_inval = map(chr, range(65, 91))
    invalid_chars = set(ntfs_inval + other_inval + case_inval)
    # Find valid chars
    valid_chars = []
    for index in range(32, 127):
        char = chr(index)
        if not char in invalid_chars:
            print index, chr(index)
            valid_chars.append(chr(index))
    return valid_chars
valid_filename_ascii_chars()
"""
def convert_hexstr_to_bigbase(hexstr, alphabet=ALPHABET, bigbase=BIGBASE):
    """
    Packs a long hexstr into a shorter length string with a larger base

    Args:
        hexstr (str): hexadecimal number as text
        alphabet (list): digit symbols of the target base, lowest first
        bigbase (int): target base; must not exceed ``len(alphabet)``

    Returns:
        str: the same number written with ``alphabet``, most significant
            digit first, prefixed with '-' when negative
    """
    value = int(hexstr, 16)  # parse the base-16 text
    if value == 0:
        # Use the alphabet's own zero symbol; a literal '0' is wrong for
        # alphabets that do not contain it (e.g. ALPHABET_27)
        return alphabet[0]
    negative = value < 0
    value = abs(value)
    digits = []
    while value:
        value, remainder = divmod(value, bigbase)
        digits.append(alphabet[remainder])
    if negative:
        digits.append('-')
    # Digits were collected least-significant first
    digits.reverse()
    return ''.join(digits)
def hashstr_md5(data):
    """
    Returns the md5 digest of ``data`` as a hexadecimal string.

    Args:
        data (bytes): data handed directly to ``hashlib.md5``

    Returns:
        str: hashstr
    """
    md5_hasher = hashlib.md5(data)
    return md5_hasher.hexdigest()
def hashstr_sha1(data, base10=False):
    """
    Returns the sha1 digest of ``data``.

    Args:
        data (bytes): data handed directly to ``hashlib.sha1``
        base10 (bool): if True return the digest as an integer instead of a
            hexadecimal string (default = False)

    Returns:
        str or int: hashstr
    """
    hexdigest = hashlib.sha1(data).hexdigest()
    if not base10:
        return hexdigest
    return int(hexdigest, 16)
def get_file_hash(fpath, blocksize=65536, hasher=None, stride=1):
    r"""
    Hashes the contents of a file, reading it one block at a time.

    For better hashes use hasher=hashlib.sha256, and keep stride=1

    Args:
        fpath (str):  file path string
        blocksize (int): 2 ** 16. Affects speed of reading file
        hasher (None):  defaults to sha1 for fast (but insecure) hashing.
            May be a hashlib hash object (e.g. ``hashlib.sha256()``) or a
            zero-argument constructor for one (e.g. ``hashlib.sha256``).
        stride (int): strides > 1 skip data to hash, useful for faster
                      hashing, but less accurate, also makes hash dependant on
                      blocksize.

    Returns:
        bytes: the raw digest produced by ``hasher``

    References:
        http://stackoverflow.com/questions/3431825/generating-a-md5-checksum-of-a-file
        http://stackoverflow.com/questions/5001893/when-should-i-use-sha-1-and-when-should-i-use-sha-2

    CommandLine:
        python -m utool.util_hash --test-get_file_hash
        python -m utool.util_hash --test-get_file_hash:0
        python -m utool.util_hash --test-get_file_hash:1

    Example:
        >>> # DISABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> fpath = ut.grab_test_imgpath('patsy.jpg')
        >>> #blocksize = 65536  # 2 ** 16
        >>> blocksize = 2 ** 16
        >>> hasher = None
        >>> stride = 1
        >>> hashbytes_20 = get_file_hash(fpath, blocksize, hasher, stride)
        >>> result = repr(hashbytes_20)
        >>> print(result)
        '7\x07B\x0eX<sRu\xa2\x90P\xda\xb2\x84?\x81?\xa9\xd9'

        '\x13\x9b\xf6\x0f\xa3QQ \xd7"$\xe9m\x05\x9e\x81\xf6\xf2v\xe4'

        '\x16\x00\x80Xx\x8c-H\xcdP\xf6\x02\x9frl\xbf\x99VQ\xb5'

    Example:
        >>> # DISABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> #fpath = ut.grab_file_url('http://en.wikipedia.org/wiki/List_of_comets_by_type')
        >>> fpath = ut.unixjoin(ut.ensure_app_resource_dir('utool'), 'tmp.txt')
        >>> ut.write_to(fpath, ut.lorium_ipsum())
        >>> blocksize = 2 ** 3
        >>> hasher = None
        >>> stride = 2
        >>> hashbytes_20 = get_file_hash(fpath, blocksize, hasher, stride)
        >>> result = repr(hashbytes_20)
        >>> print(result)
        '5KP\xcf>R\xf6\xffO:L\xac\x9c\xd3V+\x0e\xf6\xe1n'

    Ignore:
        file_ = open(fpath, 'rb')
    """
    if hasher is None:
        hasher = hashlib.sha1()
    elif callable(hasher):
        # A constructor such as hashlib.sha256 was given (hash objects are
        # not callable); build the hash object from it
        hasher = hasher()
    with open(fpath, 'rb') as file_:
        buf = file_.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            if stride > 1:
                # skip blocks: move forward relative to the current position
                file_.seek(blocksize * (stride - 1), 1)
            buf = file_.read(blocksize)
    return hasher.digest()
def get_file_uuid(fpath, hasher=None, stride=1):
    """
    Creates a uuid from the hash of a file

    Args:
        fpath (str): file path string
        hasher (None): hash object passed to `get_file_hash`; a fresh sha1
            is used when None
        stride (int): passed to `get_file_hash`

    Returns:
        UUID: uuid_
    """
    if hasher is None:
        # sha1 gives 20 bytes of output (sha256 would give 32)
        hasher = hashlib.sha1()
    digest = get_file_hash(fpath, hasher=hasher, stride=stride)
    # A UUID takes exactly 16 bytes, so only the head of the digest is used
    return uuid.UUID(bytes=digest[:16])
def image_uuid(pil_img):
    """
    UNSAFE: DEPRECATED:

    JPEG IS NOT GUARANTEED TO PRODUCE CONSISTENT VALUES ON MULTIPLE MACHINES

    image global unique id

    Args:
        pil_img: image object providing a ``tobytes()`` method

    Returns:
        UUID: uuid_

    References:
        http://stackoverflow.com/questions/23565889/jpeg-images-have-different-pixel-values-across-multiple-devices
    """
    print('WARNING DO NOT USE utool.util_hash.image_uuid UNSAFE AND DEPRICATED')
    # The uuid is derived from the raw bytes of the image
    return hashable_to_uuid(pil_img.tobytes())
def augment_uuid(uuid_, *hashables):
    """
    Derives a new uuid from an existing uuid plus extra objects.

    Args:
        uuid_ (UUID): base uuid; its ``bytes`` are used
        *hashables: objects whose reprs are appended to the uuid bytes

    Returns:
        UUID: augmented_uuid_
    """
    base_bytes = uuid_.bytes
    # repr gives ascii data on python2 and unicode text on python3; joining
    # and encoding as utf-8 gives bytes on both
    extra_bytes = ''.join([repr(item) for item in hashables]).encode('utf-8')
    return hashable_to_uuid(base_bytes + extra_bytes)
def hashable_to_uuid(hashable_):
    """
    Builds a UUID from the first 16 bytes of the sha1 digest of the input.

    Bytes are hashed as they are and text is hashed as its utf-8 encoding,
    so equal ASCII bytes and text give the same uuid.

    TODO: ensure that python2 and python3 agree on hashes of the same
    information

    Args:
        hashable_ (hashable): hashables are bytes-like objects
           An object that supports the Buffer Protocol, like bytes, bytearray
           or memoryview. Bytes-like objects can be used for various operations
           that expect binary data, such as compression, saving to a binary
           file or sending over a socket. Some operations need the binary data
           to be mutable, in which case not all bytes-like objects can apply.
           Text is accepted as well; any other object is reduced to a string
           form first (``bytes(obj)`` on python2, ``repr(obj)`` on python3).

    Returns:
        UUID: uuid_

    CommandLine:
        python -m utool.util_hash --test-hashable_to_uuid
        python3 -m utool.util_hash --test-hashable_to_uuid:0

    Example0:
        >>> # ENABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> hashable_ = 'foobar'
        >>> uuid_ = hashable_to_uuid(hashable_)
        >>> result = str(uuid_)
        >>> print(result)
        8843d7f9-2416-211d-e9eb-b963ff4ce281

    Example1:
        >>> # ENABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> hashable_ = u'foobar'
        >>> uuid_ = hashable_to_uuid(hashable_)
        >>> result = str(uuid_)
        >>> print(result)
        8843d7f9-2416-211d-e9eb-b963ff4ce281

    Example2:
        >>> # ENABLE_DOCTEST
        >>> from utool.util_hash import *  # NOQA
        >>> hashable_ = 10
        >>> uuid_ = hashable_to_uuid(hashable_)
        >>> result = str(uuid_)
        >>> print(result)
        b1d57811-11d8-4f7b-3fe4-5a0852e59758
    """
    if isinstance(hashable_, bytes):
        # Raw data is hashed as-is.  NOTE: on python3 this used to fall
        # through to the repr() branch below because of a missing elif.
        bytes_ = hashable_
    elif isinstance(hashable_, six.text_type):
        # Text (str on python3, unicode on python2) is hashed as utf-8
        bytes_ = hashable_.encode('utf-8')
    elif six.PY2:
        bytes_ = bytes(hashable_)
    else:
        bytes_ = repr(hashable_).encode('utf-8')
    # sha1 yields 20 bytes, a UUID takes exactly 16: keep the head
    hashbytes_20 = hashlib.sha1(bytes_).digest()
    hashbytes_16 = hashbytes_20[0:16]
    return uuid.UUID(bytes=hashbytes_16)
def deterministic_uuid(hashable):
    """
    Alias of `hashable_to_uuid`.

    Args:
        hashable (hashable): passed unchanged to `hashable_to_uuid`

    Returns:
        UUID: uuid_
    """
    uuid_ = hashable_to_uuid(hashable)
    return uuid_
def random_uuid():
    """
    Returns a new uuid made by ``uuid.uuid4``.

    Returns:
        UUID: uuid_
    """
    uuid_ = uuid.uuid4()
    return uuid_
def random_nonce(length=64, alphabet=None):
    """
    returns a random string of len=<length> from <alphabet>

    The characters are drawn from the operating system's entropy source
    (``random.SystemRandom``), not the seedable default generator, so the
    result cannot be predicted or reproduced with ``random.seed``.

    Args:
        length (int): number of characters, must be positive (default = 64)
        alphabet (list): characters to draw from (default = ALPHABET_16)

    Returns:
        str: the random string
    """
    assert length > 0
    if alphabet is None:
        alphabet = ALPHABET_16
    choose = random.SystemRandom().choice
    return ''.join([choose(alphabet) for _ in range(length)])
def get_zero_uuid():
    """
    Returns the uuid whose bits are all zero.

    Returns:
        UUID: 00000000-0000-0000-0000-000000000000
    """
    return uuid.UUID(int=0)


# Cleanup namespace
# ALPHABET still refers to the base-41 list; only the helper names go away
del ALPHABET_41, ALPHABET_54


if __name__ == '__main__':
    # CommandLine:
    #     python -m utool.util_hash
    #     python -m utool.util_hash --allexamples
    #     python3 -m utool.util_hash --allexamples
    #     python -m utool.util_hash --allexamples --noface --nosrc
    import multiprocessing
    multiprocessing.freeze_support()  # for win32
    import utool as ut  # NOQA
    ut.doctest_funcs()