Module multiformats.multihash.hashfun

Implementation of raw hash functions used by multihash multicodecs.

Hash functions are implemented using the following libraries: hashlib and pyskein.

Core functionality is provided by the exists() and get() functions, which can be used to check whether an implementation with a given name is known, and if so to get the corresponding pair of hash function and max digest size:

>>> multihash.hashfun.exists("sha2-256")
True
>>> multihash.hashfun.get("sha2-256")
(<function _hashlib_sha.<locals>.hashfun at 0x0000013F4A3C6160>, 32)

The hash functions take a single bytes input (the data) and return a bytes output (the hash digest). The max digest sizes (if not None) are used to sense-check hash digests passed to encode() and/or obtained from decode(): telling whether a digest has been generated by a hash function is deemed to be computationally unfeasible in general, but hash digests of length greater than the max digest size can always be discounted as invalid.

Expand source code
"""
    Implementation of raw hash functions used by multihash multicodecs.

    Hash functions are implemented using the following libraries:

    - [`hashlib`](https://docs.python.org/3/library/hashlib.html)
    - [`pyskein`](https://pythonhosted.org/pyskein/)

    Core functionality is provided by the `exists` and `get` functions, which can be used to check
    whether an implementation with given name is known, and if so to get the corresponding pair
    of hash function and max digest size:

    ```py
    >>> multihash.hashfun.exists("sha2-256")
    True
    >>> multihash.hashfun.get("sha2-256")
    (<function _hashlib_sha.<locals>.hashfun at 0x0000013F4A3C6160>, 32)
    ```

    The hash functions take a single `bytes` input (the data) and return a `bytes` output (the hash digest).
    The max digest sizes (if not `None`) are used to sense-check hash digests passed to `multiformats.multihash.encode`
    and/or obtained from `multiformats.multihash.decode`: telling whether a digest has been generated by a hash function
    is deemed to be computationally unfeasible in general, but hash digests of length greater than the max digest size
    can always be discounted as invalid.
"""

import hashlib
from typing import Callable, Dict, Optional, Tuple
from typing_validation import validate

import skein # type: ignore

from multiformats import multicodec
from multiformats.varint import BytesLike

# Type alias for raw hash functions: callables taking bytes-like data and returning the digest as `bytes`.
Hashfun = Callable[[BytesLike], bytes]
# Registry of implementations: maps a multihash multicodec name to its (hash function, max digest size) pair.
_hashfun: Dict[str, Tuple[Hashfun, Optional[int]]] = {}

def get(name: str) -> Tuple[Hashfun, Optional[int]]:
    """
        Looks up the implementation registered for the multihash multicodec with given name.

        The implementation is returned as a pair of a hash function and a max digest size,
        where the max digest size is possibly `None`.
        Raises `KeyError` if no implementation has been registered under this name.

        ```py
        >>> multihash.hashfun.get("sha2-256")
        (<function _hashlib_sha.<locals>.hashfun at 0x0000013F4A3C6160>, 32)
        ```
    """
    validate(name, str)
    # Registered values are always tuples, so `None` unambiguously signals a missing entry.
    implementation = _hashfun.get(name)
    if implementation is None:
        raise KeyError(f"No implementation for multihash multicodec {repr(name)}.")
    return implementation

def exists(name: str) -> bool:
    """
        Tells whether an implementation is registered for the multihash multicodec with given name.

        ```py
        >>> multihash.hashfun.exists("sha2-256")
        True
        ```
    """
    validate(name, str)
    is_registered = name in _hashfun
    return is_registered


def register(name: str, hashfun: Hashfun, digest_size: Optional[int], *, overwrite: bool = False) -> None:
    """
        Registers a hash function and max digest size implementing the multihash multicodec with given name,
        which must already exist.

        The `hashfun` argument must be callable, otherwise `TypeError` is raised.
        The `digest_size` argument must be a positive integer or `None`, otherwise `ValueError` is raised.

        The optional keyword argument `overwrite` (default: `False`) can be used to overwrite an existing implementation.
        If `overwrite` is `False`, raises `ValueError` if an implementation with the same name already exists.
        When no implementation is registered yet under the given name, raises `ValueError` if the multicodec
        with that name is not tagged as a multihash multicodec.

        Example usage (from the source code of this module):

        ```py
        register("sha1", _hashlib_sha(1), 20) # max digest size is 20 bytes, i.e. 160 bits
        register(f"sha2-256", _hashlib_sha(2, 256), 256//8)
        ```
    """
    validate(name, str)
    # validate(hashfun, Hashfun) # TODO: not yet supported by typing-validation
    # Until then, at least reject objects which cannot be called at all, so that the
    # failure surfaces at registration time rather than when a digest is first computed.
    if not callable(hashfun):
        raise TypeError(f"Expected a callable hash function, found {repr(hashfun)}.")
    validate(digest_size, Optional[int])
    validate(overwrite, bool)
    if digest_size is not None and digest_size <= 0:
        raise ValueError("Digest size must be positive or None.")
    if not overwrite and name in _hashfun:
        raise ValueError(f"An implementation for the multihash multicodec named {repr(name)} already exists.")
    if name not in _hashfun:
        # Names already in the registry passed this check when they were first registered.
        multihash = multicodec.get(name)
        if multihash.tag != "multihash":
            raise ValueError(f"Multicodec '{multihash.name}' exists, but it is not a multihash multicodec.")
    _hashfun[name] = (hashfun, digest_size)


def unregister(name: str) -> None:
    """
        Removes the implementation registered for the multihash multicodec with given name.
        Raises `KeyError` if no implementation is registered under this name.
    """
    validate(name, str)
    if name in _hashfun:
        del _hashfun[name]
        return
    raise KeyError(f"There is no implementation for multihash multicodec with name {repr(name)}.")

def _identity(data: BytesLike) -> bytes:
    """
        Identity "hash function": returns the input data itself, converted to `bytes`.
    """
    validate(data, BytesLike)
    digest = bytes(data)
    return digest

# The identity function returns the data itself as digest, so no max digest size is set.
register("identity", _identity, None)

def _hashlib_sha(version: int, digest_bits: Optional[int] = None) -> Hashfun:
    """
        Builds a hash function backed by the `hashlib` SHA constructor selected by `version`:
        `sha1` for version 1, `sha{digest_bits}` for version 2 and `sha3_{digest_bits}` for version 3.
    """
    constructor_names = ("sha1", f"sha{digest_bits}", f"sha3_{digest_bits}")
    constructor = getattr(hashlib, constructor_names[version-1])
    def hashfun(data: BytesLike) -> bytes:
        validate(data, BytesLike)
        # The hashlib constructors accept the initial data directly.
        return constructor(data).digest() # pylint: disable = no-member
    return hashfun

# SHA-1 is registered with its single digest size.
register("sha1", _hashlib_sha(1), 20) # 20B = 160 bits

# SHA-2 implementations, registered as `sha2-256` and `sha2-512` (max digest size in bytes).
for digest_bits in (256, 512):
    register(f"sha2-{digest_bits}", _hashlib_sha(2, digest_bits), digest_bits//8)

# SHA-3 implementations, registered as `sha3-224`, `sha3-256`, `sha3-384` and `sha3-512`.
for digest_bits in (224, 256, 384, 512):
    register(f"sha3-{digest_bits}", _hashlib_sha(3, digest_bits), digest_bits//8)

def _hashlib_shake(digest_bits: int) -> Hashfun:
    """
        Builds a hash function backed by the `hashlib` constructor `shake_{digest_bits//2}`,
        producing digests of `digest_bits//8` bytes.
    """
    constructor = getattr(hashlib, f"shake_{digest_bits//2}")
    digest_bytes = digest_bits//8
    def hashfun(data: BytesLike) -> bytes:
        validate(data, BytesLike)
        # SHAKE digests have variable length, so the desired size is passed explicitly.
        return constructor(data).digest(digest_bytes) # type: ignore # pylint: disable = no-member
    return hashfun

# The number in the registered name is half the digest size in bits:
# `shake-128` is registered with 256-bit digests and `shake-256` with 512-bit digests.
for digest_bits in (256, 512):
    register(f"shake-{digest_bits//2}", _hashlib_shake(digest_bits), digest_bits//8)

def _hashlib_blake2(version: str, digest_bits: int) -> Hashfun:
    """
        Builds a hash function backed by the `hashlib` constructor `blake2{version}`,
        configured to produce digests of `digest_bits//8` bytes.
    """
    constructor = getattr(hashlib, f"blake2{version}")
    def hashfun(data: BytesLike) -> bytes:
        validate(data, BytesLike)
        # The data is passed at construction time, together with the desired digest size in bytes.
        return constructor(data, digest_size=digest_bits//8).digest() # pylint: disable = no-member
    return hashfun

# BLAKE2b is registered for every digest size from 8 to 512 bits, BLAKE2s for every
# digest size from 8 to 256 bits, in steps of 8 bits (i.e. whole bytes).
for blake2_version in ("b", "s"):
    for digest_bits in range(8, 513 if blake2_version == "b" else 257, 8):
        register(f"blake2{blake2_version}-{digest_bits}", _hashlib_blake2(blake2_version, digest_bits), digest_bits//8)

def _skein(version: int, digest_bits: int) -> Hashfun:
    """
        Builds a hash function backed by the `skein` constructor `skein{version}`,
        configured with the given digest size in bits.
    """
    constructor = getattr(skein, f"skein{version}")
    def hashfun(data: BytesLike) -> bytes:
        validate(data, BytesLike)
        hasher: hashlib._Hash = constructor(digest_bits=digest_bits) # pylint: disable = no-member
        hasher.update(data)
        digest = hasher.digest()
        return digest
    return hashfun

# Each Skein variant (256, 512, 1024) is registered for every digest size from 8 bits
# up to its own version number, in steps of 8 bits (i.e. whole bytes).
for skein_version in (256, 512, 1024):
    for digest_bits in range(8, skein_version+1, 8):
        register(f"skein{skein_version}-{digest_bits}", _skein(skein_version, digest_bits), digest_bits//8)

Functions

def exists(name: str) ‑> bool

Checks whether the multihash multicodec with given name has an implementation.

>>> multihash.hashfun.exists("sha2-256")
True
Expand source code
def exists(name: str) -> bool:
    """
        Tells whether an implementation is registered for the multihash multicodec with given name.

        ```py
        >>> multihash.hashfun.exists("sha2-256")
        True
        ```
    """
    validate(name, str)
    is_registered = name in _hashfun
    return is_registered
def get(name: str) ‑> Tuple[Callable[[Union[bytes, bytearray, memoryview]], bytes], Optional[int]]

Given a multihash multicodec name, returns its implementation as a pair of a hash function and a max digest size (possibly None). Raises KeyError if no implementation is available for this name.

>>> multihash.hashfun.get("sha2-256")
(<function _hashlib_sha.<locals>.hashfun at 0x0000013F4A3C6160>, 32)
Expand source code
def get(name: str) -> Tuple[Hashfun, Optional[int]]:
    """
        Looks up the implementation registered for the multihash multicodec with given name.

        The implementation is returned as a pair of a hash function and a max digest size,
        where the max digest size is possibly `None`.
        Raises `KeyError` if no implementation has been registered under this name.

        ```py
        >>> multihash.hashfun.get("sha2-256")
        (<function _hashlib_sha.<locals>.hashfun at 0x0000013F4A3C6160>, 32)
        ```
    """
    validate(name, str)
    # Registered values are always tuples, so `None` unambiguously signals a missing entry.
    implementation = _hashfun.get(name)
    if implementation is None:
        raise KeyError(f"No implementation for multihash multicodec {repr(name)}.")
    return implementation
def register(name: str, hashfun: Callable[[Union[bytes, bytearray, memoryview]], bytes], digest_size: Optional[int], *, overwrite: bool = False) ‑> None

Registers a hash function and hash digest size implementing the multihash multicodec with given name, which must already exist.

The optional keyword argument overwrite (default: False) can be used to overwrite an existing implementation. If overwrite is False, raises ValueError if an implementation with the same name already exists.

Example usage (from the source code of this module):

register("sha1", _hashlib_sha(1), 20) # max digest size is 20 bytes, i.e. 160 bits
register(f"sha2-256", _hashlib_sha(2, 256), 256//8)
Expand source code
def register(name: str, hashfun: Hashfun, digest_size: Optional[int], *, overwrite: bool = False) -> None:
    """
        Registers a hash function and max digest size implementing the multihash multicodec with given name,
        which must already exist.

        The `hashfun` argument must be callable, otherwise `TypeError` is raised.
        The `digest_size` argument must be a positive integer or `None`, otherwise `ValueError` is raised.

        The optional keyword argument `overwrite` (default: `False`) can be used to overwrite an existing implementation.
        If `overwrite` is `False`, raises `ValueError` if an implementation with the same name already exists.
        When no implementation is registered yet under the given name, raises `ValueError` if the multicodec
        with that name is not tagged as a multihash multicodec.

        Example usage (from the source code of this module):

        ```py
        register("sha1", _hashlib_sha(1), 20) # max digest size is 20 bytes, i.e. 160 bits
        register(f"sha2-256", _hashlib_sha(2, 256), 256//8)
        ```
    """
    validate(name, str)
    # validate(hashfun, Hashfun) # TODO: not yet supported by typing-validation
    # Until then, at least reject objects which cannot be called at all, so that the
    # failure surfaces at registration time rather than when a digest is first computed.
    if not callable(hashfun):
        raise TypeError(f"Expected a callable hash function, found {repr(hashfun)}.")
    validate(digest_size, Optional[int])
    validate(overwrite, bool)
    if digest_size is not None and digest_size <= 0:
        raise ValueError("Digest size must be positive or None.")
    if not overwrite and name in _hashfun:
        raise ValueError(f"An implementation for the multihash multicodec named {repr(name)} already exists.")
    if name not in _hashfun:
        # Names already in the registry passed this check when they were first registered.
        multihash = multicodec.get(name)
        if multihash.tag != "multihash":
            raise ValueError(f"Multicodec '{multihash.name}' exists, but it is not a multihash multicodec.")
    _hashfun[name] = (hashfun, digest_size)
def unregister(name: str) ‑> None

Unregisters the hash function implementation with the given multihash multicodec name. Raises KeyError if no such implementation exists.

Expand source code
def unregister(name: str) -> None:
    """
        Removes the implementation registered for the multihash multicodec with given name.
        Raises `KeyError` if no implementation is registered under this name.
    """
    validate(name, str)
    if name in _hashfun:
        del _hashfun[name]
        return
    raise KeyError(f"There is no implementation for multihash multicodec with name {repr(name)}.")