Module src.jsonid.analysis

Simple module to analyse JSON for technical metadata.

Functions

async def analyse_all_types(data: Any, all_depths: bool = False)
async def analyse_all_types(data: Any, all_depths: bool = False):
    """Analyse types at all levels of the object to provide an
    indication of overall complexity, and to provide some idea to
    signature devs about what data to test for. Defaults to just the
    top level to reduce complexity.
    """
    try:
        values = data.values()
    except AttributeError:
        if not isinstance(data, list):
            # Scalar input: return a single, substituted type label so
            # the output is consistent with the per-item labels below.
            return [helpers.substitute_type_text(type(data))]
        values = data
    types = []
    for item in values:
        types.append(helpers.substitute_type_text(type(item)))
        if not all_depths:
            continue
        if isinstance(item, (list, dict)):
            # Recurse into nested containers and record their member
            # types as a nested list.
            type_ = await analyse_all_types(item, all_depths)
            types.append(type_)
    return types

Analyse types at all levels of the object to provide an indication of overall complexity, and to provide some idea to signature devs about what data to test for. Defaults to just the top level to reduce complexity.
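
A minimal usage sketch. The jsonid.analysis import path is an assumption here, and the exact labels printed depend on helpers.substitute_type_text:

import asyncio

from jsonid.analysis import analyse_all_types  # hypothetical import path

sample = {"id": 1, "tags": ["a", "b"], "meta": {"created": "2024-01-01"}}

# Top level only: one type label per value in the mapping.
print(asyncio.run(analyse_all_types(sample)))

# all_depths=True also recurses into the nested list and dict and
# appends their member types as nested lists.
print(asyncio.run(analyse_all_types(sample, all_depths=True)))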

async def analyse_depth(data: Any) -> int
async def analyse_depth(data: Any) -> int:
    """Calculate the depth of the structure.

    Depth is the maximum depth of complex list and dict data-types. NB:
    all list and dict values are traversed to generate the output.
    """
    try:
        values = data.values()
    except AttributeError:
        if not isinstance(data, list):
            return 1
        values = data
    depths = [0]  # Baseline so max() is safe when no containers are found.
    for item in values:
        if isinstance(item, (dict, list)):
            # Each nested container adds one level plus its own depth.
            depths.append(1 + await analyse_depth(item))
    return max(depths)

Calculate the depth of the structure.

Depth is the maximum depth of complex list and dict data-types. NB: all list and dict values are traversed to generate the output.
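
A short sketch of the values this returns, traced from the implementation above (the import path is an assumption): a scalar returns 1, a flat container returns 0, and each nested container adds one level.

import asyncio

from jsonid.analysis import analyse_depth  # hypothetical import path

print(asyncio.run(analyse_depth("scalar")))           # 1: not a dict or list
print(asyncio.run(analyse_depth({"a": 1})))           # 0: no nested containers
print(asyncio.run(analyse_depth({"a": {"b": [1]}})))  # 2: a dict holding a list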

async def analyse_input(data: Any, content: str, all_depths: bool = False)
async def analyse_input(data: Any, content: str, all_depths: bool = False):
    """Analyse a given input and output statistics, e.g.

    * No. of keys at top level.
    * Key-types at different depths.
    * Identify heterogeneous lists.
    * Depth of complex objects, i.e. nested dicts and lists.

    """

    keys = data.keys()
    depth = await analyse_depth(data)
    content_length = len(content)
    lines = content.count("\n")
    line_warning = False
    # A single-line document may contain zero newlines (no trailing
    # newline) or exactly one, so warn in either case when the content
    # is longer than LINE_LENGTH.
    if lines <= 1 and content_length > LINE_LENGTH:
        line_warning = True
    top_level_types = await analyse_all_types(data, all_depths)
    heterogeneous_types = await analyse_list_types(data)

    return {
        "content_length": content_length,
        "number_of_lines": lines,
        "line_warning": line_warning,
        "top_level_keys_count": len(keys),
        "top_level_keys": list(keys),
        "top_level_types": top_level_types,
        "depth": depth,
        "heterogeneous_list_types": heterogeneous_types,
        "fingerprint": await fingerprint(data),
    }

Analyse a given input and output statistics, e.g.

  • No. of keys at top level.
  • Key-types at different depths.
  • Identify heterogeneous lists.
  • Depth of complex objects, i.e. nested dicts and lists.
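
A minimal usage sketch, assuming a dict input (the function calls data.keys(), so it expects a mapping at the top level) and the hypothetical import path below:

import asyncio
import json

from jsonid.analysis import analyse_input  # hypothetical import path

data = {"name": "example", "values": [1, 2, 3]}
content = json.dumps(data)  # the raw serialised text the stats describe

report = asyncio.run(analyse_input(data, content))
print(report["top_level_keys_count"])  # 2
print(report["depth"])                 # 1: the nested list adds one level
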
async def analyse_list_types(data: Any) -> bool
async def analyse_list_types(data: Any) -> bool:
    """Return information about the complexity of list objects to
    provide some indicator to developers about when they are
    processing lists of odd complexity, e.g. when processing a list
    of integers, should you normally expect a list type, bool, or
    something else?

    NB. I have a sense _get_list_types() can be combined with this
    function but maybe at the risk of complexity? (or it simplifies
    things?)

    At time of writing, the current method works quite nicely. Looks
    good, might delete later.
    """
    try:
        values = data.values()
    except AttributeError:
        if not isinstance(data, list):
            return False
        values = data
    complexity = []
    for item in values:
        if isinstance(item, list):
            # Process list, but if list contains a list, we need to
            # recurse.
            complex_types = await _get_list_types(item)
            complexity.append(complex_types)
        if isinstance(item, dict):
            complex_types = await analyse_list_types(item)
            complexity.append(complex_types)
    return True in complexity

Return information about the complexity of list objects to provide some indicator to developers about when they are processing lists of odd complexity, e.g. when processing a list of integers, should you normally expect a list type, bool, or something else?

NB. I have a sense _get_list_types() can be combined with this function but maybe at the risk of complexity? (or it simplifies things?)

At time of writing, the current method works quite nicely. Looks good, might delete later.
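
A usage sketch, under the assumption that _get_list_types() (not shown here) reports True for lists whose members are of mixed types; the import path is also an assumption:

import asyncio

from jsonid.analysis import analyse_list_types  # hypothetical import path

homogeneous = {"values": [1, 2, 3]}
mixed = {"values": [1, "two", [3]]}

print(asyncio.run(analyse_list_types(homogeneous)))  # expected: False
print(asyncio.run(analyse_list_types(mixed)))        # expected: True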

async def fingerprint(data: Any)
async def fingerprint(data: Any):
    """Fingerprint the JSON data.

    Useful thoughts on normalizing a json-like data structure:

       * https://stackoverflow.com/a/22003440/23789970
    """
    return {
        "unf": await unf_fingerprint(data),
        "cid": await ipld_cid(data),
    }

Fingerprint the JSON data.

Useful thoughts on normalizing a json-like data structure:

  • https://stackoverflow.com/a/22003440/23789970
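
A minimal usage sketch (the import path is an assumption); both fingerprints are derived from the same input:

import asyncio

from jsonid.analysis import fingerprint  # hypothetical import path

prints = asyncio.run(fingerprint({"a": 1, "b": [2, 3]}))
print(prints["unf"])  # Universal Numerical Fingerprint string
print(prints["cid"])  # IPLD-style content identifier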

async def ipld_cid(data: Any)
async def ipld_cid(data: Any):
    """Create an IPLD-compatible content identifier.

    Dissect the CID here: https://cid.ipfs.tech/
    """

    d = json.dumps(data, sort_keys=True)  # canonical form: key order cannot affect the CID
    digest = hashlib.sha256(d.encode()).hexdigest()
    cid_hash = cid_sha256_hash(digest.encode())
    return cid_hash

Create an IPLD-compatible content identifier.

Dissect the CID here: https://cid.ipfs.tech/
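
A short sketch of the canonicalisation step only, using just the standard library: sort_keys=True makes the serialisation independent of key order, which is what keeps the CID stable. cid_sha256_hash (imported elsewhere in the module) then wraps the digest into a CID.

import hashlib
import json

# Equivalent objects serialise identically once keys are sorted, so
# they hash to the same digest.
a = json.dumps({"x": 1, "y": 2}, sort_keys=True)
b = json.dumps({"y": 2, "x": 1}, sort_keys=True)
assert a == b
print(hashlib.sha256(a.encode()).hexdigest())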

async def unf_fingerprint(data: Any) -> str
async def unf_fingerprint(data: Any) -> str:
    """Calculate Universal Numerical Fingerprint (UNF) for the data.

    UNF: https://guides.dataverse.org/en/latest/developers/unf/index.html
    """
    res = None
    try:
        res = unf.unf(data)
    except TypeError:
        # unf cannot hash this structure directly; fall back to a
        # canonical (sorted-key) JSON serialisation.
        data_str = json.dumps(data, sort_keys=True)
        res = unf.unf(data_str)
    return res

Calculate Universal Numerical Fingerprint (UNF) for the data.

UNF: https://guides.dataverse.org/en/latest/developers/unf/index.html
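
A minimal usage sketch (the import path is an assumption): numeric data should hash directly, while other structures may take the sorted-key JSON fallback shown in the except branch above.

import asyncio

from jsonid.analysis import unf_fingerprint  # hypothetical import path

print(asyncio.run(unf_fingerprint([1, 2, 3])))  # UNF string for a numeric list
print(asyncio.run(unf_fingerprint({"a": 1})))   # dicts may take the JSON fallback path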