Module src.jsonid.file_processing

File processing functions.

Functions

async def analyse_data(path: str) ‑> list
Expand source code
async def analyse_data(path: str) -> list:
    """Process all objects at a given path."""
    logger.debug("processing: %s", path)
    res = []
    if "*" in path:
        paths = await process_glob(path)
        res = await analyse_json(paths)
        await output_analysis(res)
        sys.exit()
    if not os.path.exists(path):
        logger.error("path: '%s' does not exist", path)
        sys.exit(1)
    if os.path.isfile(path):
        res = await analyse_json([path])
        await output_analysis(res)
    paths = await create_manifest(path)
    if not paths:
        logger.info("no files in directory: %s", path)
        sys.exit(1)
    res = await analyse_json(paths)
    await output_analysis(res)
    sys.exit()

Process all objects at a given path.

async def analyse_json(paths: list[str])
Expand source code
async def analyse_json(paths: list[str]):
    """Analyse a JSON object."""
    analysis_res = []
    for path in paths:
        if os.path.getsize(path) == 0:
            logger.debug("'%s' is an empty file")
            continue
        valid, data, encoding, content = await identify_plaintext_bytestream(path, True)
        if not valid:
            logger.debug("%s: is not plaintext", path)
            continue
        if data == "":
            continue
        res = await analysis.analyse_input(data, content)
        res["encoding"] = encoding
        analysis_res.append(res)
    return analysis_res

Analyse a JSON object.

async def create_manifest(path: str) ‑> list[str]
Expand source code
async def create_manifest(path: str) -> list[str]:
    """Get a list of paths to process."""
    paths = []
    for root, _, files in os.walk(path):
        for file in files:
            file_path = os.path.join(root, file)
            logger.debug(file_path)
            paths.append(file_path)
    return paths

Get a list of paths to process.

def decode(content: str)
Expand source code
def decode(content: str):
    """Decode the given content stream."""
    data = ""
    try:
        data = json.loads(content)
    except json.decoder.JSONDecodeError as err:
        logger.debug("(decode) can't process: %s", err)
        return False, None
    return True, data

Decode the given content stream.

def get_date_time() ‑> str
Expand source code
def get_date_time() -> str:
    """Return a datetime string for now(),"""
    return datetime.datetime.now(timezone.utc).strftime(version.UTC_TIME_FORMAT)

Return a datetime string for now(),

async def identify_json(paths: list[str], binary: bool, simple: bool)
Expand source code
async def identify_json(paths: list[str], binary: bool, simple: bool):
    """Identify objects."""
    for idx, path in enumerate(paths):
        if os.path.getsize(path) == 0:
            logger.debug("'%s' is an empty file")
            if binary:
                logger.warning("report on binary object...")
            continue
        valid, data, encoding, _ = await identify_plaintext_bytestream(path)
        if not valid:
            logger.debug("%s: is not plaintext", path)
            if binary:
                logger.warning("report on binary object...")
            continue
        if data == "":
            continue
        logger.debug("processing: %s", path)
        res = registry.matcher(data, encoding=encoding)
        if simple:
            for item in res:
                name_ = item.name[0]["@en"]
                version_ = item.version
                if version_ is not None:
                    name_ = f"{name_}: {version_}"
                print(
                    json.dumps(
                        {
                            "identifier": item.identifier,
                            "filename": os.path.basename(path),
                            "encoding": item.encoding,
                        }
                    )
                )
            continue
        if idx == 0:
            print("---")
            print(version_header())
            print("---")
        print(f"file: {path}")
        for item in res:
            print(item)
        print("---")

Identify objects.

async def output_analysis(res: list) ‑> None
Expand source code
async def output_analysis(res: list) -> None:
    """Format the output of the analysis."""
    for item in res:
        print(json.dumps(item, indent=2))

Format the output of the analysis.

async def process_data(path: str, binary: bool, simple: bool)
Expand source code
async def process_data(path: str, binary: bool, simple: bool):
    """Process all objects at a given path."""
    logger.debug("processing: %s", path)

    if "*" in path:
        paths = await process_glob(path)
        await identify_json(paths, binary, simple)
        sys.exit(0)
    if not os.path.exists(path):
        logger.error("path: '%s' does not exist", path)
        sys.exit(1)
    if os.path.isfile(path):
        await identify_json([path], binary, simple)
        sys.exit(0)
    paths = await create_manifest(path)
    if not paths:
        logger.info("no files in directory: %s", path)
        sys.exit(1)
    await identify_json(paths, binary, simple)

Process all objects at a given path.

async def process_glob(glob_path: str)
Expand source code
async def process_glob(glob_path: str):
    """Process glob patterns provided by the user."""
    paths = []
    for path in glob.glob(glob_path):
        if os.path.isdir(path):
            paths = paths + await create_manifest(path)
        if os.path.isfile(path):
            paths.append(path)
    return paths

Process glob patterns provided by the user.

async def text_check(chars: str) ‑> bool
Expand source code
async def text_check(chars: str) -> bool:
    """Check the first characters of the file to figure out if the
    file is text. Return `True` if the file is text, i.e. no binary
    bytes are detected.

    via. https://stackoverflow.com/a/7392391
    """
    text_chars = bytearray(
        {0, 7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F}
    )
    for char in chars:
        is_binary = bool(chr(char).encode().translate(None, text_chars))
        if is_binary is True:
            return False
    return True

Check the first characters of the file to figure out if the file is text. Return True if the file is text, i.e. no binary bytes are detected.

via. https://stackoverflow.com/a/7392391

def version_header() ‑> str
Expand source code
def version_header() -> str:
    """Output a formatted version header."""
    return f"""jsonid: {version.get_version()}
scandate: {get_date_time()}""".strip()

Output a formatted version header.