Module data_request_api.stable.content.consolidate_export

Functions

def map_data(data, mapping_table)
Expand source code
def map_data(data, mapping_table):
    """
    Maps the data to the one-base structure using the mapping table.

    Parameters
    ----------
    data : dict
        Three-base or one-base Airtable export.
    mapping_table : dict
        The mapping table to apply to map to one base. For every target
        table the keys "source_base", "source_table" and "internal_mapping"
        are read, plus "internal_filters" if present.

    Returns
    -------
    dict
        Mapped data with one-base structure, i.e. a dict with the single
        top-level key "Data Request".

    Raises
    ------
    ValueError
        If ``data`` has neither one nor three/four top-level keys, if an
        internal mapping specifies an unknown "operation" or "entry_type",
        or if "entry_type" is "record_id" while "base_copy_of_table" is
        not set.
    KeyError
        If a base or table required by an internal mapping is not part of
        ``data``, or if no record could be mapped for an attribute.

    Note
    ----
        If the data is already one-base, the content of its single base is
        returned under the key "Data Request" (tables are renamed as given
        by ``version_consistency``); neither filters nor mappings are
        applied in that case.

        Records are not copied: mapped attribute values are written into
        the record dicts of the input ``data``.
    """
    logger = get_logger()
    missing_bases = []
    missing_tables = []
    mapped_data = {"Data Request": {}}

    # Reset filtered records
    global filtered_records
    if filtered_records:
        filtered_records = []

    if len(data) in (3, 4):

        # Get filtered records
        for table, mapinfo in mapping_table.items():
            source_base = mapinfo["source_base"]
            if source_base not in data:
                continue
            source_table = mapinfo["source_table"]
            if (
                source_table not in data[source_base]
                or "internal_filters" not in mapinfo
            ):
                continue
            filters = mapinfo["internal_filters"]
            for record_id, record in data[source_base][source_table][
                "records"
            ].items():
                # One boolean per filter; filters with an unrecognised
                # operator contribute nothing and therefore never reject.
                filter_results = []
                for filter_key, filter_val in filters.items():
                    if filter_key not in record:
                        filter_results.append(False)
                        continue
                    operator = filter_val["operator"]
                    value = record[filter_key]
                    # Treat scalars like one-element lists so that both
                    # cases share the same membership test.
                    candidates = value if isinstance(value, list) else [value]
                    if operator == "nonempty":
                        filter_results.append(bool(value))
                    elif operator == "in":
                        filter_results.append(
                            any(fj in filter_val["values"] for fj in candidates)
                        )
                    elif operator == "not in":
                        filter_results.append(
                            any(fj not in filter_val["values"] for fj in candidates)
                        )
                if not all(filter_results):
                    name_info = f"({record['name']})" if "name" in record else ""
                    logger.debug(
                        f"Filtered out record '{record_id}' {name_info} from '{table}'."
                    )
                    filtered_records.append(record_id)
        logger.info(f"Filtered {len(filtered_records)} records.")
        # Set for constant-time membership tests while copying the records
        filtered_ids = set(filtered_records)

        # Perform mapping in case of three-base structure
        for table, mapinfo in mapping_table.items():
            intm = mapinfo["internal_mapping"]
            source_base = mapinfo["source_base"]
            if source_base not in data:
                missing_bases.append(source_base)
                continue
            source_table = mapinfo["source_table"]
            if source_table not in data[source_base]:
                missing_tables.append(source_table)
                continue

            # Copy the selected data to the one-base structure
            logger.debug(f"Mapping '{source_base}' -> '{table}'")
            source = data[source_base][source_table]
            mapped_records = {
                record_id: record
                for record_id, record in source["records"].items()
                if record_id not in filtered_ids
            }
            mapped_data["Data Request"][table] = {
                **source,
                "records": mapped_records,
            }

            # For each attribute that requires mapping
            for attr, spec in intm.items():
                # Only records that survived the filters are mapped; the
                # filtered ones are not part of the mapped table and must
                # neither be written to nor trigger mapping errors.
                for record_id, record in mapped_records.items():
                    if (
                        attr not in record
                        or record[attr] is None
                        or record[attr] == ""
                        or record[attr] == []
                    ):
                        logger.debug(
                            f"{table}: Attribute '{attr}' not found for record '{record_id}'."
                        )
                        continue
                    attr_vals = record[attr]

                    # operation
                    if spec["operation"] == "split":
                        attr_vals = re.split(r"\s*,\s*", attr_vals)
                    elif spec["operation"] == "":
                        if isinstance(attr_vals, str):
                            attr_vals = [attr_vals]
                    else:
                        raise ValueError(
                            f"Unknown internal mapping operation for attribute '{attr}' ('{source_table}'): '{spec['operation']}'"
                        )

                    # Get mapped record_ids
                    # entry_type - single record_id or list of record_ids
                    # - map by record_id
                    if spec["entry_type"] == "record_id":
                        if not spec["base_copy_of_table"]:
                            raise ValueError(
                                "A copy of the table in the same base is required if 'entry_type' is set to 'record_id', "
                                f"but 'base_copy_of_table' is set to False: '{source_table}' - '{attr}'"
                            )
                        elif spec["base"] not in data:
                            raise KeyError(
                                f"Base '{spec['base']}' not found in data."
                            )
                        elif spec["base_copy_of_table"] not in data[source_base]:
                            raise KeyError(
                                f"Table '{spec['base_copy_of_table']}' not found in base '{source_base}'."
                            )
                        # The copies of the linked records in the current base
                        record_copies = data[source_base][
                            spec["base_copy_of_table"]
                        ]["records"]
                        # The entire list of records in the base of origin
                        recordlist = data[spec["base"]][spec["table"]]["records"]
                        recordIDs_new = []
                        for attr_val in attr_vals:
                            recordID_new = _map_record_id(
                                record_copies[attr_val],
                                recordlist,
                                spec["map_by_key"],
                            )
                            if recordID_new:
                                recordIDs_new.append(recordID_new)
                    # entry_type - name (eg. unique label or similar)
                    # - map by attribute value
                    elif spec["entry_type"] == "name":
                        recordlist = data[spec["base"]][spec["table"]]["records"]
                        # Only the first key is used if several are given
                        map_by_key = (
                            spec["map_by_key"]
                            if isinstance(spec["map_by_key"], str)
                            else spec["map_by_key"][0]
                        )
                        recordIDs_new = []
                        for attr_val in attr_vals:
                            recordID_new = _map_attribute(
                                attr_val, recordlist, map_by_key
                            )
                            if recordID_new:
                                recordIDs_new.append(recordID_new)
                    else:
                        raise ValueError(
                            f"Unknown 'entry_type' specified for attribute '{attr}' ('{source_table}'): '{spec['entry_type']}'"
                        )
                    if not recordIDs_new:
                        raise KeyError(
                            f"{table} (record '{record_id}'): For attribute '{attr}' no records could be mapped."
                        )
                    record[attr] = recordIDs_new

        if missing_bases:
            # Several tables usually share a base, hence the de-duplication
            warnings.warn(
                f"Encountered missing bases when consolidating the data: {set(missing_bases)}"
            )
        if missing_tables:
            warnings.warn(
                f"Encountered missing tables when consolidating the data: {missing_tables}"
            )
        return mapped_data
    # Return the data if it is already one-base
    elif len(data) == 1:
        version = next(iter(data.keys())).replace("Data Request ", "")
        mapped_data = next(iter(data.values()))
        if version in version_consistency:
            for tfrom, tto in version_consistency[version].items():
                logger.debug(
                    f"Consistency across versions - renaming table: {tfrom} -> {tto}"
                )
                mapped_data[tto] = mapped_data.pop(tfrom)
        return {"Data Request": mapped_data}
    else:
        raise ValueError("The loaded Data Request has an unexpected data structure.")

Maps the data to the one-base structure using the mapping table.

Parameters

data : dict
Three-base or one-base Airtable export.

mapping_table : dict
The mapping table to apply to map to one base.

Returns

dict
Mapped data with one-base structure.

Note

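If the data is already one-base, the content of its single base is returned under the key "Data Request" (with tables renamed for consistency across versions where applicable).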
def transform_content(data)
Expand source code
def transform_content(data):
    """
    Tidy up the data request content and split it in two parts.

    Each record of each table is cleaned from unwanted keys and gets a "uid"
    and, where missing, a "name" entry. Afterwards all record ids are
    substituted by the corresponding uids and the tables are distributed over
    the data request and the vocabulary server.

    Parameters:
    data (dict): The data request content to be transformed.

    Returns:
    tuple: A tuple containing the transformed data request and vocabulary server.
    """
    global default_count
    logger = get_logger()

    tables = data["Data Request"]
    # Table-specific keys to drop on top of the pattern-based ones (none so far)
    to_remove_keys = {}
    # Lookup from record id to uid, filled while tidying the records
    uid_by_record_id = {}

    for table_name in sorted(tables):
        records = tables[table_name]["records"]
        for record_id in sorted(records):
            record = records[record_id]
            # Snapshot of the keys before the record is modified; all checks
            # below refer to this snapshot.
            original_keys = list(record)

            # Collect table-specific keys and keys matching the patterns
            unwanted = set(to_remove_keys.get(table_name, ()))
            for key in original_keys:
                lowered = key.lower()
                if (
                    "(MJ)" in key
                    or "test" in lowered
                    or ("last" in lowered and "modified" in lowered)
                    or "count" in lowered
                ):
                    unwanted.add(key)
            for key in unwanted.intersection(original_keys):
                del record[key]

            # Make sure the record has a lower-case "uid" entry
            if "UID" in original_keys:
                record["uid"] = record.pop("UID")
            elif "uid" not in original_keys:
                # Fall back to a generated uid from the running counter
                uid = default_template.format(default_count)
                record["uid"] = uid
                default_count += 1
                logger.debug(
                    f"Undefined uid for element {os.sep.join([table_name, 'records', record_id])}, set {uid}"
                )
            uid_by_record_id[record_id] = record["uid"]

            # Make sure the record has some kind of name
            if table_name == "Opportunity" and "Title of Opportunity" in original_keys:
                record["name"] = record.pop("Title of Opportunity")
            elif not ("name" in original_keys or "Name" in original_keys):
                record["name"] = "undef"

    # Substitute record ids by uids. Working on the serialised JSON replaces
    # every quoted occurrence at once, no matter whether it is a key or a value.
    logger.debug("Replace record ids by uids")
    serialised = json.dumps(tables)
    for record_id, uid in uid_by_record_id.items():
        serialised = serialised.replace(f'"{record_id}"', f'"{uid}"')
    content = json.loads(serialised)

    # For the tables listed here, the given fields go to the data request:
    # source table -> (target name, ((target key, source key, default factory), ...))
    split_rules = {
        "Opportunity": (
            "opportunities",
            (
                ("experiments_groups", "Experiment Groups", list),
                ("variables_groups", "Variable Groups", list),
                ("themes", "Themes", list),
                ("ensemble_size", "Ensemble Size", lambda: 1),
            ),
        ),
        "Variable Group": (
            "variable_groups",
            (
                ("variables", "Variables", list),
                ("mips", "MIPs", list),
                ("priority", "Priority Level", lambda: None),
            ),
        ),
        "Experiment Group": (
            "experiment_groups",
            (("experiments", "Experiments", list),),
        ),
    }

    logger.debug("Build DR and VS")
    data_request = {}
    vocabulary_server = {}
    for table_name in sorted(content):
        records = content[table_name]["records"]
        if table_name not in split_rules:
            # All remaining tables go to the vocabulary server unchanged
            vocabulary_server[table_name] = records
            continue
        target, fields = split_rules[table_name]
        data_request[target] = {}
        vocabulary_server[target] = {}
        for uid, value in records.items():
            # The listed fields are moved to the data request, whatever is
            # left of the record goes to the vocabulary server.
            data_request[target][uid] = {
                out_key: value.pop(src_key, make_default())
                for out_key, src_key, make_default in fields
            }
            vocabulary_server[target][uid] = value
    return data_request, vocabulary_server

Transform the data request content into a tidy format.

This function takes the data request content as input, tidies it up by removing unnecessary keys and renaming others, and returns the transformed data request and vocabulary server.

Parameters: data (dict): The data request content to be transformed.

Returns: tuple: A tuple containing the transformed data request and vocabulary server.