wallaroo.inference_decode

  1from numbers import Number
  2from typing import Any, Dict, List
  3
  4import numpy as np
  5import pandas as pd
  6
  7
  8def convert_to_np_dtype(dtype):
  9    if dtype == "Bool":
 10        return np.bool_
 11    elif dtype == "Int8":
 12        return np.int8
 13    elif dtype == "Int16":
 14        return np.int16
 15    elif dtype == "Int32":
 16        return np.int32
 17    elif dtype == "Int64":
 18        return np.int64
 19    elif dtype == "Uint8":
 20        return np.uint8
 21    elif dtype == "Uint16":
 22        return np.uint16
 23    elif dtype == "Uint32":
 24        return np.uint32
 25    elif dtype == "Uint64":
 26        return np.uint64
 27    elif dtype == "Half":
 28        return np.float16
 29    elif dtype == "Float" or "Double":
 30        return np.float64
 31    elif dtype == "Complex64":
 32        return np.complex64
 33    elif dtype == "Complex128":
 34        return np.complex128
 35    elif dtype == "BYTES":
 36        return np.bytes_
 37    return None
 38
 39
 40def to_nd_array_list(outputs: List[Dict[str, Any]]) -> List[np.ndarray]:
 41    res: List[np.ndarray] = []
 42    for output in outputs:
 43        if "dim" in output and output["dim"] is not None:
 44            res.append(
 45                np.ndarray(
 46                    shape=tuple(output["dim"]),
 47                    buffer=np.array(output["data"]),
 48                    dtype=convert_to_np_dtype(output["dtype"]),
 49                )
 50            )
 51        else:
 52            res.append(np.array(output["data"]))
 53    return res
 54
 55
def decode_inference_result(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Decode inference results. Since they have a potentially rich structure, this could become a substantial effort in the future.

    Each appended output is a dict carrying (at least) "dim", "data" and
    "dtype" keys, suitable as input to to_nd_array_list.

    TODO: Support multiple outputs
    TODO: Support multiple data types
    """
    outputs = []
    i = -1
    for output in entry["outputs"]:
        # NOTE: the loop variable is shadowed below; the manually tracked
        # index `i` is what selects the current output.
        i += 1
        # There are a few cases where we short circuit and append
        # in special cases, like Json types. We could use this for better
        # tensor field handling as well in the future.
        append_output = True
        try:
            # Each output appears to be a single-key dict of the form
            # {type_name: payload} — TODO confirm against the engine schema.
            first_output = entry["outputs"][i]
            first_type = next(iter(first_output.keys()))
        except Exception:
            raise RuntimeError(
                f"Missing output decoding inference response. entry: {entry}"
            )

        # Deliberately clobbers the loop variable; holds the decoded
        # payload (or None until one of the branches below fills it in).
        output = None
        # Tensor type names the engine can emit directly; their payloads
        # are appended as-is after the dim/data post-check at the bottom.
        known_types = {
            "Compact",
            "Half",
            "Float",
            "Double",
            "Uint8",
            "Int8",
            "Uint16",
            "Int16",
            "Uint32",
            "Int32",
            "Uint64",
            "Int64",
            "Complex64",
            "Complex128",
            "Bool",
            "String",
        }

        if first_type in known_types:
            output = first_output[first_type]
            # Tag the payload so to_nd_array_list can pick a numpy dtype.
            output["dtype"] = first_type
        elif first_type == "Json":
            # If we can identify the tensor field, chase that down and return it
            try:

                # Case 1: the Json payload names its tensor field(s)
                # explicitly; follow the first one into its first output.
                if "tensor_fields" in first_output[first_type]["data"][0]:
                    field = first_output[first_type]["data"][0]["tensor_fields"][
                        0
                    ]  # 'tensor'
                    key = next(
                        iter(
                            first_output[first_type]["data"][0][field]["outputs"][
                                0
                            ].keys()
                        )
                    )  # Float
                    output = first_output[first_type]["data"][0][field]["outputs"][0][
                        key
                    ]
                    output["dtype"] = key
                # TODO: when Arrow changes are merged in, think of a better way of handling these edge or special cases.
                # Case 2: a conventional "tensor" key with a list of
                # single-key {type_name: payload} dicts.
                elif "tensor" in first_output[first_type]["data"][0]:
                    key = next(
                        iter(first_output[first_type]["data"][0]["tensor"][0].keys())
                    )  # Float
                    output = first_output[first_type]["data"][0]["tensor"][0][key]
                    output["dtype"] = key

            except Exception:
                # Best-effort probe: if the structure doesn't match either
                # shape, fall through to the key-by-key parse below.
                pass

            if output is None:
                # No tensor field, try to parse keys in Json
                try:
                    nps = first_output[first_type]["data"][0]
                    dtype = None
                    for key, item in nps.items():
                        if key == "original":
                            # "original" carries the dtype for the sibling
                            # keys — NOTE(review): assumes "original" is
                            # seen before the data keys (dict order).
                            dtype = next(iter(nps["original"]["outputs"][0].keys()))
                        else:
                            # Items are appended here directly, so skip the
                            # single-append at the bottom of the loop.
                            append_output = False
                            if isinstance(item, list) or np.isscalar(item):
                                # Bare list/scalar: wrap it in the standard
                                # dim/data/dtype shape (dim unknown).
                                data: Dict[str, Any] = {
                                    "dim": None,
                                    "data": item,
                                    "dtype": dtype,
                                }
                                outputs.append(data)
                            elif (
                                isinstance(item, dict)
                                and "dim" in item
                                and "data" in item
                            ):
                                item["dtype"] = dtype
                                outputs.append(item)
                            else:
                                raise RuntimeError(
                                    "Error parsing Json results for data. Supported ouput types are list, scalars for the data method, and dictionaries with a 'dim' key and a 'data' key"
                                )

                except Exception as ex:
                    raise RuntimeError(
                        f"Unexpected format decoding inference response. data: {entry} error: {ex}"
                    )
        else:
            raise RuntimeError(
                f"Unsupported type '{first_type}' in inference response. entry: {entry}"
            )
        # Post checks to make sure it looks like numpy
        if append_output and "dim" in output.keys() and "data" in output.keys():
            outputs.append(output)
        elif append_output:
            raise RuntimeError(f"Inference output does not look like numpy: {output}")

    return outputs
176
177
def flatten_tensor(prefix: str, numeric_list: list) -> Dict[str, Number]:
    """Flatten a possibly multidimensional list of numbers into a dict.

    The input is raveled to 1-D and each element becomes a key/value pair
    named "<prefix><i>" (an underscore is appended to the prefix if it
    does not already end with one). Dimensions are not preserved since
    dataframes are 2d, and element types are whatever numpy produces for
    the input.

    For example
    [1,2,3] => {prefix_0: 1, prefix_1: 2, prefix_2: 3}.
    [[1,2],[3,4]] => {prefix_0: 1, prefix_1: 2, prefix_2: 3, prefix_3: 4}
    """
    key_prefix = prefix if prefix.endswith("_") else prefix + "_"
    flattened = np.array(numeric_list).ravel()
    return {f"{key_prefix}{i}": value for i, value in enumerate(flattened)}
197
198
def flatten_dict(prefix: str, input_dict: Dict) -> Dict[str, Any]:
    """Recursively flatten input_dict into a single-level dict.

    Assumes simple value types (str, numbers, dicts, and lists). Dict
    values are flattened recursively with the key joined into the prefix;
    list values become one k/v pair per element (numeric lists go through
    flatten_tensor, string lists are numbered directly, anything nested is
    stringified); empty lists map to None; all other values are copied
    through under "<prefix><key>".
    """
    result: Dict[str, Any] = {}
    for key, value in input_dict.items():
        name = f"{prefix}{key}"
        if type(value) is dict:
            result.update(flatten_dict(name + "_", value))
        elif type(value) is list:
            if not value:
                result[name] = None
            elif type(value[0]) is str:
                for idx, element in enumerate(value):
                    result[f"{name}_{idx}"] = element
            elif type(value[0]) in (float, int, bool):
                result.update(flatten_tensor(name + "_", value))
            else:
                # Things like check_failures have nested structs; those are
                # stringified rather than flattened further.
                result[name] = str(value)
        else:
            result[name] = value
    return result
228
229
def inference_logs_to_dataframe(logs: List[Dict[str, Any]]) -> pd.DataFrame:
    """Flatten a list of inference log records into a 2-D dataframe.

    Very similar to dict_list_to_dataframe, but inference logs carry
    hierarchical "original_data" (inputs) and "outputs" fields that need
    their own flattening and column naming."""

    def _flatten_inputs(original_data: Dict[str, Any]) -> Dict[str, Any]:
        # original_data maps tensor name -> multi-dimensional list. The
        # tensor name isn't available/used elsewhere (e.g. in assays), so
        # for consistency columns are numbered "input_<i>_<j>" instead.
        flat: Dict[str, Any] = {}
        for idx, tensor in enumerate(original_data.values()):
            flat.update(flatten_tensor(f"input_{idx}", tensor))
        return flat

    def _flatten_outputs(outputs: List[Dict[str, Dict[str, Any]]]) -> Dict[str, Any]:
        # The key piece of info from each output is its nested "data"
        # field (possibly multi-dimensional); pull it out and number its
        # elements "output_<i>_<j>".
        flat: Dict[str, Any] = {}
        for idx, output_entry in enumerate(outputs):
            for type_value in output_entry.values():
                flat.update(flatten_tensor(f"output_{idx}", type_value["data"]))
        return flat

    def _flatten_record(record) -> Dict[str, Any]:
        # Work on a shallow copy so the caller's record is untouched.
        record = record.copy()
        # Pull out and flatten the hierarchical pieces first...
        inputs_flat = _flatten_inputs(record.pop("original_data"))
        outputs_flat = _flatten_outputs(record.pop("outputs"))
        # ...then flatten what remains (should be straightforward) and
        # merge everything into one row.
        row = flatten_dict("", record)
        row.update(inputs_flat)
        row.update(outputs_flat)
        return row

    return pd.DataFrame([_flatten_record(log) for log in logs])
286
287
def dict_list_to_dataframe(assay_results: List[Dict[str, Any]]) -> pd.DataFrame:
    """Flatten each dict and load the resulting rows into a dataframe.

    Primarily for assay result lists but can be used for any list of
    simple dicts."""
    flattened_rows = [flatten_dict("", result) for result in assay_results]
    return pd.DataFrame(flattened_rows)
def convert_to_np_dtype(dtype):
 9def convert_to_np_dtype(dtype):
10    if dtype == "Bool":
11        return np.bool_
12    elif dtype == "Int8":
13        return np.int8
14    elif dtype == "Int16":
15        return np.int16
16    elif dtype == "Int32":
17        return np.int32
18    elif dtype == "Int64":
19        return np.int64
20    elif dtype == "Uint8":
21        return np.uint8
22    elif dtype == "Uint16":
23        return np.uint16
24    elif dtype == "Uint32":
25        return np.uint32
26    elif dtype == "Uint64":
27        return np.uint64
28    elif dtype == "Half":
29        return np.float16
30    elif dtype == "Float" or "Double":
31        return np.float64
32    elif dtype == "Complex64":
33        return np.complex64
34    elif dtype == "Complex128":
35        return np.complex128
36    elif dtype == "BYTES":
37        return np.bytes_
38    return None
def to_nd_array_list(outputs: List[Dict[str, Any]]) -> List[numpy.ndarray]:
41def to_nd_array_list(outputs: List[Dict[str, Any]]) -> List[np.ndarray]:
42    res: List[np.ndarray] = []
43    for output in outputs:
44        if "dim" in output and output["dim"] is not None:
45            res.append(
46                np.ndarray(
47                    shape=tuple(output["dim"]),
48                    buffer=np.array(output["data"]),
49                    dtype=convert_to_np_dtype(output["dtype"]),
50                )
51            )
52        else:
53            res.append(np.array(output["data"]))
54    return res
def decode_inference_result(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
 57def decode_inference_result(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
 58    """
 59    Decode inference results. Since they have a potentially rich structure, this could become a substantial effort in the future.
 60
 61    TODO: Support multiple outputs
 62    TODO: Support multiple data types
 63    """
 64    outputs = []
 65    i = -1
 66    for output in entry["outputs"]:
 67        i += 1
 68        # There are a few cases where we short circuit and append
 69        # in special cases, like Json types. We could use this for better
 70        # tensor field handling as well in the future.
 71        append_output = True
 72        try:
 73            first_output = entry["outputs"][i]
 74            first_type = next(iter(first_output.keys()))
 75        except Exception:
 76            raise RuntimeError(
 77                f"Missing output decoding inference response. entry: {entry}"
 78            )
 79
 80        output = None
 81        known_types = {
 82            "Compact",
 83            "Half",
 84            "Float",
 85            "Double",
 86            "Uint8",
 87            "Int8",
 88            "Uint16",
 89            "Int16",
 90            "Uint32",
 91            "Int32",
 92            "Uint64",
 93            "Int64",
 94            "Complex64",
 95            "Complex128",
 96            "Bool",
 97            "String",
 98        }
 99
100        if first_type in known_types:
101            output = first_output[first_type]
102            output["dtype"] = first_type
103        elif first_type == "Json":
104            # If we can identify the tensor field, chase that down and return it
105            try:
106
107                if "tensor_fields" in first_output[first_type]["data"][0]:
108                    field = first_output[first_type]["data"][0]["tensor_fields"][
109                        0
110                    ]  # 'tensor'
111                    key = next(
112                        iter(
113                            first_output[first_type]["data"][0][field]["outputs"][
114                                0
115                            ].keys()
116                        )
117                    )  # Float
118                    output = first_output[first_type]["data"][0][field]["outputs"][0][
119                        key
120                    ]
121                    output["dtype"] = key
122                # TODO: when Arrow changes are merged in, think of a better way of handling these edge or special cases.
123                elif "tensor" in first_output[first_type]["data"][0]:
124                    key = next(
125                        iter(first_output[first_type]["data"][0]["tensor"][0].keys())
126                    )  # Float
127                    output = first_output[first_type]["data"][0]["tensor"][0][key]
128                    output["dtype"] = key
129
130            except Exception:
131                pass
132
133            if output is None:
134                # No tensor field, try to parse keys in Json
135                try:
136                    nps = first_output[first_type]["data"][0]
137                    dtype = None
138                    for key, item in nps.items():
139                        if key == "original":
140                            dtype = next(iter(nps["original"]["outputs"][0].keys()))
141                        else:
142                            append_output = False
143                            if isinstance(item, list) or np.isscalar(item):
144                                data: Dict[str, Any] = {
145                                    "dim": None,
146                                    "data": item,
147                                    "dtype": dtype,
148                                }
149                                outputs.append(data)
150                            elif (
151                                isinstance(item, dict)
152                                and "dim" in item
153                                and "data" in item
154                            ):
155                                item["dtype"] = dtype
156                                outputs.append(item)
157                            else:
158                                raise RuntimeError(
159                                    "Error parsing Json results for data. Supported ouput types are list, scalars for the data method, and dictionaries with a 'dim' key and a 'data' key"
160                                )
161
162                except Exception as ex:
163                    raise RuntimeError(
164                        f"Unexpected format decoding inference response. data: {entry} error: {ex}"
165                    )
166        else:
167            raise RuntimeError(
168                f"Unsupported type '{first_type}' in inference response. entry: {entry}"
169            )
170        # Post checks to make sure it looks like numpy
171        if append_output and "dim" in output.keys() and "data" in output.keys():
172            outputs.append(output)
173        elif append_output:
174            raise RuntimeError(f"Inference output does not look like numpy: {output}")
175
176    return outputs

Decode inference results. Since they have a potentially rich structure, this could become a substantial effort in the future.

TODO: Support multiple outputs TODO: Support multiple data types

def flatten_tensor(prefix: str, numeric_list: list) -> Dict[str, numbers.Number]:
179def flatten_tensor(prefix: str, numeric_list: list) -> Dict[str, Number]:
180    """Converts a possibly multidimentionsl list of numbers into a
181    dict where each item in the list is represented by a key value pair
182    in the dict. Does not maintain dimensions since dataframes are 2d.
183    Does not maintain/manage types since it should work for any type supported
184    by numpy.
185
186    For example
187    [1,2,3] => {prefix_0: 1, prefix_1: 2, prefix_2: 3}.
188    [[1,2],[3,4]] => {prefix_0_0: 1, prefix_0_1: 2, prefix_1_0: 3, prefix_1_1: 4}
189    """
190    output_dict = {}
191    a = np.array(numeric_list).ravel()
192    if not prefix.endswith("_"):
193        prefix = prefix + "_"
194    for i, v in enumerate(a):
195        name = f"{prefix}{i}"
196        output_dict[name] = v
197    return output_dict

Converts a possibly multidimensional list of numbers into a dict where each item in the list is represented by a key value pair in the dict. Does not maintain dimensions since dataframes are 2d. Does not maintain/manage types since it should work for any type supported by numpy.

For example [1,2,3] => {prefix_0: 1, prefix_1: 2, prefix_2: 3}. [[1,2],[3,4]] => {prefix_0: 1, prefix_1: 2, prefix_2: 3, prefix_3: 4}

def flatten_dict(prefix: str, input_dict: Dict) -> Dict[str, Any]:
200def flatten_dict(prefix: str, input_dict: Dict) -> Dict[str, Any]:
201    """Recursively flattens the input dict, setting the values on the output dict.
202    Assumes simple value types (str, numbers, dicts, and lists).
203    If a value is a dict it is flattened recursively.
204    If a value is a list each item is set as a new k, v pair.
205    """
206
207    output_dict = {}
208    for k, v in input_dict.items():
209        name = f"{prefix}{k}"
210        if type(v) == list:
211            if len(v) > 0:
212                v_type = type(v[0])
213                if v_type == str:
214                    for i, item in enumerate(v):
215                        output_dict[f"{name}_{i}"] = item
216                elif v_type in [float, int, bool]:
217                    output_dict.update(flatten_tensor(name + "_", v))
218                else:
219                    # Things like check_failures have nested structs
220                    output_dict[name] = str(v)
221                    # raise TypeError(f"Can't handle type {v_type} for key '{k}'")
222            else:
223                output_dict[name] = None
224        elif type(v) == dict:
225            output_dict.update(flatten_dict(name + "_", v))
226        else:
227            output_dict[name] = v
228    return output_dict

Recursively flattens the input dict, setting the values on the output dict. Assumes simple value types (str, numbers, dicts, and lists). If a value is a dict it is flattened recursively. If a value is a list each item is set as a new k, v pair.

def inference_logs_to_dataframe(logs: List[Dict[str, Any]]) -> pandas.core.frame.DataFrame:
231def inference_logs_to_dataframe(logs: List[Dict[str, Any]]) -> pd.DataFrame:
232    """Very similar to dict_list_to_dataframe but specific to inference
233    logs since they have input and output heiararchical fields/structures
234    that must be treated in particular ways."""
235
236    def flatten_inputs(inputs: Dict[str, Any]):
237        """Inputs/original_data is a dict of string to values of multi_dimensional
238        list and need to have their own numbering to fit that. We should use the
239        input name but for consistency we'll use 'input' (since the name is not
240        available elsewhere)."""
241        fo = {}
242        for i, (_k, v) in enumerate(inputs.items()):
243            # One day we may be able to use the tensor name
244            # But for now we don't have or use that in the
245            # assays so use 'input'
246            fo.update(flatten_tensor(f"input_{i}", v))
247        return fo
248
249    def flatten_outputs(outputs: List[Dict[str, Dict[str, Any]]]) -> Dict[str, Any]:
250        """The key piece of info we want from outputs is the nested 'data' field
251        which could be a multi-dimensional list. This function pulls that out and
252        numbers/names it appropriately."""
253        fo = {}
254        for i, output_dict in enumerate(outputs):  # loop through each output
255            for _k, ov in output_dict.items():  # get 'data' for each output
256                data = ov["data"]
257                fo.update(flatten_tensor(f"output_{i}", data))
258
259        return fo
260
261    def process_inference_record(temp_log) -> Dict[str, Any]:
262        """Manipulate an inference record dict and flatten it."""
263        # Copy the log so we don't change the original
264        temp_log = temp_log.copy()
265
266        # Process and delete the inputs and outputs
267        inputs = temp_log["original_data"]
268        input_dict = flatten_inputs(inputs)
269        del temp_log["original_data"]
270
271        outputs = temp_log["outputs"]
272        output_dict = flatten_outputs(outputs)
273        del temp_log["outputs"]
274
275        # Flatten the temp log (should be straightforward)
276        # and add in the input and output dicts
277        output_log = flatten_dict("", temp_log)
278        output_log.update(input_dict)
279        output_log.update(output_dict)
280
281        return output_log
282
283    processed_logs = [process_inference_record(log) for log in logs]
284
285    df = pd.DataFrame(processed_logs)
286    return df

Very similar to dict_list_to_dataframe but specific to inference logs since they have input and output hierarchical fields/structures that must be treated in particular ways.

def dict_list_to_dataframe(assay_results: List[Dict[str, Any]]) -> pandas.core.frame.DataFrame:
289def dict_list_to_dataframe(assay_results: List[Dict[str, Any]]) -> pd.DataFrame:
290    """Primarily for assay result lists but can be used for any list of simple
291    dicts."""
292    res = [flatten_dict("", r) for r in assay_results]
293
294    df = pd.DataFrame(res)
295    return df

Primarily for assay result lists but can be used for any list of simple dicts.