wallaroo.inference_decode
from numbers import Number
from typing import Any, Dict, List

import numpy as np
import pandas as pd
def convert_to_np_dtype(dtype):
    if dtype == "Bool":
        return np.bool_
    elif dtype == "Int8":
        return np.int8
    elif dtype == "Int16":
        return np.int16
    elif dtype == "Int32":
        return np.int32
    elif dtype == "Int64":
        return np.int64
    elif dtype == "Uint8":
        return np.uint8
    elif dtype == "Uint16":
        return np.uint16
    elif dtype == "Uint32":
        return np.uint32
    elif dtype == "Uint64":
        return np.uint64
    elif dtype == "Half":
        return np.float16
    elif dtype in ("Float", "Double"):
        # Both Float and Double map to float64
        return np.float64
    elif dtype == "Complex64":
        return np.complex64
    elif dtype == "Complex128":
        return np.complex128
    elif dtype == "BYTES":
        return np.bytes_
    return None
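For example (the dtype strings are the engine type names handled above; unrecognized names return None):

import numpy as np
from wallaroo.inference_decode import convert_to_np_dtype

assert convert_to_np_dtype("Int32") is np.int32
assert convert_to_np_dtype("Half") is np.float16
assert convert_to_np_dtype("Double") is np.float64
assert convert_to_np_dtype("Tensor") is None  # unknown names fall through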
def to_nd_array_list(outputs: List[Dict[str, Any]]) -> List[np.ndarray]:
    res: List[np.ndarray] = []
    for output in outputs:
        if "dim" in output and output["dim"] is not None:
            res.append(
                np.ndarray(
                    shape=tuple(output["dim"]),
                    buffer=np.array(output["data"]),
                    dtype=convert_to_np_dtype(output["dtype"]),
                )
            )
        else:
            res.append(np.array(output["data"]))
    return res
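A minimal sketch of the expected input: each output dict carries a 'data' list plus optional 'dim' and 'dtype' keys, matching what the loop above reads.

from wallaroo.inference_decode import to_nd_array_list

outputs = [
    # With "dim": the data is reshaped to (2, 2) with the named dtype.
    {"data": [1.0, 2.0, 3.0, 4.0], "dim": [2, 2], "dtype": "Double"},
    # Without "dim": returned as a plain 1-d array.
    {"data": [5, 6, 7], "dim": None},
]
arrays = to_nd_array_list(outputs)
print(arrays[0].shape)  # (2, 2)
print(arrays[1])        # [5 6 7]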
def decode_inference_result(entry: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Decode inference results. Since they have a potentially rich structure,
    this could become a substantial effort in the future.

    TODO: Support multiple outputs
    TODO: Support multiple data types
    """
    known_types = {
        "Compact",
        "Half",
        "Float",
        "Double",
        "Uint8",
        "Int8",
        "Uint16",
        "Int16",
        "Uint32",
        "Int32",
        "Uint64",
        "Int64",
        "Complex64",
        "Complex128",
        "Bool",
        "String",
    }

    outputs = []
    for first_output in entry["outputs"]:
        # There are a few special cases, like Json types, where we short
        # circuit and append directly. We could use this for better tensor
        # field handling as well in the future.
        append_output = True
        try:
            first_type = next(iter(first_output.keys()))
        except Exception:
            raise RuntimeError(
                f"Missing output decoding inference response. entry: {entry}"
            )

        output = None

        if first_type in known_types:
            output = first_output[first_type]
            output["dtype"] = first_type
        elif first_type == "Json":
            # If we can identify the tensor field, chase that down and return it
            try:
                if "tensor_fields" in first_output[first_type]["data"][0]:
                    field = first_output[first_type]["data"][0]["tensor_fields"][0]  # 'tensor'
                    key = next(
                        iter(
                            first_output[first_type]["data"][0][field]["outputs"][0].keys()
                        )
                    )  # e.g. 'Float'
                    output = first_output[first_type]["data"][0][field]["outputs"][0][key]
                    output["dtype"] = key
                # TODO: when Arrow changes are merged in, think of a better way
                # of handling these edge or special cases.
                elif "tensor" in first_output[first_type]["data"][0]:
                    key = next(
                        iter(first_output[first_type]["data"][0]["tensor"][0].keys())
                    )  # e.g. 'Float'
                    output = first_output[first_type]["data"][0]["tensor"][0][key]
                    output["dtype"] = key
            except Exception:
                pass

            if output is None:
                # No tensor field, try to parse keys in the Json payload
                try:
                    nps = first_output[first_type]["data"][0]
                    dtype = None
                    for key, item in nps.items():
                        if key == "original":
                            dtype = next(iter(nps["original"]["outputs"][0].keys()))
                        else:
                            append_output = False
                            if isinstance(item, list) or np.isscalar(item):
                                data: Dict[str, Any] = {
                                    "dim": None,
                                    "data": item,
                                    "dtype": dtype,
                                }
                                outputs.append(data)
                            elif (
                                isinstance(item, dict)
                                and "dim" in item
                                and "data" in item
                            ):
                                item["dtype"] = dtype
                                outputs.append(item)
                            else:
                                raise RuntimeError(
                                    "Error parsing Json results for data. Supported output "
                                    "types are lists or scalars for the data field, and "
                                    "dictionaries with a 'dim' key and a 'data' key"
                                )
                except Exception as ex:
                    raise RuntimeError(
                        f"Unexpected format decoding inference response. data: {entry} error: {ex}"
                    )
        else:
            raise RuntimeError(
                f"Unsupported type '{first_type}' in inference response. entry: {entry}"
            )

        # Post checks to make sure the output looks like numpy
        if append_output and "dim" in output.keys() and "data" in output.keys():
            outputs.append(output)
        elif append_output:
            raise RuntimeError(f"Inference output does not look like numpy: {output}")

    return outputs
Decode inference results. Since they have a potentially rich structure, this could become a substantial effort in the future.
TODO: Support multiple outputs
TODO: Support multiple data types
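A minimal sketch of the simple (non-Json) path, assuming a response fragment shaped the way the known_types branch above expects; real responses can be richer.

from wallaroo.inference_decode import decode_inference_result, to_nd_array_list

# Hypothetical response fragment carrying a single Float output.
entry = {"outputs": [{"Float": {"data": [0.1, 0.2, 0.3], "dim": [3]}}]}

decoded = decode_inference_result(entry)
print(decoded)
# [{'data': [0.1, 0.2, 0.3], 'dim': [3], 'dtype': 'Float'}]

# The decoded list can be handed straight to to_nd_array_list.
print(to_nd_array_list(decoded)[0])  # [0.1 0.2 0.3]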
def flatten_tensor(prefix: str, numeric_list: list) -> Dict[str, Number]:
    """Converts a possibly multidimensional list of numbers into a
    dict where each item in the list is represented by a key value pair
    in the dict. Does not maintain dimensions since dataframes are 2d.
    Does not maintain/manage types since it should work for any type supported
    by numpy.

    For example
    [1,2,3] => {prefix_0: 1, prefix_1: 2, prefix_2: 3}
    [[1,2],[3,4]] => {prefix_0: 1, prefix_1: 2, prefix_2: 3, prefix_3: 4}
    """
    output_dict = {}
    a = np.array(numeric_list).ravel()
    if not prefix.endswith("_"):
        prefix = prefix + "_"
    for i, v in enumerate(a):
        name = f"{prefix}{i}"
        output_dict[name] = v
    return output_dict
Converts a possibly multidimensional list of numbers into a dict where each item in the list is represented by a key value pair in the dict. Does not maintain dimensions since dataframes are 2d. Does not maintain/manage types since it should work for any type supported by numpy.
For example:
[1,2,3] => {prefix_0: 1, prefix_1: 2, prefix_2: 3}
[[1,2],[3,4]] => {prefix_0: 1, prefix_1: 2, prefix_2: 3, prefix_3: 4}
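The examples run as-is; note that nested lists are raveled and numbered sequentially:

from wallaroo.inference_decode import flatten_tensor

flat = flatten_tensor("out", [[1, 2], [3, 4]])
print(sorted(flat.keys()))  # ['out_0', 'out_1', 'out_2', 'out_3']
print(flat["out_2"])        # 3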
def flatten_dict(prefix: str, input_dict: Dict) -> Dict[str, Any]:
    """Recursively flattens the input dict, setting the values on the output dict.
    Assumes simple value types (str, numbers, dicts, and lists).
    If a value is a dict, it is flattened recursively.
    If a value is a list, each item is set as a new k, v pair.
    """
    output_dict = {}
    for k, v in input_dict.items():
        name = f"{prefix}{k}"
        if type(v) == list:
            if len(v) > 0:
                v_type = type(v[0])
                if v_type == str:
                    for i, item in enumerate(v):
                        output_dict[f"{name}_{i}"] = item
                elif v_type in [float, int, bool]:
                    output_dict.update(flatten_tensor(name + "_", v))
                else:
                    # Things like check_failures have nested structs
                    output_dict[name] = str(v)
            else:
                output_dict[name] = None
        elif type(v) == dict:
            output_dict.update(flatten_dict(name + "_", v))
        else:
            output_dict[name] = v
    return output_dict
Recursively flattens the input dict, setting the values on the output dict. Assumes simple value types (str, numbers, dicts, and lists). If a value is a dict, it is flattened recursively. If a value is a list, each item is set as a new k, v pair.
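A small illustration with a hypothetical record (the field names here are made up for the example):

from wallaroo.inference_decode import flatten_dict

record = {
    "model": "ccfraud",
    "elapsed": 42,
    "scores": [0.1, 0.9],
    "meta": {"version": "1.0"},
}
flat = flatten_dict("", record)
print(sorted(flat.keys()))
# ['elapsed', 'meta_version', 'model', 'scores_0', 'scores_1']
print(flat["meta_version"])  # 1.0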
def inference_logs_to_dataframe(logs: List[Dict[str, Any]]) -> pd.DataFrame:
    """Very similar to dict_list_to_dataframe but specific to inference
    logs, since they have hierarchical input and output fields/structures
    that must be treated in particular ways."""

    def flatten_inputs(inputs: Dict[str, Any]):
        """inputs/original_data is a dict mapping each input name to a
        multi-dimensional list, so each input needs its own numbering.
        We would prefer the input name, but for consistency we use 'input'
        (since the name is not available elsewhere)."""
        fo = {}
        for i, (_k, v) in enumerate(inputs.items()):
            # One day we may be able to use the tensor name,
            # but for now we don't have or use that in the
            # assays, so use 'input'
            fo.update(flatten_tensor(f"input_{i}", v))
        return fo

    def flatten_outputs(outputs: List[Dict[str, Dict[str, Any]]]) -> Dict[str, Any]:
        """The key piece of info we want from outputs is the nested 'data' field,
        which could be a multi-dimensional list. This function pulls that out and
        numbers/names it appropriately."""
        fo = {}
        for i, output_dict in enumerate(outputs):  # loop through each output
            for _k, ov in output_dict.items():  # get 'data' for each output
                data = ov["data"]
                fo.update(flatten_tensor(f"output_{i}", data))

        return fo

    def process_inference_record(temp_log) -> Dict[str, Any]:
        """Flatten a single inference record dict."""
        # Copy the log so we don't change the original
        temp_log = temp_log.copy()

        # Process and delete the inputs and outputs
        inputs = temp_log["original_data"]
        input_dict = flatten_inputs(inputs)
        del temp_log["original_data"]

        outputs = temp_log["outputs"]
        output_dict = flatten_outputs(outputs)
        del temp_log["outputs"]

        # Flatten the rest of the log (should be straightforward)
        # and add in the input and output dicts
        output_log = flatten_dict("", temp_log)
        output_log.update(input_dict)
        output_log.update(output_dict)

        return output_log

    processed_logs = [process_inference_record(log) for log in logs]

    df = pd.DataFrame(processed_logs)
    return df
Very similar to dict_list_to_dataframe but specific to inference logs, since they have hierarchical input and output fields/structures that must be treated in particular ways.
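A sketch with a single hypothetical log record; real log entries carry more fields, but the input/output flattening works the same way:

from wallaroo.inference_decode import inference_logs_to_dataframe

logs = [
    {
        "time": 1640995200000,
        "original_data": {"tensor": [[1.0, 2.0]]},
        "outputs": [{"Float": {"data": [0.7], "dim": [1]}}],
    }
]
df = inference_logs_to_dataframe(logs)
print(df.columns.tolist())
# ['time', 'input_0_0', 'input_0_1', 'output_0_0']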
def dict_list_to_dataframe(assay_results: List[Dict[str, Any]]) -> pd.DataFrame:
    """Primarily for assay result lists but can be used for any list of simple
    dicts."""
    res = [flatten_dict("", r) for r in assay_results]

    df = pd.DataFrame(res)
    return df
Primarily for assay result lists but can be used for any list of simple dicts.
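For example, with hypothetical assay-style results:

from wallaroo.inference_decode import dict_list_to_dataframe

results = [
    {"score": 0.12, "window": {"start": "2023-01-01", "end": "2023-01-02"}},
    {"score": 0.75, "window": {"start": "2023-01-02", "end": "2023-01-03"}},
]
df = dict_list_to_dataframe(results)
print(df.columns.tolist())   # ['score', 'window_start', 'window_end']
print(df["score"].tolist())  # [0.12, 0.75]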