muutils.json_serialize.array
this utilities module handles serialization and loading of numpy and torch arrays as json
- `array_list_meta` is less efficient (arrays are stored as nested lists), but preserves both metadata and human readability.
- `array_b64_meta` is the most efficient, but is not human readable.
- `external` is mostly for use in ZANJ (https://github.com/mivanit/ZANJ)
"""this utilities module handles serialization and loading of numpy and torch arrays as json

- `array_list_meta` is less efficient (arrays are stored as nested lists), but preserves both metadata and human readability.
- `array_b64_meta` is the most efficient, but is not human readable.
- `external` is mostly for use in [`ZANJ`](https://github.com/mivanit/ZANJ)

"""

from __future__ import annotations

import base64
import typing
import warnings
from typing import Any, Iterable, Literal, Optional, Sequence

try:
    import numpy as np
except ImportError as e:
    warnings.warn(
        f"numpy is not installed, array serialization will not work: \n{e}",
        ImportWarning,
    )

from muutils.json_serialize.util import _FORMAT_KEY, JSONitem

# pylint: disable=unused-argument

# every mode name that `infer_array_mode()` can return and `load_array()` accepts
ArrayMode = Literal[
    "list",
    "array_list_meta",
    "array_hex_meta",
    "array_b64_meta",
    "external",
    "zero_dim",
]


def array_n_elements(arr) -> int:  # type: ignore[name-defined]
    """get the number of elements in an array

    supports numpy arrays and pytorch tensors. torch is never imported here:
    tensors are recognized by the qualified name of a class in their MRO.

    # Parameters:
     - `arr` : a `np.ndarray` or `torch.Tensor` (or an instance of a subclass)

    # Returns:
     - `int` : `arr.size` for numpy arrays, `arr.nelement()` for torch tensors

    # Raises:
     - `TypeError` : if `arr` is neither a numpy array nor a torch tensor
    """
    if isinstance(arr, np.ndarray):
        return arr.size
    # walk the MRO so that subclasses of `torch.Tensor` (e.g. `torch.nn.Parameter`)
    # are recognized too, not only exact `torch.Tensor` instances
    if any(
        f"{cls.__module__}.{cls.__qualname__}" == "torch.Tensor"
        for cls in type(arr).__mro__
    ):
        return arr.nelement()
    raise TypeError(f"invalid type: {type(arr)}")


def arr_metadata(arr) -> dict[str, list[int] | str | int]:
    """get metadata (shape, dtype name, element count) for an array

    # Parameters:
     - `arr` : object exposing `.shape` and `.dtype` that `array_n_elements()` accepts

    # Returns:
     - `dict` with the keys `shape` (list form of `arr.shape`),
       `dtype` (`arr.dtype.__name__` when present, else `str(arr.dtype)`)
       and `n_elements` (result of `array_n_elements(arr)`)
    """
    shape = list(arr.shape)
    dtype = arr.dtype
    dtype_name = dtype.__name__ if hasattr(dtype, "__name__") else str(dtype)
    return {
        "shape": shape,
        "dtype": dtype_name,
        "n_elements": array_n_elements(arr),
    }


def serialize_array(
    jser: "JsonSerializer",  # type: ignore[name-defined] # noqa: F821
    arr: np.ndarray,
    path: str | Sequence[str | int],
    array_mode: ArrayMode | None = None,
) -> JSONitem:
    """serialize a numpy or pytorch array in one of several modes

    zero-dimensional inputs ignore `array_mode`: they are always emitted as a
    `zero_dim` dict holding the single item under the key `data`

    for every other input, `array_mode: ArrayMode` can be one of:
    - `list`: serialize as a list of values, no metadata (equivalent to `arr.tolist()`)
    - `array_list_meta`: serialize dict with metadata, actual list under the key `data`
    - `array_hex_meta`: serialize dict with metadata, actual hex string under the key `data`
    - `array_b64_meta`: serialize dict with metadata, actual base64 string under the key `data`

    for `array_list_meta`, `array_hex_meta`, and `array_b64_meta`, the serialized object is:
    ```
    {
        _FORMAT_KEY: "<module>.<type name of arr>:<array_mode>",
        "data": <arr.tolist()|arr.tobytes().hex()|base64.b64encode(arr.tobytes()).decode()>,
        "shape": list(arr.shape),
        "dtype": <dtype name>,
        "n_elements": <number of elements>,
    }
    ```

    # Parameters:
     - `jser : JsonSerializer` only read for `jser.array_mode` when `array_mode` is `None`
     - `arr : Any` array to serialize; anything that is not a `np.ndarray` is converted with `np.array(arr)`
     - `path` not used by this function
     - `array_mode : ArrayMode` mode in which to serialize the array
       (defaults to `None` and inheriting from `jser: JsonSerializer`)

    # Returns:
     - `JSONitem`
       json serialized array

    # Raises:
     - `KeyError` : if the array mode is not valid
    """
    mode = jser.array_mode if array_mode is None else array_mode

    type_tag: str = f"{type(arr).__module__}.{type(arr).__name__}"
    as_numpy: np.ndarray = arr if isinstance(arr, np.ndarray) else np.array(arr)

    # zero-dimensional arrays short-circuit before the mode is even looked at
    if len(arr.shape) == 0:
        return {
            _FORMAT_KEY: f"{type_tag}:zero_dim",
            "data": arr.item(),
            **arr_metadata(arr),
        }

    # the only mode that does not produce a metadata dict
    if mode == "list":
        return as_numpy.tolist()

    if mode == "array_list_meta":
        suffix, payload = "array_list_meta", as_numpy.tolist()
    elif mode == "array_hex_meta":
        suffix, payload = "array_hex_meta", as_numpy.tobytes().hex()
    elif mode == "array_b64_meta":
        suffix, payload = (
            "array_b64_meta",
            base64.b64encode(as_numpy.tobytes()).decode(),
        )
    else:
        raise KeyError(f"invalid array_mode: {mode}")

    return {
        _FORMAT_KEY: f"{type_tag}:{suffix}",
        "data": payload,
        **arr_metadata(as_numpy),
    }


def infer_array_mode(arr: JSONitem) -> ArrayMode:
    """given a serialized array, infer the mode

    assumes the array was serialized via `serialize_array()`

    mappings are classified by the suffix of the string stored under
    `_FORMAT_KEY`; a plain `list` is classified as `"list"`

    # Raises:
     - `ValueError` : if `arr` is neither a mapping nor a list, if the format
       suffix is not recognized, or if `arr["data"]` has the wrong type for
       the detected mode
    """
    if not isinstance(arr, typing.Mapping):
        if isinstance(arr, list):
            return "list"
        raise ValueError(f"cannot infer array_mode from\t{type(arr) = }\n{arr = }")

    fmt: str = arr.get(_FORMAT_KEY, "")  # type: ignore

    if fmt.endswith(":array_list_meta"):
        if isinstance(arr["data"], Iterable):
            return "array_list_meta"
        raise ValueError(f"invalid list format: {type(arr['data']) = }\t{arr}")

    if fmt.endswith(":array_hex_meta"):
        if isinstance(arr["data"], str):
            return "array_hex_meta"
        raise ValueError(f"invalid hex format: {type(arr['data']) = }\t{arr}")

    if fmt.endswith(":array_b64_meta"):
        if isinstance(arr["data"], str):
            return "array_b64_meta"
        raise ValueError(f"invalid b64 format: {type(arr['data']) = }\t{arr}")

    # these two modes carry no payload type check
    if fmt.endswith(":external"):
        return "external"
    if fmt.endswith(":zero_dim"):
        return "zero_dim"

    raise ValueError(f"invalid format: {arr}")


def load_array(arr: JSONitem, array_mode: Optional[ArrayMode] = None) -> Any:
    """load a json-serialized array, infer the mode if not specified

    # Parameters:
     - `arr : JSONitem` object produced by `serialize_array()`; a `np.ndarray`
       is returned unchanged when `array_mode` is `None`
     - `array_mode : Optional[ArrayMode]` mode to load with. the mode is always
       inferred as well, and a warning is emitted when the two disagree
       (the explicitly given mode wins)

    # Returns:
     - a `np.ndarray` for all modes except `external`, where `arr["data"]` is
       returned as-is

    # Raises:
     - `ValueError` : if the mode cannot be inferred or is invalid, or if the
       loaded shape does not match the recorded `shape`
       (`array_list_meta` and `zero_dim` modes)
     - `KeyError` : if an `external` array has no `data` key
     - `AssertionError` : if `arr` has the wrong container type for the mode
    """
    # return arr if its already a numpy array
    if isinstance(arr, np.ndarray) and array_mode is None:
        return arr

    # try to infer the array_mode
    array_mode_inferred: ArrayMode = infer_array_mode(arr)
    if array_mode is None:
        array_mode = array_mode_inferred
    elif array_mode != array_mode_inferred:
        warnings.warn(
            f"array_mode {array_mode} does not match inferred array_mode {array_mode_inferred}"
        )

    # actually load the array
    if array_mode == "array_list_meta":
        assert isinstance(
            arr, typing.Mapping
        ), f"invalid list format: {type(arr) = }\n{arr = }"
        data = np.array(arr["data"], dtype=arr["dtype"])  # type: ignore
        if tuple(arr["shape"]) != tuple(data.shape):  # type: ignore
            raise ValueError(f"invalid shape: {arr}")
        return data

    elif array_mode == "array_hex_meta":
        assert isinstance(
            arr, typing.Mapping
        ), f"invalid hex format: {type(arr) = }\n{arr = }"
        data = np.frombuffer(bytes.fromhex(arr["data"]), dtype=arr["dtype"])  # type: ignore
        # `np.frombuffer` over immutable `bytes` yields a read-only view;
        # copy so the result is writable like in the list-based modes
        return data.reshape(arr["shape"]).copy()  # type: ignore

    elif array_mode == "array_b64_meta":
        assert isinstance(
            arr, typing.Mapping
        ), f"invalid b64 format: {type(arr) = }\n{arr = }"
        data = np.frombuffer(base64.b64decode(arr["data"]), dtype=arr["dtype"])  # type: ignore
        # same read-only issue as in the hex branch
        return data.reshape(arr["shape"]).copy()  # type: ignore

    elif array_mode == "list":
        assert isinstance(
            arr, typing.Sequence
        ), f"invalid list format: {type(arr) = }\n{arr = }"
        return np.array(arr)  # type: ignore
    elif array_mode == "external":
        # assume ZANJ has taken care of it
        assert isinstance(arr, typing.Mapping)
        if "data" not in arr:
            raise KeyError(
                f"invalid external array, expected key 'data', got keys: '{list(arr.keys())}' and arr: {arr}"
            )
        return arr["data"]
    elif array_mode == "zero_dim":
        assert isinstance(arr, typing.Mapping)
        data = np.array(arr["data"])
        dtype_name = arr.get("dtype")
        if dtype_name is not None:
            # restore the recorded dtype so that e.g. a float32 scalar does not
            # come back as float64
            try:
                data = data.astype(dtype_name)
            except (TypeError, ValueError, OverflowError):
                # the recorded name is not usable by numpy (the zero-dim
                # metadata is taken from the original object, which may not be
                # a numpy array); keep the dtype numpy inferred
                pass
        if tuple(arr["shape"]) != tuple(data.shape):  # type: ignore
            raise ValueError(f"invalid shape: {arr}")
        return data
    else:
        raise ValueError(f"invalid array_mode: {array_mode}")
ArrayMode =
typing.Literal['list', 'array_list_meta', 'array_hex_meta', 'array_b64_meta', 'external', 'zero_dim']
def
array_n_elements(arr) -> int:
def array_n_elements(arr) -> int:  # type: ignore[name-defined]
    """get the number of elements in an array

    supports numpy arrays and pytorch tensors. torch is never imported here:
    tensors are recognized by the qualified name of a class in their MRO.

    # Parameters:
     - `arr` : a `np.ndarray` or `torch.Tensor` (or an instance of a subclass)

    # Returns:
     - `int` : `arr.size` for numpy arrays, `arr.nelement()` for torch tensors

    # Raises:
     - `TypeError` : if `arr` is neither a numpy array nor a torch tensor
    """
    if isinstance(arr, np.ndarray):
        return arr.size
    # walk the MRO so that subclasses of `torch.Tensor` (e.g. `torch.nn.Parameter`)
    # are recognized too, not only exact `torch.Tensor` instances
    if any(
        f"{cls.__module__}.{cls.__qualname__}" == "torch.Tensor"
        for cls in type(arr).__mro__
    ):
        return arr.nelement()
    raise TypeError(f"invalid type: {type(arr)}")
get the number of elements in an array
def
arr_metadata(arr) -> dict[str, list[int] | str | int]:
def arr_metadata(arr) -> dict[str, list[int] | str | int]:
    """get metadata (shape, dtype name, element count) for an array

    # Parameters:
     - `arr` : object exposing `.shape` and `.dtype` that `array_n_elements()` accepts

    # Returns:
     - `dict` with the keys `shape` (list form of `arr.shape`),
       `dtype` (`arr.dtype.__name__` when present, else `str(arr.dtype)`)
       and `n_elements` (result of `array_n_elements(arr)`)
    """
    shape = list(arr.shape)
    dtype = arr.dtype
    # prefer the dtype's `__name__` when it has one, otherwise its string form
    dtype_name = dtype.__name__ if hasattr(dtype, "__name__") else str(dtype)
    return {
        "shape": shape,
        "dtype": dtype_name,
        "n_elements": array_n_elements(arr),
    }
get metadata for a numpy array
def
serialize_array( jser: "'JsonSerializer'", arr: numpy.ndarray, path: Union[str, Sequence[str | int]], array_mode: Optional[Literal['list', 'array_list_meta', 'array_hex_meta', 'array_b64_meta', 'external', 'zero_dim']] = None) -> Union[bool, int, float, str, NoneType, List[Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]], Dict[str, Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]]]:
def serialize_array(
    jser: "JsonSerializer",  # type: ignore[name-defined] # noqa: F821
    arr: np.ndarray,
    path: str | Sequence[str | int],
    array_mode: ArrayMode | None = None,
) -> JSONitem:
    """convert a numpy array or pytorch tensor into a json-compatible object

    zero-dimensional inputs ignore `array_mode`: they are always emitted as a
    `zero_dim` dict holding the single item under the key `data`

    for every other input, `array_mode: ArrayMode` selects the output:
    - `list`: plain nested list of values, no metadata (equivalent to `arr.tolist()`)
    - `array_list_meta`: dict with metadata, nested list under the key `data`
    - `array_hex_meta`: dict with metadata, hex string of the raw bytes under the key `data`
    - `array_b64_meta`: dict with metadata, base64 string of the raw bytes under the key `data`

    the dict produced by `array_list_meta`, `array_hex_meta` and `array_b64_meta` is:
    ```
    {
        _FORMAT_KEY: "<module>.<type name of arr>:<array_mode>",
        "data": <arr.tolist()|arr.tobytes().hex()|base64.b64encode(arr.tobytes()).decode()>,
        "shape": list(arr.shape),
        "dtype": <dtype name>,
        "n_elements": <number of elements>,
    }
    ```

    # Parameters:
     - `jser : JsonSerializer` only read for `jser.array_mode` when `array_mode` is `None`
     - `arr : Any` array to serialize; anything that is not a `np.ndarray` is converted with `np.array(arr)`
     - `path` not used by this function
     - `array_mode : ArrayMode` mode in which to serialize the array
       (defaults to `None` and inheriting from `jser: JsonSerializer`)

    # Returns:
     - `JSONitem`
       json serialized array

    # Raises:
     - `KeyError` : if the array mode is not valid
    """
    # fall back to the serializer-wide setting when no mode is given
    mode = jser.array_mode if array_mode is None else array_mode

    type_tag: str = f"{type(arr).__module__}.{type(arr).__name__}"
    as_numpy: np.ndarray = arr if isinstance(arr, np.ndarray) else np.array(arr)

    # zero-dimensional arrays short-circuit before the mode is even looked at
    if len(arr.shape) == 0:
        return {
            _FORMAT_KEY: f"{type_tag}:zero_dim",
            "data": arr.item(),
            **arr_metadata(arr),
        }

    # the only mode that does not produce a metadata dict
    if mode == "list":
        return as_numpy.tolist()

    # the remaining modes differ only in the tag suffix and the payload encoding
    if mode == "array_list_meta":
        suffix, payload = "array_list_meta", as_numpy.tolist()
    elif mode == "array_hex_meta":
        suffix, payload = "array_hex_meta", as_numpy.tobytes().hex()
    elif mode == "array_b64_meta":
        suffix, payload = (
            "array_b64_meta",
            base64.b64encode(as_numpy.tobytes()).decode(),
        )
    else:
        raise KeyError(f"invalid array_mode: {mode}")

    return {
        _FORMAT_KEY: f"{type_tag}:{suffix}",
        "data": payload,
        **arr_metadata(as_numpy),
    }
serialize a numpy or pytorch array in one of several modes
if the object is zero-dimensional, simply get the unique item
`array_mode: ArrayMode` can be one of:
- `list`: serialize as a list of values, no metadata (equivalent to `arr.tolist()`)
- `array_list_meta`: serialize dict with metadata, actual list under the key `data`
- `array_hex_meta`: serialize dict with metadata, actual hex string under the key `data`
- `array_b64_meta`: serialize dict with metadata, actual base64 string under the key `data`

for `array_list_meta`, `array_hex_meta`, and `array_b64_meta`, the serialized object is:
{
    _FORMAT_KEY: "<module>.<type name of arr>:<array_list_meta|array_hex_meta|array_b64_meta>",
    "shape": list(arr.shape),
    "dtype": str(arr.dtype),
    "n_elements": <number of elements in arr>,
    "data": <arr.tolist()|arr.tobytes().hex()|base64.b64encode(arr.tobytes()).decode()>,
}
Parameters:
- `arr : Any` array to serialize
- `array_mode : ArrayMode` mode in which to serialize the array (defaults to `None` and inheriting from `jser: JsonSerializer`)
Returns:
JSONitem
json serialized array
Raises:
KeyError
: if the array mode is not valid
def
infer_array_mode( arr: Union[bool, int, float, str, NoneType, List[Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]], Dict[str, Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]]]) -> Literal['list', 'array_list_meta', 'array_hex_meta', 'array_b64_meta', 'external', 'zero_dim']:
def infer_array_mode(arr: JSONitem) -> ArrayMode:
    """work out which array mode a serialized array was written in

    assumes the array was serialized via `serialize_array()`

    mappings are classified by the suffix of the string stored under
    `_FORMAT_KEY`; a plain `list` is classified as `"list"`

    # Parameters:
     - `arr : JSONitem` the serialized array

    # Returns:
     - `ArrayMode` the detected mode

    # Raises:
     - `ValueError` : if `arr` is neither a mapping nor a list, if the format
       suffix is not recognized, or if `arr["data"]` has the wrong type for
       the detected mode
    """
    # anything that is not a mapping is either a bare list or unsupported
    if not isinstance(arr, typing.Mapping):
        if isinstance(arr, list):
            return "list"
        raise ValueError(f"cannot infer array_mode from\t{type(arr) = }\n{arr = }")

    fmt: str = arr.get(_FORMAT_KEY, "")  # type: ignore

    if fmt.endswith(":array_list_meta"):
        if isinstance(arr["data"], Iterable):
            return "array_list_meta"
        raise ValueError(f"invalid list format: {type(arr['data']) = }\t{arr}")

    if fmt.endswith(":array_hex_meta"):
        if isinstance(arr["data"], str):
            return "array_hex_meta"
        raise ValueError(f"invalid hex format: {type(arr['data']) = }\t{arr}")

    if fmt.endswith(":array_b64_meta"):
        if isinstance(arr["data"], str):
            return "array_b64_meta"
        raise ValueError(f"invalid b64 format: {type(arr['data']) = }\t{arr}")

    # these two modes carry no payload type check
    if fmt.endswith(":external"):
        return "external"
    if fmt.endswith(":zero_dim"):
        return "zero_dim"

    raise ValueError(f"invalid format: {arr}")
given a serialized array, infer the mode
assumes the array was serialized via serialize_array()
def
load_array( arr: Union[bool, int, float, str, NoneType, List[Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]], Dict[str, Union[bool, int, float, str, NoneType, List[Any], Dict[str, Any]]]], array_mode: Optional[Literal['list', 'array_list_meta', 'array_hex_meta', 'array_b64_meta', 'external', 'zero_dim']] = None) -> Any:
def load_array(arr: JSONitem, array_mode: Optional[ArrayMode] = None) -> Any:
    """load a json-serialized array, infer the mode if not specified

    # Parameters:
     - `arr : JSONitem` object produced by `serialize_array()`; a `np.ndarray`
       is returned unchanged when `array_mode` is `None`
     - `array_mode : Optional[ArrayMode]` mode to load with. the mode is always
       inferred as well, and a warning is emitted when the two disagree
       (the explicitly given mode wins)

    # Returns:
     - a `np.ndarray` for all modes except `external`, where `arr["data"]` is
       returned as-is

    # Raises:
     - `ValueError` : if the mode cannot be inferred or is invalid, or if the
       loaded shape does not match the recorded `shape`
       (`array_list_meta` and `zero_dim` modes)
     - `KeyError` : if an `external` array has no `data` key
     - `AssertionError` : if `arr` has the wrong container type for the mode
    """
    # return arr if its already a numpy array
    if isinstance(arr, np.ndarray) and array_mode is None:
        return arr

    # try to infer the array_mode
    array_mode_inferred: ArrayMode = infer_array_mode(arr)
    if array_mode is None:
        array_mode = array_mode_inferred
    elif array_mode != array_mode_inferred:
        warnings.warn(
            f"array_mode {array_mode} does not match inferred array_mode {array_mode_inferred}"
        )

    # actually load the array
    if array_mode == "array_list_meta":
        assert isinstance(
            arr, typing.Mapping
        ), f"invalid list format: {type(arr) = }\n{arr = }"
        data = np.array(arr["data"], dtype=arr["dtype"])  # type: ignore
        if tuple(arr["shape"]) != tuple(data.shape):  # type: ignore
            raise ValueError(f"invalid shape: {arr}")
        return data

    elif array_mode == "array_hex_meta":
        assert isinstance(
            arr, typing.Mapping
        ), f"invalid hex format: {type(arr) = }\n{arr = }"
        data = np.frombuffer(bytes.fromhex(arr["data"]), dtype=arr["dtype"])  # type: ignore
        # `np.frombuffer` over immutable `bytes` yields a read-only view;
        # copy so the result is writable like in the list-based modes
        return data.reshape(arr["shape"]).copy()  # type: ignore

    elif array_mode == "array_b64_meta":
        assert isinstance(
            arr, typing.Mapping
        ), f"invalid b64 format: {type(arr) = }\n{arr = }"
        data = np.frombuffer(base64.b64decode(arr["data"]), dtype=arr["dtype"])  # type: ignore
        # same read-only issue as in the hex branch
        return data.reshape(arr["shape"]).copy()  # type: ignore

    elif array_mode == "list":
        assert isinstance(
            arr, typing.Sequence
        ), f"invalid list format: {type(arr) = }\n{arr = }"
        return np.array(arr)  # type: ignore
    elif array_mode == "external":
        # assume ZANJ has taken care of it
        assert isinstance(arr, typing.Mapping)
        if "data" not in arr:
            raise KeyError(
                f"invalid external array, expected key 'data', got keys: '{list(arr.keys())}' and arr: {arr}"
            )
        return arr["data"]
    elif array_mode == "zero_dim":
        assert isinstance(arr, typing.Mapping)
        data = np.array(arr["data"])
        dtype_name = arr.get("dtype")
        if dtype_name is not None:
            # restore the recorded dtype so that e.g. a float32 scalar does not
            # come back as float64
            try:
                data = data.astype(dtype_name)
            except (TypeError, ValueError, OverflowError):
                # the recorded name is not usable by numpy (the zero-dim
                # metadata is taken from the original object, which may not be
                # a numpy array); keep the dtype numpy inferred
                pass
        if tuple(arr["shape"]) != tuple(data.shape):  # type: ignore
            raise ValueError(f"invalid shape: {arr}")
        return data
    else:
        raise ValueError(f"invalid array_mode: {array_mode}")
load a json-serialized array, infer the mode if not specified