correldata
Read/write vectors of correlated data from/to a CSV file.
These data are stored in a dictionary, whose values are numpy arrays with elements which may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.
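Concretely, such a dictionary might look like the following minimal sketch (the field names and values are arbitrary):

```py
import numpy as np
import uncertainties
from correldata import uarray

data = {
    'Sample': np.array(['FOO', 'BAR']),        # strings
    'Tacid': np.array([90., 90.]),             # plain floats
    'D47': uarray(uncertainties.correlated_values(
        [0.245, 0.246],                        # nominal values
        [[25e-6, 5e-6], [5e-6, 25e-6]],        # covariance matrix
    )),                                        # floats with correlated uncertainties
}
```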
```py
# module metadata and imports
__author__ = 'Mathieu Daëron'
__contact__ = 'mathieu@daeron.fr'
__copyright__ = 'Copyright (c) 2024 Mathieu Daëron'
__license__ = 'MIT License - https://opensource.org/licenses/MIT'
__date__ = '2024-10-13'
__version__ = '1.2.2'


import os as _os
import numpy as _np
import uncertainties as _uc

from typing import Callable, Hashable, Any
```
```py
class uarray(_np.ndarray):

    __doc__ = """
    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
    of [ufloat](https://pypi.org/project/uncertainties) values
    """

    def __new__(cls, a):
        obj = _np.asarray(a).view(cls)
        return obj

    n = property(fget = _np.vectorize(lambda x: x.n))
    """Return the array of nominal values (read-only)."""

    s = property(fget = _np.vectorize(lambda x: x.s))
    """Return the array of standard errors (read-only)."""

    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
    """Return the correlation matrix of the array elements (read-only)."""

    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
    """Return the covariance matrix of the array elements (read-only)."""

    nv = n
    "Alias for `uarray.n`"

    se = s
    "Alias for `uarray.s`"

    cor = correl
    "Alias for `uarray.correl`"

    cov = covar
    "Alias for `uarray.covar`"
```
Read-only properties: `correl` returns the correlation matrix and `covar` the covariance matrix of the array elements; `cor` and `cov` are aliases for `correl` and `covar`.
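As an illustration, here is a minimal sketch of how a `uarray` behaves (the values are arbitrary):

```py
import numpy as np
import uncertainties
from correldata import uarray

# build three correlated values from a mean vector and a covariance matrix,
# then wrap them in a uarray
X = uarray(uncertainties.correlated_values([1.0, 2.0, 3.0], np.eye(3) * 0.04))

print(X.n)       # nominal values: [1. 2. 3.]
print(X.s)       # standard errors: [0.2 0.2 0.2]
print(X.correl)  # 3x3 correlation matrix (identity here)
print(X.cov)     # 3x3 covariance matrix (0.04 on the diagonal)
```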
```py
def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
    '''
    Test whether 2-D array `M` is symmetric and positive semidefinite.
    '''
    ev = _np.linalg.eigvals(M)
    return (
        _np.allclose(M, M.T)  # M is symmetric
        and _np.all(
            (ev > 0) | _np.isclose(ev, 0)
        )  # all eigenvalues are either real and strictly positive or close to zero
    )
```
Test whether 2-D array `M` is symmetric and positive semidefinite.
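A quick sketch of the check on two small, arbitrary matrices:

```py
import numpy as np
from correldata import is_symmetric_positive_semidefinite

# a valid covariance matrix (symmetric, non-negative eigenvalues)
A = np.array([[1.0, 0.5], [0.5, 1.0]])
# not a valid covariance matrix (off-diagonal term too large)
B = np.array([[1.0, 1.5], [1.5, 1.0]])

print(is_symmetric_positive_semidefinite(A))  # True
print(is_symmetric_positive_semidefinite(B))  # False (one eigenvalue is negative)
```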
```py
def smart_type(x: str):
    '''
    Convert string `x` to an int if it parses as a whole number written without
    a decimal point, to a float if it parses as any other number, and return
    the original string unchanged if it cannot be parsed as a number.
    '''
    try:
        y = float(x)
    except ValueError:
        return x
    if y % 1 == 0 and '.' not in x:
        return int(y)
    return y
```
Convert string `x` to an `int` if it parses as a whole number written without a decimal point, to a `float` if it parses as any other number, and return the original string unchanged if it cannot be parsed as a number.
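A few illustrative calls (a sketch):

```py
from correldata import smart_type

print(smart_type('42'))    # 42 (int: whole number, no decimal point)
print(smart_type('42.0'))  # 42.0 (float: decimal point present)
print(smart_type('.245'))  # 0.245 (float)
print(smart_type('FOO'))   # 'FOO' (unchanged: not a number)
```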
````py
def read_data(data: str, sep: str = ',', validate_covar: bool = True):
    '''
    Read correlated data from a CSV-like string.

    Column names are interpreted in the following way:
    * In most cases, each column is converted to a dict value, with the corresponding
    dict key being the column's label.
    * Columns whose label starts with `SE` are interpreted as specifying the standard
    error for the latest preceding data column.
    * Columns whose label starts with `correl` are interpreted as specifying the
    correlation matrix for the latest preceding data column. In that case, column labels
    are ignored for the rest of the columns belonging to this matrix.
    * Columns whose label starts with `covar` are interpreted as specifying the
    covariance matrix for the latest preceding data column. In that case, column labels
    are ignored for the rest of the columns belonging to this matrix.
    * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
    the latest preceding data column, by adding an underscore followed by the variable's
    label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
    * `correl` and `covar` may also be specified for any pair of variables, by adding an
    underscore followed by the two variable labels, joined by a second underscore
    (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
    correspond, respectively, to the rows and columns of this matrix.
    * Exceptions will be raised, for any given variable:
      - when specifying both `covar` and any combination of (`SE`, `correl`)
      - when specifying `correl` without `SE`

    **Arguments**
    - `data`: a CSV-like string
    - `sep`: the CSV separator
    - `validate_covar`: whether to check that the overall covariance matrix
    is symmetric and positive semidefinite. Specifying `validate_covar = False`
    bypasses this computationally expensive step.

    **Example**
    ```py
    import correldata
    data = """
    Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48
    FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0
    BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0
    BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5
    """[1:-1]
    print(correldata.read_data(data))

    # yields:
    #
    # > {
    # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
    # 'Tacid': array([90., 90., 90.]),
    # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
    # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
    # }
    ```
    '''

    # split the string into rows of typed cells
    data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
    N = len(data) - 1

    # walk the header, collecting plain values, SE columns, and correl/covar blocks
    values, se, correl, covar = {}, {}, {}, {}
    j = 0
    while j < len(data[0]):
        field = data[0][j]
        if not (
            field.startswith('SE_')
            or field.startswith('correl_')
            or field.startswith('covar_')
            or field == 'SE'
            or field == 'correl'
            or field == 'covar'
            or len(field) == 0
        ):
            values[field] = _np.array([l[j] for l in data[1:]])
            j += 1
            oldfield = field
        elif field.startswith('SE_'):
            se[field[3:]] = _np.array([l[j] for l in data[1:]])
            j += 1
        elif field == 'SE':
            se[oldfield] = _np.array([l[j] for l in data[1:]])
            j += 1
        elif field.startswith('correl_'):
            correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
            j += N
        elif field == 'correl':
            correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
            j += N
        elif field.startswith('covar_'):
            covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
            j += N
        elif field == 'covar':
            covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
            j += N

    # variables with no uncertainty information are returned as plain arrays
    nakedvalues = {}
    for k in [_ for _ in values]:
        if (
            k not in se
            and k not in correl
            and k not in covar
        ):
            nakedvalues[k] = values.pop(k)

    # consistency checks
    for x in values:
        if x in covar:
            if x in se:
                raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
            if x in correl:
                raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
        if x in correl:
            if x not in se:
                raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')

    # convert correlation matrices (and SE) into covariance matrices
    for x in correl:
        if x in values:
            covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
        else:
            for x1 in values:
                for x2 in values:
                    if x == f'{x1}_{x2}':
                        if x1 in se:
                            se1 = se[x1]
                        else:
                            if x1 in covar:
                                se1 = _np.diag(covar[x1])**0.5
                            else:
                                raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
                        if x2 in se:
                            se2 = se[x2]
                        else:
                            if x2 in covar:
                                se2 = _np.diag(covar[x2])**0.5
                            else:
                                raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x2}".')

                        covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)

    # SE without correl implies a diagonal covariance matrix
    for x in se:
        if x in values and x not in correl:
            covar[x] = _np.diag(se[x]**2)

    # fill in the transposed cross-covariance blocks
    for k in [_ for _ in covar]:
        if k not in values:
            for j1 in values:
                for j2 in values:
                    if k == f'{j1}_{j2}':
                        covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T

    # assemble the full covariance matrix of all correlated variables
    X = _np.array([_ for k in values for _ in values[k]])
    CM = _np.zeros((X.size, X.size))
    for i, vi in enumerate(values):
        for j, vj in enumerate(values):
            if vi == vj:
                if vi in covar:
                    CM[N*i:N*i+N, N*j:N*j+N] = covar[vi]
            else:
                if f'{vi}_{vj}' in covar:
                    CM[N*i:N*i+N, N*j:N*j+N] = covar[f'{vi}_{vj}']

    if validate_covar and not is_symmetric_positive_semidefinite(CM):
        raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')

    # build correlated values and slice them back into per-variable arrays
    corvalues = uarray(_uc.correlated_values(X, CM))

    allvalues = nakedvalues

    for i, x in enumerate(values):
        allvalues[x] = corvalues[i*N:i*N+N]

    return allvalues
````
Read correlated data from a CSV-like string into a dict. Plain columns become numpy arrays keyed by their label; columns labeled `SE`, `correl`, or `covar` attach standard errors, a correlation matrix, or a covariance matrix to the latest preceding data column, while the suffixed forms (`SE_foo`, `correl_bar`, `covar_baz`, `correl_foo_bar`, `covar_X_Y`) attach them to an arbitrary variable or pair of variables. Variables with uncertainty information are returned as `uarray` values. Specifying both `covar` and any combination of (`SE`, `correl`) for a variable, or `correl` without `SE`, raises an exception. Unless `validate_covar = False`, the assembled covariance matrix is checked to be symmetric and positive semidefinite.
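Building on the docstring example above, here is a sketch showing that the cross-correlations between returned variables are preserved and can be recovered with the [uncertainties](https://pypi.org/project/uncertainties) library:

```py
import uncertainties
import correldata

csv = """
Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48
FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0
BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0
BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5
"""[1:-1]

d = correldata.read_data(csv)

# correlation matrix of the three D47 values (0.5 off-diagonal, as specified above)
print(d['D47'].correl)

# full 6x6 correlation matrix of (D47, D48), including the 0.5 cross-correlations
print(uncertainties.correlation_matrix([*d['D47'], *d['D48']]))
```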
```py
def read_data_from_file(filename: str | _os.PathLike, **kwargs):
    '''
    Read correlated data from a CSV file.

    **Arguments**
    - `filename`: `str` or path to the file to read from
    - `kwargs`: passed to correldata.read_data()
    '''
    with open(filename) as fid:
        return read_data(fid.read(), **kwargs)
```
Read correlated data from a CSV file: `filename` is a `str` or path to the file to read from, and `kwargs` are passed on to `read_data()`.
```py
def f2s(
    x: Any,
    f: (str | Callable | dict),
    k: Hashable = None,
    fb: (str | Callable) = 'z.6g',
) -> str:
    '''
    Format `x` according to format `f`

    * If `f` is a string, return `f'{x:{f}}'`
    * If `f` is a callable, return `f(x)`
    * If `f` is a dict and `k` is one of its keys, return `f2s(x, f[k])`;
    otherwise format `x` using the fallback format `fb`.
    '''

    if isinstance(x, str):
        return x
    if isinstance(f, str):
        return f'{x:{f}}'
    if isinstance(f, Callable):
        return f(x)
    if isinstance(f, dict):
        if k in f:
            return f2s(x, f[k])
        if isinstance(fb, str):
            return f'{x:{fb}}'
        if isinstance(fb, Callable):
            return fb(x)
    raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')
```
Format `x` according to `f`: if `f` is a string, return `f'{x:{f}}'`; if `f` is a callable, return `f(x)`; if `f` is a dict and `k` is one of its keys, apply `f[k]`, otherwise fall back to the format `fb`. Strings are returned unchanged.
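A few illustrative calls (a sketch; the field names are arbitrary):

```py
from correldata import f2s

print(f2s(0.123456, '.3f'))                      # '0.123' (format string)
print(f2s(0.123456, lambda x: f'{100*x:.1f}%'))  # '12.3%' (callable)

fmts = {'D47': '.4f', 'Tacid': '.0f'}
print(f2s(0.123456, fmts, 'D47'))                # '0.1235' (key found in dict)
print(f2s(0.123456, fmts, 'D48', fb='.2e'))      # '1.23e-01' (fallback format)
print(f2s('FOO', fmts, 'D47'))                   # 'FOO' (strings pass through)
```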
````py
def data_string(
    data: dict,
    sep: str = ',',
    include_fields: list = None,
    exclude_fields: list = [],
    float_format: (str | dict | Callable) = 'z.6g',
    correl_format: (str | dict | Callable) = 'z.6f',
    default_float_format: (str | Callable) = 'z.6g',
    default_correl_format: (str | Callable) = 'z.6f',
    align: str = '>',
    atol: float = 1e-12,
    rtol: float = 1e-12,
):
    '''
    Generate CSV-like string from correlated data

    **Arguments**
    - `data`: dict of arrays with strings, floats or correlated data
    - `sep`: the CSV separator
    - `include_fields`: subset of fields to write; if `None`, write all fields
    - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
    to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
    - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
    (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys
    corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
    - `correl_format`: same as `float_format`, but applies to correlation matrix elements
    - `default_float_format`: only used when `float_format` is a dict; in that case, fields
    missing from `float_format.keys()` will use `default_float_format` instead.
    - `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
    - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
    - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
    when deciding whether a matrix is equal to the identity matrix or to the zero matrix
    - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
    when deciding whether a matrix is equal to the identity matrix or to the zero matrix


    **Example**

    ```py
    from correldata import _uc
    from correldata import _np
    from correldata import *

    X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
    Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

    data = dict(X=X, Y=Y, Z=X+Y)

    print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

    # yields:
    #
    # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, ,
    # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0
    # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0
    # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8
    ```
    '''
    if include_fields is None:
        include_fields = [_ for _ in data]
    # one column (or block of columns) per field
    cols, ufields = [], []
    for f in include_fields:
        if f in exclude_fields:
            continue
        if isinstance(data[f], uarray):
            ufields.append(f)
            N = data[f].size
            cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
            if f'SE_{f}' not in exclude_fields:
                cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
            if f'correl_{f}' not in exclude_fields:
                CM = _uc.correlation_matrix(data[f])
                if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
                    for i in range(N):
                        cols.append(
                            ['' if i else f'correl_{f}']
                            + [
                                f2s(
                                    CM[i,j],
                                    correl_format,
                                    f,
                                    default_correl_format,
                                )
                                for j in range(N)
                            ]
                        )

        else:
            cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])

    # cross-correlation blocks between pairs of uarray fields
    for i in range(len(ufields)):
        for j in range(i):
            if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
                continue
            CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
            if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
                for k in range(N):
                    cols.append(
                        ['' if k else f'correl_{ufields[j]}_{ufields[i]}']
                        + [
                            f2s(
                                CM[k,l],
                                correl_format,
                                f,
                                default_correl_format,
                            )
                            for l in range(N)
                        ]
                    )

    # transpose columns into rows and optionally align the CSV cells
    lines = list(map(list, zip(*cols)))

    if align:
        lengths = [max([len(e) for e in l]) for l in cols]
        for l in lines:
            for k, ln in enumerate(lengths):
                l[k] = f'{l[k]:{align}{ln}s}'
        return '\n'.join([(sep + ' ').join(l) for l in lines])

    return '\n'.join([sep.join(l) for l in lines])
````
Generate a CSV-like string from a dict of data arrays (the counterpart of `read_data()`). Plain arrays are written as single columns; each `uarray` field is written as a value column, an `SE` column, and a `correl` block when its correlation matrix differs from identity, followed by `correl_X_Y` blocks for pairs of `uarray` fields whose cross-correlations are not all zero (both comparisons use `numpy.allclose()` with the given `atol` and `rtol`). `include_fields` and `exclude_fields` select which columns are written, the `*_format` arguments control number formatting, and `align` controls column alignment.
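A sketch of the round trip, reusing the docstring example above (`data_string()` output can be parsed back with `read_data()`; recovered values match up to the chosen formatting precision):

```py
import numpy as np
import uncertainties
from correldata import uarray, data_string, read_data

X = uarray(uncertainties.correlated_values([1., 2., 3.], np.eye(3) * 0.09))
Y = uarray(uncertainties.correlated_values([4., 5., 6.], np.eye(3) * 0.16))
# Z = X + Y, so the full covariance matrix is singular but still positive semidefinite
data = dict(X=X, Y=Y, Z=X+Y)

# serialize to CSV-like text, then parse it back
txt = data_string(data, float_format='.6f', correl_format='.6f')
back = read_data(txt)

print(np.allclose(back['Z'].n, data['Z'].n))  # True
print(np.allclose(back['Z'].s, data['Z'].s))  # True
```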
```py
def save_data_to_file(data, filename, **kwargs):
    '''
    Write correlated data to a CSV file.

    **Arguments**
    - `data`: dict of arrays with strings, floats or correlated data
    - `filename`: `str` or path to the file to write to
    - `kwargs`: passed to correldata.data_string()
    '''
    with open(filename, 'w') as fid:
        return fid.write(data_string(data, **kwargs))
```
Write correlated data to a CSV file: `data` is a dict of arrays, `filename` is a `str` or path to the file to write to, and `kwargs` are passed on to `data_string()`.
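A minimal end-to-end sketch (the file name is arbitrary):

```py
import numpy as np
import uncertainties
from correldata import uarray, save_data_to_file, read_data_from_file

D47 = uarray(uncertainties.correlated_values([0.245, 0.246, 0.247], np.eye(3) * 0.005**2))
data = {'Sample': np.array(['FOO', 'BAR', 'BAZ']), 'D47': D47}

save_data_to_file(data, 'example.csv', float_format='.6f')
back = read_data_from_file('example.csv')

print(back['Sample'])  # ['FOO' 'BAR' 'BAZ']
print(back['D47'].n)   # [0.245 0.246 0.247]
print(back['D47'].s)   # [0.005 0.005 0.005]
```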