correldata
Read/write vectors of correlated data from/to a csv file.
These data are stored in a dictionary, whose values are numpy arrays with elements which may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.
1""" 2Read/write vectors of correlated data from/to a csv file. 3 4These data are stored in a dictionary, whose values are numpy arrays 5with elements which may be strings, floats, or floats with associated uncertainties 6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library. 7""" 8 9 10__author__ = 'Mathieu Daëron' 11__contact__ = 'mathieu@daeron.fr' 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron' 13__license__ = 'MIT License - https://opensource.org/licenses/MIT' 14__date__ = '2024-10-17' 15__version__ = '1.4.0' 16 17 18import os as _os 19import numpy as _np 20import uncertainties as _uc 21 22from typing import Callable, Hashable, Any 23 24class uarray(_np.ndarray): 25 26 __doc__ = """ 27 1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) 28 of [ufloat](https://pypi.org/project/uncertainties) values 29 """ 30 31 def __new__(cls, a): 32 obj = _np.asarray(a).view(cls) 33 return obj 34 35 @property 36 def nv(self): 37 """Return the array of nominal values (read-only).""" 38 return _uc.unumpy.nominal_values(_np.array(self)) 39 40 @property 41 def se(self): 42 """Return the array of standard errors (read-only)""" 43 return _uc.unumpy.std_devs(_np.array(self)) 44 45 @property 46 def correl(self): 47 """Return the correlation matrix of the array elements (read-only)""" 48 return _np.array(_uc.correlation_matrix(self)) 49 50 @property 51 def covar(self): 52 """Return the covariance matrix of the array elements (read-only)""" 53 return _np.array(_uc.covariance_matrix(self)) 54 55 @property 56 def mahalanobis(self): 57 """Return the squared Mahalanobis distance from zero of the array (read-only)""" 58 flatself = self.n.flatten().reshape((1, self.size)) 59 return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0] 60 61 n = nv 62 "Alias for `uarray.nv`" 63 64 s = se 65 "Alias for `uarray.se`" 66 67 cor = correl 68 "Alias for `uarray.correl`" 69 70 cov = covar 71 "Alias for `uarray.covar`" 72 73 m = 
mahalanobis 74 "Alias for `uarray.mahalanobis`" 75 76 77def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool: 78 ''' 79 Test whether 2-D array `M` is symmetric and positive semidefinite. 80 ''' 81 ev = _np.linalg.eigvals(M) 82 return ( 83 _np.allclose(M, M.T) # M is symmetric 84 and _np.all( 85 (ev > 0) | _np.isclose(ev, 0) 86 ) # all eignevalues are either real and strictly positive or close to zero 87 ) 88 89 90def smart_type(s: str) -> (int | float | str): 91 ''' 92 Tries to convert string `s` to an `int`, or to an `float` if that fails. 93 If both fail, return the original string unchanged. 94 ''' 95 try: return int(s) 96 except: pass 97 try: return float(s) 98 except: pass 99 return s 100 101 102def read_data(data: str, sep: str = ',', validate_covar: bool = True): 103 ''' 104 Read correlated data from a CSV-like string. 105 106 Column names are interpreted in the following way: 107 * In most cases, each columns is converted to a dict value, with the corresponding 108 dict key being the column's label. 109 * Columns whose label starts with `SE` are interpreted as specifying the standard 110 error for the latest preceding data column. 111 * Columns whose label starts with `correl` are interpreted as specifying the 112 correlation matrix for the latest preceding data column. In that case, column labels 113 are ignored for the rest of the columns belonging to this matrix. 114 * Columns whose label starts with `covar` are interpreted as specifying the 115 covariance matrix for the latest preceding data column. In that case, column labels 116 are ignored for the rest of the columns belonging to this matrix. 117 * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than 118 the latest preceding data column, by adding an underscore followed by the variable's 119 label (ex: `SE_foo`, `correl_bar`, `covar_baz`). 
120 * `correl`, and `covar` may also be specified for any pair of variable, by adding an 121 underscore followed by the two variable labels, joined by a second underscore 122 (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables 123 correspond, respectively, to the lines and columns of this matrix. 124 * Exceptions will be raised, for any given variable: 125 - when specifying both `covar` and any combination of (`SE`, `correl`) 126 - when specifying `correl` without `SE` 127 128 **Arguments** 129 - `data`: a CSV-like string 130 - `sep`: the CSV separator 131 - `validate_covar`: whether to check that the overall covariance matrix 132 is symmetric and positive semidefinite. Specifying `validate_covar = False` 133 bypasses this computationally expensive step. 134 135 **Example** 136 ```py 137 import correldata 138 data = """ 139 Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48 140 FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0 141 BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0 142 BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5 143 """[1:-1] 144 print(correldata.read_data(data)) 145 146 # yields: 147 # 148 # > { 149 # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'), 150 # 'Tacid': array([90., 90., 90.]), 151 # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object), 152 # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object) 153 # } 154 ``` 155 ''' 156 157 data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')] 158 N = len(data) - 1 159 160 values, se, correl, covar = {}, {}, {}, {} 161 j = 0 162 while j < len(data[0]): 163 field = data[0][j] 164 if not ( 165 field.startswith('SE_') 166 or field.startswith('correl_') 167 or field.startswith('covar_') 168 or field == 'SE' 169 or field == 'correl' 170 or field == 'covar' 
171 or len(field) == 0 172 ): 173 values[field] = _np.array([l[j] for l in data[1:]]) 174 j += 1 175 oldfield = field 176 elif field.startswith('SE_'): 177 se[field[3:]] = _np.array([l[j] for l in data[1:]]) 178 j += 1 179 elif field == 'SE': 180 se[oldfield] = _np.array([l[j] for l in data[1:]]) 181 j += 1 182 elif field.startswith('correl_'): 183 correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]]) 184 j += N 185 elif field == 'correl': 186 correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 187 j += N 188 elif field.startswith('covar_'): 189 covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]]) 190 j += N 191 elif field == 'covar': 192 covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 193 j += N 194 195 nakedvalues = {} 196 for k in [_ for _ in values]: 197 if ( 198 k not in se 199 and k not in correl 200 and k not in covar 201 ): 202 nakedvalues[k] = values.pop(k) 203 204 for x in values: 205 if x in covar: 206 if x in se: 207 raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".') 208 if x in correl: 209 raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".') 210 if x in correl: 211 if x not in se: 212 raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".') 213 214 for x in correl: 215 if x in values: 216 covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x]) 217 else: 218 for x1 in values: 219 for x2 in values: 220 if x == f'{x1}_{x2}': 221 if x1 in se: 222 se1 = se[x1] 223 else: 224 if x1 in covar: 225 se1 = _np.diag(covar[x1])**0.5 226 else: 227 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 228 if x2 in se: 229 se2 = se[x2] 230 else: 231 if x2 in covar: 232 se2 = _np.diag(covar[x2])**0.5 233 else: 234 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 235 236 covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2) 237 
238 for x in se: 239 if x in values and x not in correl: 240 covar[x] = _np.diag(se[x]**2) 241 242 for k in [_ for _ in covar]: 243 if k not in values: 244 for j1 in values: 245 for j2 in values: 246 if k == f'{j1}_{j2}': 247 covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T 248 249 X = _np.array([_ for k in values for _ in values[k]]) 250 CM = _np.zeros((X.size, X.size)) 251 for i, vi in enumerate(values): 252 for j, vj in enumerate(values): 253 if vi == vj: 254 if vi in covar: 255 CM[N*i:N*i+N,N*j:N*j+N] = covar[vi] 256 else: 257 if f'{vi}_{vj}' in covar: 258 CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}'] 259 260 s = _np.diag(CM)**.5 261 s[s==0] = 1. 262 invs = _np.diag(s**-1) 263 264 if ( 265 validate_covar 266 and not ( 267 is_symmetric_positive_semidefinite(CM) 268 or is_symmetric_positive_semidefinite(invs @ CM @ invs) 269 ) 270 ): 271 raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.') 272 273 corvalues = uarray(_uc.correlated_values(X, CM)) 274 275 allvalues = nakedvalues 276 277 for i, x in enumerate(values): 278 allvalues[x] = corvalues[i*N:i*N+N] 279 280 return allvalues 281 282 283def read_data_from_file(filename: str | _os.PathLike, **kwargs): 284 ''' 285 Read correlated data from a CSV file. 
286 287 **Arguments** 288 - `filename`: `str` or path to the file to read from 289 - `kwargs`: passed to correldata.read_data() 290 ''' 291 with open(filename) as fid: 292 return read_data(fid.read(), **kwargs) 293 294 295def f2s( 296 x: Any, 297 f: (str | Callable | dict), 298 k: Hashable = None, 299 fb: (str | Callable) = 'z.6g', 300) -> str: 301 ''' 302 Format `x` according to format `f` 303 304 * If `f` is a string, return `f'{x:{f}}'` 305 * If `f` is a callable, return `f(x)` 306 * If `f` is a dict and optional argument `k` is a hashable, 307 return f2s(x, f[k]), otherwise return f2s(x, fb) 308 ''' 309 if isinstance (x, str): 310 return x 311 if isinstance (f, str): 312 return f'{x:{f}}' 313 if isinstance (f, Callable): 314 return f(x) 315 if isinstance (f, dict): 316 if k in f: 317 return f2s(x, f[k]) 318 if isinstance (fb, str): 319 return f'{x:{fb}}' 320 if isinstance (fb, Callable): 321 return fb(x) 322 raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.') 323 324 325 326def data_string( 327 data: dict, 328 sep: str = ',', 329 include_fields: list = None, 330 exclude_fields: list = [], 331 float_format: (str | dict | Callable) = 'z.6g', 332 correl_format: (str | dict | Callable) = 'z.6f', 333 default_float_format: (str | Callable) = 'z.6g', 334 default_correl_format: (str | Callable) = 'z.6f', 335 show_nv: bool = True, 336 show_se: bool = True, 337 show_correl: bool = True, 338 show_mixed_correl: bool = True, 339 align: str = '>', 340 atol: float = 1e-12, 341 rtol: float = 1e-12, 342): 343 ''' 344 Generate CSV-like string from correlated data 345 346 **Arguments** 347 - `data`: dict of arrays with strings, floats or correlated data 348 - `sep`: the CSV separator 349 - `include_fields`: subset of fields to write; if `None`, write all fields 350 - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); 351 to exclude only the SE for field `foo`, include `SE_foo`; same goes 
for `correl_foo` 352 - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable 353 (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys 354 corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`). 355 - `correl_format`: same as `float_format`, but applies to correlation matrix elements 356 - `default_float_format`: only used when `float_format` is a dict; in that case, fields 357 missing from `float_format.keys()` will use `default_float_format` instead. 358 corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`). 359 - `default_correl_format`: same as `default_float_format`, but applies to `correl_format` 360 - `show_nv`: show nominal values 361 - `show_se`: show standard errors 362 - `show_correl`: show correlations for any given field (ex: `correl_X`) 363 - `show_mixed_correl`: show correlations between different fields (ex: `correl_X_Y`) 364 - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values 365 - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 366 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 367 - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 368 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 369 370 371 **Example** 372 373 ```py 374 from correldata import _uc 375 from correldata import _np 376 from correldata import * 377 378 X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09)) 379 Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16)) 380 381 data = dict(X=X, Y=Y, Z=X+Y) 382 383 print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f')) 384 385 # yields: 386 # 387 # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, , 388 # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 
0.6, 0.0, 0.0, 0.8, 0.0, 0.0 389 # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0 390 # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8 391 ``` 392 ''' 393 if include_fields is None: 394 include_fields = [_ for _ in data] 395 cols, ufields = [], [] 396 for f in include_fields: 397 if f in exclude_fields: 398 continue 399 if isinstance(data[f], uarray): 400 ufields.append(f) 401 N = data[f].size 402 if show_nv: 403 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n]) 404 if show_se and (f'SE_{f}' not in exclude_fields): 405 cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s]) 406 if show_correl and (f'correl_{f}' not in exclude_fields): 407 CM = _uc.correlation_matrix(data[f]) 408 if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol): 409 for i in range(N): 410 cols.append( 411 ['' if i else f'correl_{f}'] 412 + [ 413 f2s( 414 CM[i,j], 415 correl_format, 416 f, 417 default_correl_format, 418 ) 419 for j in range(N) 420 ] 421 ) 422 elif show_nv: 423 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]]) 424 425 if show_mixed_correl: 426 for i in range(len(ufields)): 427 for j in range(i): 428 if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields: 429 continue 430 CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:] 431 if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol): 432 for k in range(N): 433 cols.append( 434 ['' if k else f'correl_{ufields[j]}_{ufields[i]}'] 435 + [ 436 f2s( 437 CM[k,l], 438 correl_format, 439 f, 440 default_correl_format, 441 ) 442 for l in range(N) 443 ] 444 ) 445 446 lines = list(map(list, zip(*cols))) 447 448 if align: 449 lengths = [max([len(e) for e in l]) for l in cols] 450 for l in lines: 451 for k,ln in enumerate(lengths): 452 l[k] = f'{l[k]:{align}{ln}s}' 453 return '\n'.join([(sep+' ').join(l) for l in 
lines]) 454 455 return '\n'.join([sep.join(l) for l in lines]) 456 457 458 459def save_data_to_file(data, filename, **kwargs): 460 ''' 461 Write correlated data to a CSV file. 462 463 **Arguments** 464 - `data`: dict of arrays with strings, floats or correlated data 465 - `filename`: `str` or path to the file to read from 466 - `kwargs`: passed to correldata.data_string() 467 ''' 468 with open(filename, 'w') as fid: 469 return fid.write(data_string(data, **kwargs))
class uarray(_np.ndarray):

    __doc__ = """
    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
    of [ufloat](https://pypi.org/project/uncertainties) values
    """

    def __new__(cls, a):
        # Reinterpret whatever array-like we are given as an instance of this class.
        return _np.asarray(a).view(cls)

    @property
    def nv(self):
        """Return the array of nominal values (read-only)."""
        plain = _np.array(self)
        return _uc.unumpy.nominal_values(plain)

    @property
    def se(self):
        """Return the array of standard errors (read-only)"""
        plain = _np.array(self)
        return _uc.unumpy.std_devs(plain)

    @property
    def correl(self):
        """Return the correlation matrix of the array elements (read-only)"""
        matrix = _uc.correlation_matrix(self)
        return _np.array(matrix)

    @property
    def covar(self):
        """Return the covariance matrix of the array elements (read-only)"""
        matrix = _uc.covariance_matrix(self)
        return _np.array(matrix)

    @property
    def mahalanobis(self):
        """Return the squared Mahalanobis distance from zero of the array (read-only)"""
        # Nominal values as a single row vector, then the quadratic form
        # row . inverse(covariance) . column, unwrapped from its 1x1 result.
        row = self.n.flatten().reshape((1, self.size))
        inverse_covar = _np.linalg.inv(self.covar)
        quadratic_form = row @ inverse_covar @ row.T
        return quadratic_form[0, 0]

    n = nv
    "Alias for `uarray.nv`"

    s = se
    "Alias for `uarray.se`"

    cor = correl
    "Alias for `uarray.correl`"

    cov = covar
    "Alias for `uarray.covar`"

    m = mahalanobis
    "Alias for `uarray.mahalanobis`"
36 @property 37 def nv(self): 38 """Return the array of nominal values (read-only).""" 39 return _uc.unumpy.nominal_values(_np.array(self))
Return the array of nominal values (read-only).
41 @property 42 def se(self): 43 """Return the array of standard errors (read-only)""" 44 return _uc.unumpy.std_devs(_np.array(self))
Return the array of standard errors (read-only)
46 @property 47 def correl(self): 48 """Return the correlation matrix of the array elements (read-only)""" 49 return _np.array(_uc.correlation_matrix(self))
Return the correlation matrix of the array elements (read-only)
51 @property 52 def covar(self): 53 """Return the covariance matrix of the array elements (read-only)""" 54 return _np.array(_uc.covariance_matrix(self))
Return the covariance matrix of the array elements (read-only)
56 @property 57 def mahalanobis(self): 58 """Return the squared Mahalanobis distance from zero of the array (read-only)""" 59 flatself = self.n.flatten().reshape((1, self.size)) 60 return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0]
Return the squared Mahalanobis distance from zero of the array (read-only)
36 @property 37 def nv(self): 38 """Return the array of nominal values (read-only).""" 39 return _uc.unumpy.nominal_values(_np.array(self))
Alias for uarray.nv
41 @property 42 def se(self): 43 """Return the array of standard errors (read-only)""" 44 return _uc.unumpy.std_devs(_np.array(self))
Alias for uarray.se
46 @property 47 def correl(self): 48 """Return the correlation matrix of the array elements (read-only)""" 49 return _np.array(_uc.correlation_matrix(self))
Alias for uarray.correl
51 @property 52 def covar(self): 53 """Return the covariance matrix of the array elements (read-only)""" 54 return _np.array(_uc.covariance_matrix(self))
Alias for uarray.covar
56 @property 57 def mahalanobis(self): 58 """Return the squared Mahalanobis distance from zero of the array (read-only)""" 59 flatself = self.n.flatten().reshape((1, self.size)) 60 return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0]
Alias for uarray.mahalanobis
Inherited Members
- numpy.ndarray
- dumps
- dump
- all
- any
- argmax
- argmin
- argpartition
- argsort
- astype
- byteswap
- choose
- clip
- compress
- conj
- conjugate
- copy
- cumprod
- cumsum
- diagonal
- dot
- fill
- flatten
- getfield
- item
- max
- mean
- min
- nonzero
- partition
- prod
- put
- ravel
- repeat
- reshape
- resize
- round
- searchsorted
- setfield
- setflags
- sort
- squeeze
- std
- sum
- swapaxes
- take
- tobytes
- tofile
- tolist
- tostring
- trace
- transpose
- var
- view
- to_device
- ndim
- flags
- shape
- strides
- data
- itemsize
- size
- nbytes
- base
- dtype
- real
- imag
- flat
- ctypes
- T
- mT
- ptp
- newbyteorder
- itemset
- device
def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
    '''
    Test whether 2-D array `M` is symmetric and positive semidefinite.

    **Arguments**
    - `M`: the array to test

    **Returns**

    `True` if `M` is square, equal to its transpose (within the default
    `numpy.allclose()` tolerances), and all of its eigenvalues are either
    strictly positive or close to zero (within the default `numpy.isclose()`
    tolerances); `False` otherwise.
    '''
    # An array which is not a square matrix cannot be symmetric.
    if M.ndim != 2 or M.shape[0] != M.shape[1]:
        return False
    # Test symmetry first: it is cheap, and it lets us skip the eigenvalue
    # computation entirely for non-symmetric input.
    if not _np.allclose(M, M.T):
        return False
    # An empty matrix has no eigenvalues, so the condition holds vacuously.
    if M.size == 0:
        return True
    # M is symmetric here, so use the symmetric solver: its eigenvalues are
    # guaranteed real, which keeps the sign test below well defined.
    ev = _np.linalg.eigvalsh(M)
    # Convert numpy.bool_ to a builtin bool, as promised by the annotation.
    return bool(_np.all((ev > 0) | _np.isclose(ev, 0)))
Test whether 2-D array `M` is symmetric and positive semidefinite.
91def smart_type(s: str) -> (int | float | str): 92 ''' 93 Tries to convert string `s` to an `int`, or to an `float` if that fails. 94 If both fail, return the original string unchanged. 95 ''' 96 try: return int(s) 97 except: pass 98 try: return float(s) 99 except: pass 100 return s
Tries to convert string `s` to an `int`, or to a `float` if that fails.
If both fail, return the original string unchanged.
def read_data(data: str, sep: str = ',', validate_covar: bool = True):
    '''
    Read correlated data from a CSV-like string.

    Column names are interpreted in the following way:
    * In most cases, each column is converted to a dict value, with the corresponding
    dict key being the column's label.
    * Columns whose label starts with `SE` are interpreted as specifying the standard
    error for the latest preceding data column.
    * Columns whose label starts with `correl` are interpreted as specifying the
    correlation matrix for the latest preceding data column. In that case, column labels
    are ignored for the rest of the columns belonging to this matrix.
    * Columns whose label starts with `covar` are interpreted as specifying the
    covariance matrix for the latest preceding data column. In that case, column labels
    are ignored for the rest of the columns belonging to this matrix.
    * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
    the latest preceding data column, by adding an underscore followed by the variable's
    label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
    * `correl`, and `covar` may also be specified for any pair of variables, by adding an
    underscore followed by the two variable labels, joined by a second underscore
    (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
    correspond, respectively, to the lines and columns of this matrix.
    * Exceptions will be raised, for any given variable:
        - when specifying both `covar` and any combination of (`SE`, `correl`)
        - when specifying `correl` without `SE`

    Leading and trailing line breaks in `data` are ignored, and columns with
    an empty label which do not belong to a matrix are skipped.

    **Arguments**
    - `data`: a CSV-like string
    - `sep`: the CSV separator
    - `validate_covar`: whether to check that the overall covariance matrix
    is symmetric and positive semidefinite. Specifying `validate_covar = False`
    bypasses this computationally expensive step.

    **Returns**

    A dict mapping each data column label to an array: a plain numpy array for
    columns without uncertainties, a `uarray` for the others.

    **Raises**
    - `KeyError`: when the uncertainty information is redundant or insufficient
    (see above), or when an unlabeled `SE`, `correl` or `covar` column appears
    before any data column.
    - `numpy.linalg.LinAlgError`: when `validate_covar` is true and the overall
    covariance matrix is not symmetric positive-semidefinite.

    **Example**
    ```py
    import correldata
    data = """
    Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
       FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
       BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
       BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
    """[1:-1]
    print(correldata.read_data(data))

    # yields:
    #
    # > {
    #     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
    #     'Tacid': array([90., 90., 90.]),
    #     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
    #     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
    #   }
    ```
    '''

    # Drop leading/trailing line breaks only: a trailing newline (usual when the
    # text comes from a file) would otherwise add a phantom row and inflate N.
    lines = data.strip('\r\n').split('\n')

    # Header labels are kept as strings (never coerced to numbers), so that the
    # prefix tests below work even for labels which look like numbers.
    header = [e.strip() for e in lines[0].split(sep)]
    rows = [[smart_type(e.strip()) for e in l.split(sep)] for l in lines[1:]]
    N = len(rows)

    values, se, correl, covar = {}, {}, {}, {}
    oldfield = None  # label of the latest data column seen so far
    j = 0
    while j < len(header):
        field = header[j]

        if not field:
            # Stray unlabeled column outside of any matrix: skip it, otherwise
            # the column index would never advance.
            j += 1
            continue

        # 'SE' / 'correl' / 'covar', optionally followed by '_<target label>'
        kind, underscore, target = field.partition('_')

        if kind not in ('SE', 'correl', 'covar'):
            # ordinary data column
            values[field] = _np.array([l[j] for l in rows])
            oldfield = field
            j += 1
            continue

        if not underscore:
            # no explicit target: refers to the latest preceding data column
            if oldfield is None:
                raise KeyError(f'Column "{field}" is specified before any data column it could refer to.')
            target = oldfield

        if kind == 'SE':
            se[target] = _np.array([l[j] for l in rows])
            j += 1
        else:
            # a matrix occupies N consecutive columns; only the first is labeled
            store = correl if kind == 'correl' else covar
            store[target] = _np.array([l[j:j+N] for l in rows])
            j += max(N, 1)  # always advance, even without any data rows

    # Set aside the columns without any uncertainty information.
    nakedvalues = {}
    for k in list(values):
        if k not in se and k not in correl and k not in covar:
            nakedvalues[k] = values.pop(k)

    # Reject redundant or insufficient uncertainty specifications.
    for x in values:
        if x in covar:
            if x in se:
                raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
            if x in correl:
                raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
        if x in correl:
            if x not in se:
                raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')

    def _standard_errors(name, correl_key):
        # SE of variable `name`, taken from its SE column if there is one,
        # otherwise from the diagonal of its covariance matrix.
        if name in se:
            return se[name]
        if name in covar:
            return _np.diag(covar[name])**0.5
        raise KeyError(f'Not enough information: correl_{correl_key} is specified without SE for variable "{name}".')

    # Convert correlation matrices to covariance matrices.
    for x in correl:
        if x in values:
            covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
        else:
            # correlation between two different variables
            for x1 in values:
                for x2 in values:
                    if x == f'{x1}_{x2}':
                        se1 = _standard_errors(x1, x)
                        se2 = _standard_errors(x2, x)
                        covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)

    # Variables with SE but no correl have a diagonal covariance matrix.
    for x in se:
        if x in values and x not in correl:
            covar[x] = _np.diag(se[x]**2)

    # For each cross-covariance block, also store the transposed block.
    for k in list(covar):
        if k not in values:
            for j1 in values:
                for j2 in values:
                    if k == f'{j1}_{j2}':
                        covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T

    # Assemble all correlated values and their overall covariance matrix,
    # one N x N block per pair of variables.
    X = _np.array([_ for k in values for _ in values[k]])
    CM = _np.zeros((X.size, X.size))
    for i, vi in enumerate(values):
        for j, vj in enumerate(values):
            key = vi if vi == vj else f'{vi}_{vj}'
            if key in covar:
                CM[N*i:N*i+N,N*j:N*j+N] = covar[key]

    # Rescale CM by the standard errors (zero SE replaced by 1 to avoid
    # dividing by zero), so that validation can also be attempted on the
    # rescaled matrix.
    s = _np.diag(CM)**.5
    s[s==0] = 1.
    invs = _np.diag(s**-1)

    if (
        validate_covar
        and not (
            is_symmetric_positive_semidefinite(CM)
            or is_symmetric_positive_semidefinite(invs @ CM @ invs)
        )
    ):
        raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')

    corvalues = uarray(_uc.correlated_values(X, CM))

    allvalues = nakedvalues

    for i, x in enumerate(values):
        allvalues[x] = corvalues[i*N:i*N+N]

    return allvalues
Read correlated data from a CSV-like string.
Column names are interpreted in the following way:
- In most cases, each column is converted to a dict value, with the corresponding dict key being the column's label.
- Columns whose label starts with `SE` are interpreted as specifying the standard error for the latest preceding data column.
- Columns whose label starts with `correl` are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
- Columns whose label starts with `covar` are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
- `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
- `correl` and `covar` may also be specified for any pair of variables, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.
- Exceptions will be raised, for any given variable:
  - when specifying both `covar` and any combination of (`SE`, `correl`)
  - when specifying `correl` without `SE`
Arguments
- `data`: a CSV-like string
- `sep`: the CSV separator
- `validate_covar`: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying `validate_covar = False` bypasses this computationally expensive step.
Example
import correldata
data = """
Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48
FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0
BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0
BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5
"""[1:-1]
print(read_data(data))
# yields:
#
# > {
# 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
# 'Tacid': array([90., 90., 90.]),
# 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
# 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
# }
def read_data_from_file(filename: str | _os.PathLike, **kwargs):
    '''
    Load correlated data from a CSV file.

    **Arguments**
    - `filename`: `str` or path-like object designating the file to read
    - `kwargs`: forwarded unchanged to correldata.read_data()

    **Returns**
    Whatever `read_data()` returns for the text content of the file.
    '''
    with open(filename) as fid:
        contents = fid.read()
    return read_data(contents, **kwargs)
Read correlated data from a CSV file.
Arguments
- `filename`: `str` or path to the file to read from
- `kwargs`: passed to read_data()
296def f2s( 297 x: Any, 298 f: (str | Callable | dict), 299 k: Hashable = None, 300 fb: (str | Callable) = 'z.6g', 301) -> str: 302 ''' 303 Format `x` according to format `f` 304 305 * If `f` is a string, return `f'{x:{f}}'` 306 * If `f` is a callable, return `f(x)` 307 * If `f` is a dict and optional argument `k` is a hashable, 308 return f2s(x, f[k]), otherwise return f2s(x, fb) 309 ''' 310 if isinstance (x, str): 311 return x 312 if isinstance (f, str): 313 return f'{x:{f}}' 314 if isinstance (f, Callable): 315 return f(x) 316 if isinstance (f, dict): 317 if k in f: 318 return f2s(x, f[k]) 319 if isinstance (fb, str): 320 return f'{x:{fb}}' 321 if isinstance (fb, Callable): 322 return fb(x) 323 raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')
Format `x` according to format `f`
- If `x` is already a string, return it unchanged
- If `f` is a string, return `f'{x:{f}}'`
- If `f` is a callable, return `f(x)`
- If `f` is a dict and optional argument `k` is one of its keys, return `f2s(x, f[k])`; otherwise format `x` with the fallback `fb`
def data_string(
    data: dict,
    sep: str = ',',
    include_fields: list = None,
    exclude_fields: list = [],
    float_format: (str | dict | Callable) = 'z.6g',
    correl_format: (str | dict | Callable) = 'z.6f',
    default_float_format: (str | Callable) = 'z.6g',
    default_correl_format: (str | Callable) = 'z.6f',
    show_nv: bool = True,
    show_se: bool = True,
    show_correl: bool = True,
    show_mixed_correl: bool = True,
    align: str = '>',
    atol: float = 1e-12,
    rtol: float = 1e-12,
) -> str:
    '''
    Generate CSV-like string from correlated data

    **Arguments**
    - `data`: dict of arrays with strings, floats or correlated data
    - `sep`: the CSV separator
    - `include_fields`: subset of fields to write; if `None`, write all fields
    - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
      to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`;
      to exclude the correlations between fields `foo` and `bar`, include `correl_foo_bar`
      or `correl_bar_foo`
    - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
      returning the formatted string (ex: `lambda x: f'{x:.2f}' if x else '0'`), or a dictionary
      of strings and/or callables, with dict keys corresponding to different fields
      (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`). Also applies to standard errors.
    - `correl_format`: same as `float_format`, but applies to correlation matrix elements
    - `default_float_format`: only used when `float_format` is a dict; in that case, fields
      missing from `float_format.keys()` will use `default_float_format` instead.
    - `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
    - `show_nv`: show nominal values
    - `show_se`: show standard errors
    - `show_correl`: show correlations for any given field (ex: `correl_X`)
    - `show_mixed_correl`: show correlations between different fields (ex: `correl_X_Y`)
    - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
    - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
      when deciding whether a matrix is equal to the identity matrix or to the zero matrix
    - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
      when deciding whether a matrix is equal to the identity matrix or to the zero matrix

    **Returns**

    A single string with one line per row, lines joined by newlines (no trailing newline).
    When `align` is non-empty, values are padded to their column width and joined
    with `sep` followed by one space; otherwise they are joined with `sep` alone.

    **Example**

    ```py
    from correldata import _uc
    from correldata import _np
    from correldata import *

    X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
    Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

    data = dict(X=X, Y=Y, Z=X+Y)

    print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

    # yields:
    #
    #   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,
    # 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
    # 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
    # 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
    ```
    '''
    # NOTE: `exclude_fields` defaults to a shared list object; it is only
    # read (membership tests) in this function, never mutated.
    if include_fields is None:
        include_fields = [_ for _ in data]

    # `cols` collects the output column by column (header first, then values);
    # `ufields` records, in order, the fields holding correlated data.
    cols, ufields = [], []
    for f in include_fields:
        if f in exclude_fields:
            continue
        if isinstance(data[f], uarray):
            ufields.append(f)
            N = data[f].size
            if show_nv:
                cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
            if show_se and (f'SE_{f}' not in exclude_fields):
                # Standard errors share the field's float format.
                cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
            if show_correl and (f'correl_{f}' not in exclude_fields):
                CM = _uc.correlation_matrix(data[f])
                # A correlation matrix equal to identity (within tolerance) is not written.
                if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
                    # One output column per matrix row; only the first carries the label.
                    for i in range(N):
                        cols.append(
                            ['' if i else f'correl_{f}']
                            + [
                                f2s(
                                    CM[i,j],
                                    correl_format,
                                    f,
                                    default_correl_format,
                                )
                                for j in range(N)
                            ]
                        )
        elif show_nv:
            # Plain (uncorrelated) field: written as a single column.
            cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])

    if show_mixed_correl:
        # NOTE(review): `N` and `f` below are left over from the loop above.
        # `N` is the size of the last uarray field processed, so this block
        # presumably assumes all uarray fields have the same size (confirm).
        # `f` is the last entry of `include_fields`, so when `correl_format`
        # is a dict, mixed correlations are formatted using that field's key.
        for i in range(len(ufields)):
            for j in range(i):
                if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
                    continue
                # Off-diagonal block of the joint correlation matrix:
                # rows index elements of ufields[i], columns those of ufields[j].
                CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
                # A block equal to zero (within tolerance) is not written.
                if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
                    for k in range(N):
                        cols.append(
                            ['' if k else f'correl_{ufields[j]}_{ufields[i]}']
                            + [
                                f2s(
                                    CM[k,l],
                                    correl_format,
                                    f,
                                    default_correl_format,
                                )
                                for l in range(N)
                            ]
                        )

    # Transpose columns into rows; zip() stops at the shortest column.
    lines = list(map(list, zip(*cols)))

    if align:
        # Pad every cell to the width of the widest entry in its column.
        lengths = [max([len(e) for e in l]) for l in cols]
        for l in lines:
            for k,ln in enumerate(lengths):
                l[k] = f'{l[k]:{align}{ln}s}'
        return '\n'.join([(sep+' ').join(l) for l in lines])

    return '\n'.join([sep.join(l) for l in lines])
Generate CSV-like string from correlated data
Arguments
- `data`: dict of arrays with strings, floats or correlated data
- `sep`: the CSV separator
- `include_fields`: subset of fields to write; if `None`, write all fields
- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable returning the formatted string (ex: `lambda x: f'{x:.2f}' if x else '0'`), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
- `correl_format`: same as `float_format`, but applies to correlation matrix elements
- `default_float_format`: only used when `float_format` is a dict; in that case, fields missing from `float_format.keys()` will use `default_float_format` instead.
- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
- `show_nv`: show nominal values
- `show_se`: show standard errors
- `show_correl`: show correlations for any given field (ex: `correl_X`)
- `show_mixed_correl`: show correlations between different fields (ex: `correl_X_Y`)
- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
- `atol`: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
- `rtol`: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
Example
from correldata import _uc
from correldata import _np
from correldata import *
X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
data = dict(X=X, Y=Y, Z=X+Y)
print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
# yields:
#
# X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, ,
# 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0
# 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0
# 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8
def save_data_to_file(data, filename, **kwargs):
    '''
    Write correlated data to a CSV file.

    **Arguments**
    - `data`: dict of arrays with strings, floats or correlated data
    - `filename`: `str` or path to the file to write to
    - `kwargs`: passed to correldata.data_string()

    **Returns**
    The value returned by the file object's `write()` call.
    '''
    # Build the complete string before opening the file: mode 'w' truncates
    # the target on open, so formatting first leaves an existing file intact
    # if data_string() raises.
    txt = data_string(data, **kwargs)
    with open(filename, 'w') as fid:
        return fid.write(txt)
Write correlated data to a CSV file.
Arguments
- `data`: dict of arrays with strings, floats or correlated data
- `filename`: `str` or path to the file to write to
- `kwargs`: passed to data_string()