correldata
Read/write vectors of correlated data from/to a CSV file.
These data are stored in a dictionary whose values are numpy arrays, with elements that may be strings, floats, or floats with associated uncertainties as defined in the [uncertainties](https://pypi.org/project/uncertainties) library.
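Before the full reference below, here is a minimal sketch of the intended round trip (the column-naming rules are documented under `read_data()`):

```py
import correldata

csv_data = '''
Sample,  D47,   SE, correl,
   FOO, .245, .005,      1, 0.5
   BAR, .246, .005,    0.5,   1
'''[1:-1]

data = correldata.read_data(csv_data)   # dict of arrays; 'D47' becomes a uarray of correlated ufloats
print(data['D47'].nv, data['D47'].se)   # nominal values and standard errors
print(correldata.data_string(data))     # back to a CSV-like string
```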
1""" 2Read/write vectors of correlated data from/to a csv file. 3 4These data are stored in a dictionary, whose values are numpy arrays 5with elements which may be strings, floats, or floats with associated uncertainties 6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library. 7""" 8 9 10__author__ = 'Mathieu Daëron' 11__contact__ = 'mathieu@daeron.fr' 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron' 13__license__ = 'MIT License - https://opensource.org/licenses/MIT' 14__date__ = '2024-10-17' 15__version__ = '1.4.0' 16 17 18import os as _os 19import numpy as _np 20import uncertainties as _uc 21 22from typing import Callable, Hashable, Any 23from uncertainties.unumpy import nominal_values as nv 24 25nv = nv 26"""Alias for [`uncertainties.unumpy.nominal_values()`](https://pythonhosted.org/uncertainties/numpy_guide.html#uncertainties-and-nominal-values)""" 27 28class uarray(_np.ndarray): 29 30 __doc__ = """ 31 1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) 32 of [ufloat](https://pypi.org/project/uncertainties) values 33 """ 34 35 def __new__(cls, a): 36 obj = _np.asarray(a).view(cls) 37 return obj 38 39 @property 40 def nv(self): 41 """Return the array of nominal values (read-only).""" 42 return _uc.unumpy.nominal_values(_np.array(self)) 43 44 @property 45 def se(self): 46 """Return the array of standard errors (read-only)""" 47 return _uc.unumpy.std_devs(_np.array(self)) 48 49 @property 50 def correl(self): 51 """Return the correlation matrix of the array elements (read-only)""" 52 return _np.array(_uc.correlation_matrix(self)) 53 54 @property 55 def covar(self): 56 """Return the covariance matrix of the array elements (read-only)""" 57 return _np.array(_uc.covariance_matrix(self)) 58 59 @property 60 def mahalanobis(self): 61 """Return the squared Mahalanobis distance from zero of the array (read-only)""" 62 flatself = self.n.flatten().reshape((1, self.size)) 63 return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0] 64 65 n = nv 66 "Alias for `uarray.nv`" 67 68 s = se 69 "Alias for `uarray.se`" 70 71 cor = correl 72 "Alias for `uarray.correl`" 73 74 cov = covar 75 "Alias for `uarray.covar`" 76 77 m = mahalanobis 78 "Alias for `uarray.mahalanobis`" 79 80 81def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool: 82 ''' 83 Test whether 2-D array `M` is symmetric and positive semidefinite. 84 ''' 85 ev = _np.linalg.eigvals(M) 86 return ( 87 _np.allclose(M, M.T) # M is symmetric 88 and _np.all( 89 (ev > 0) | _np.isclose(ev, 0) 90 ) # all eignevalues are either real and strictly positive or close to zero 91 ) 92 93 94def smart_type(s: str) -> (int | float | str): 95 ''' 96 Tries to convert string `s` to an `int`, or to an `float` if that fails. 97 If both fail, return the original string unchanged. 98 ''' 99 try: return int(s) 100 except: pass 101 try: return float(s) 102 except: pass 103 return s 104 105 106def read_data(data: str, sep: str = ',', validate_covar: bool = True): 107 ''' 108 Read correlated data from a CSV-like string. 109 110 Column names are interpreted in the following way: 111 * In most cases, each columns is converted to a dict value, with the corresponding 112 dict key being the column's label. 113 * Columns whose label starts with `SE` are interpreted as specifying the standard 114 error for the latest preceding data column. 115 * Columns whose label starts with `correl` are interpreted as specifying the 116 correlation matrix for the latest preceding data column. 
In that case, column labels 117 are ignored for the rest of the columns belonging to this matrix. 118 * Columns whose label starts with `covar` are interpreted as specifying the 119 covariance matrix for the latest preceding data column. In that case, column labels 120 are ignored for the rest of the columns belonging to this matrix. 121 * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than 122 the latest preceding data column, by adding an underscore followed by the variable's 123 label (ex: `SE_foo`, `correl_bar`, `covar_baz`). 124 * `correl`, and `covar` may also be specified for any pair of variable, by adding an 125 underscore followed by the two variable labels, joined by a second underscore 126 (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables 127 correspond, respectively, to the lines and columns of this matrix. 128 * Exceptions will be raised, for any given variable: 129 - when specifying both `covar` and any combination of (`SE`, `correl`) 130 - when specifying `correl` without `SE` 131 132 **Arguments** 133 - `data`: a CSV-like string 134 - `sep`: the CSV separator 135 - `validate_covar`: whether to check that the overall covariance matrix 136 is symmetric and positive semidefinite. Specifying `validate_covar = False` 137 bypasses this computationally expensive step. 138 139 **Example** 140 ```py 141 import correldata 142 data = """ 143 Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48 144 FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0 145 BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0 146 BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5 147 """[1:-1] 148 print(correldata.read_data(data)) 149 150 # yields: 151 # 152 # > { 153 # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'), 154 # 'Tacid': array([90., 90., 90.]), 155 # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object), 156 # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object) 157 # } 158 ``` 159 ''' 160 161 data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')] 162 N = len(data) - 1 163 164 values, se, correl, covar = {}, {}, {}, {} 165 j = 0 166 while j < len(data[0]): 167 field = data[0][j] 168 if not ( 169 field.startswith('SE_') 170 or field.startswith('correl_') 171 or field.startswith('covar_') 172 or field == 'SE' 173 or field == 'correl' 174 or field == 'covar' 175 or len(field) == 0 176 ): 177 values[field] = _np.array([l[j] for l in data[1:]]) 178 j += 1 179 oldfield = field 180 elif field.startswith('SE_'): 181 se[field[3:]] = _np.array([l[j] for l in data[1:]]) 182 j += 1 183 elif field == 'SE': 184 se[oldfield] = _np.array([l[j] for l in data[1:]]) 185 j += 1 186 elif field.startswith('correl_'): 187 correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]]) 188 j += N 189 elif field == 'correl': 190 correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 191 j += N 192 elif field.startswith('covar_'): 193 covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]]) 194 j += N 195 elif field == 'covar': 196 covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 197 j += N 198 199 nakedvalues = {} 200 for k in [_ for _ in values]: 201 if ( 202 k not in se 203 and k not in correl 204 and k not in covar 205 ): 206 nakedvalues[k] = values.pop(k) 207 208 for x in values: 209 if x in covar: 210 if x in se: 211 raise 
KeyError(f'Too much information: both SE and covar are specified for variable "{x}".') 212 if x in correl: 213 raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".') 214 if x in correl: 215 if x not in se: 216 raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".') 217 218 for x in correl: 219 if x in values: 220 covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x]) 221 else: 222 for x1 in values: 223 for x2 in values: 224 if x == f'{x1}_{x2}': 225 if x1 in se: 226 se1 = se[x1] 227 else: 228 if x1 in covar: 229 se1 = _np.diag(covar[x1])**0.5 230 else: 231 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 232 if x2 in se: 233 se2 = se[x2] 234 else: 235 if x2 in covar: 236 se2 = _np.diag(covar[x2])**0.5 237 else: 238 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 239 240 covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2) 241 242 for x in se: 243 if x in values and x not in correl: 244 covar[x] = _np.diag(se[x]**2) 245 246 for k in [_ for _ in covar]: 247 if k not in values: 248 for j1 in values: 249 for j2 in values: 250 if k == f'{j1}_{j2}': 251 covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T 252 253 X = _np.array([_ for k in values for _ in values[k]]) 254 CM = _np.zeros((X.size, X.size)) 255 for i, vi in enumerate(values): 256 for j, vj in enumerate(values): 257 if vi == vj: 258 if vi in covar: 259 CM[N*i:N*i+N,N*j:N*j+N] = covar[vi] 260 else: 261 if f'{vi}_{vj}' in covar: 262 CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}'] 263 264 s = _np.diag(CM)**.5 265 s[s==0] = 1. 266 invs = _np.diag(s**-1) 267 268 if ( 269 validate_covar 270 and not ( 271 is_symmetric_positive_semidefinite(CM) 272 or is_symmetric_positive_semidefinite(invs @ CM @ invs) 273 ) 274 ): 275 raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.') 276 277 corvalues = uarray(_uc.correlated_values(X, CM)) 278 279 allvalues = nakedvalues 280 281 for i, x in enumerate(values): 282 allvalues[x] = corvalues[i*N:i*N+N] 283 284 return allvalues 285 286 287def read_data_from_file(filename: str | _os.PathLike, **kwargs): 288 ''' 289 Read correlated data from a CSV file. 
290 291 **Arguments** 292 - `filename`: `str` or path to the file to read from 293 - `kwargs`: passed to correldata.read_data() 294 ''' 295 with open(filename) as fid: 296 return read_data(fid.read(), **kwargs) 297 298 299def f2s( 300 x: Any, 301 f: (str | Callable | dict), 302 k: Hashable = None, 303 fb: (str | Callable) = 'z.6g', 304) -> str: 305 ''' 306 Format `x` according to format `f` 307 308 * If `f` is a string, return `f'{x:{f}}'` 309 * If `f` is a callable, return `f(x)` 310 * If `f` is a dict and optional argument `k` is a hashable, 311 return f2s(x, f[k]), otherwise return f2s(x, fb) 312 ''' 313 if isinstance (x, str): 314 return x 315 if isinstance (f, str): 316 return f'{x:{f}}' 317 if isinstance (f, Callable): 318 return f(x) 319 if isinstance (f, dict): 320 if k in f: 321 return f2s(x, f[k]) 322 if isinstance (fb, str): 323 return f'{x:{fb}}' 324 if isinstance (fb, Callable): 325 return fb(x) 326 raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.') 327 328 329 330def data_string( 331 data: dict, 332 sep: str = ',', 333 include_fields: list = None, 334 exclude_fields: list = [], 335 float_format: (str | dict | Callable) = 'z.6g', 336 correl_format: (str | dict | Callable) = 'z.6f', 337 default_float_format: (str | Callable) = 'z.6g', 338 default_correl_format: (str | Callable) = 'z.6f', 339 show_nv: bool = True, 340 show_se: bool = True, 341 show_correl: bool = True, 342 show_mixed_correl: bool = True, 343 align: str = '>', 344 atol: float = 1e-12, 345 rtol: float = 1e-12, 346): 347 ''' 348 Generate CSV-like string from correlated data 349 350 **Arguments** 351 - `data`: dict of arrays with strings, floats or correlated data 352 - `sep`: the CSV separator 353 - `include_fields`: subset of fields to write; if `None`, write all fields 354 - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); 355 to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo` 356 - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable 357 (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys 358 corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`). 359 - `correl_format`: same as `float_format`, but applies to correlation matrix elements 360 - `default_float_format`: only used when `float_format` is a dict; in that case, fields 361 missing from `float_format.keys()` will use `default_float_format` instead. 362 corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`). 
363 - `default_correl_format`: same as `default_float_format`, but applies to `correl_format` 364 - `show_nv`: show nominal values 365 - `show_se`: show standard errors 366 - `show_correl`: show correlations for any given field (ex: `correl_X`) 367 - `show_mixed_correl`: show correlations between different fields (ex: `correl_X_Y`) 368 - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values 369 - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 370 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 371 - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 372 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 373 374 375 **Example** 376 377 ```py 378 from correldata import _uc 379 from correldata import _np 380 from correldata import * 381 382 X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09)) 383 Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16)) 384 385 data = dict(X=X, Y=Y, Z=X+Y) 386 387 print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f')) 388 389 # yields: 390 # 391 # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, , 392 # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0 393 # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0 394 # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8 395 ``` 396 ''' 397 if include_fields is None: 398 include_fields = [_ for _ in data] 399 cols, ufields = [], [] 400 for f in include_fields: 401 if f in exclude_fields: 402 continue 403 if isinstance(data[f], uarray): 404 ufields.append(f) 405 N = data[f].size 406 if show_nv: 407 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n]) 408 if show_se and (f'SE_{f}' not in exclude_fields): 409 cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s]) 410 if show_correl and (f'correl_{f}' not in exclude_fields): 411 CM = _uc.correlation_matrix(data[f]) 412 if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol): 413 for i in range(N): 414 cols.append( 415 ['' if i else f'correl_{f}'] 416 + [ 417 f2s( 418 CM[i,j], 419 correl_format, 420 f, 421 default_correl_format, 422 ) 423 for j in range(N) 424 ] 425 ) 426 elif show_nv: 427 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]]) 428 429 if show_mixed_correl: 430 for i in range(len(ufields)): 431 for j in range(i): 432 if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields: 433 continue 434 CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:] 435 if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol): 436 for k in range(N): 437 cols.append( 438 ['' if k else f'correl_{ufields[j]}_{ufields[i]}'] 439 + [ 440 f2s( 441 CM[k,l], 442 correl_format, 443 f, 444 default_correl_format, 445 ) 446 for l in range(N) 447 ] 448 ) 449 450 lines = list(map(list, zip(*cols))) 451 452 if align: 453 lengths = [max([len(e) for e in l]) for l in cols] 454 for l in lines: 455 for k,ln in enumerate(lengths): 456 l[k] = f'{l[k]:{align}{ln}s}' 457 return '\n'.join([(sep+' ').join(l) for l in lines]) 458 459 return '\n'.join([sep.join(l) for l in lines]) 460 461 462 463def save_data_to_file(data, filename, **kwargs): 464 ''' 465 Write correlated data to a CSV file. 
466 467 **Arguments** 468 - `data`: dict of arrays with strings, floats or correlated data 469 - `filename`: `str` or path to the file to read from 470 - `kwargs`: passed to correldata.data_string() 471 ''' 472 with open(filename, 'w') as fid: 473 return fid.write(data_string(data, **kwargs))
`nv` is an alias for [`uncertainties.unumpy.nominal_values()`](https://pythonhosted.org/uncertainties/numpy_guide.html#uncertainties-and-nominal-values), whose upstream definition is:

```py
def nominal_values(arr):
    """
    Return the nominal values of the numbers in NumPy array arr.

    Elements that are not numbers with uncertainties (derived from a
    class from this module) are passed through untouched (because a
    numpy.array can contain numbers with uncertainties and pure floats
    simultaneously).

    If arr is of type unumpy.matrix, the returned array is a
    numpy.matrix, because the resulting matrix does not contain
    numbers with uncertainties.
    """

    return unumpy_to_numpy_matrix(to_nominal_values(arr))
```
```py
class uarray(_np.ndarray):

    __doc__ = """
    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
    of [ufloat](https://pypi.org/project/uncertainties) values
    """

    def __new__(cls, a):
        obj = _np.asarray(a).view(cls)
        return obj

    @property
    def nv(self):
        """Return the array of nominal values (read-only)."""
        return _uc.unumpy.nominal_values(_np.array(self))

    @property
    def se(self):
        """Return the array of standard errors (read-only)"""
        return _uc.unumpy.std_devs(_np.array(self))

    @property
    def correl(self):
        """Return the correlation matrix of the array elements (read-only)"""
        return _np.array(_uc.correlation_matrix(self))

    @property
    def covar(self):
        """Return the covariance matrix of the array elements (read-only)"""
        return _np.array(_uc.covariance_matrix(self))

    @property
    def mahalanobis(self):
        """Return the squared Mahalanobis distance from zero of the array (read-only)"""
        flatself = self.n.flatten().reshape((1, self.size))
        return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0, 0]

    n = nv
    "Alias for `uarray.nv`"

    s = se
    "Alias for `uarray.se`"

    cor = correl
    "Alias for `uarray.correl`"

    cov = covar
    "Alias for `uarray.covar`"

    m = mahalanobis
    "Alias for `uarray.mahalanobis`"
```
The class exposes the following read-only properties, each with a short alias (see the sketch after this list for usage):

- `uarray.nv` (alias `n`): the array of nominal values
- `uarray.se` (alias `s`): the array of standard errors
- `uarray.correl` (alias `cor`): the correlation matrix of the array elements
- `uarray.covar` (alias `cov`): the covariance matrix of the array elements
- `uarray.mahalanobis` (alias `m`): the squared Mahalanobis distance from zero of the array
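As a minimal sketch, using only the properties listed above and `uncertainties.correlated_values()`, a `uarray` may be built directly from a mean vector and a covariance matrix:

```py
import numpy as np
import uncertainties

from correldata import uarray

# two correlated values with standard errors of 0.1 and 0.2 and a correlation of 0.5
CM = np.array([[0.01, 0.01], [0.01, 0.04]])
X = uarray(uncertainties.correlated_values([1.0, 2.0], CM))

print(X.nv)           # [1. 2.]
print(X.se)           # [0.1 0.2]
print(X.cor)          # 2x2 correlation matrix, off-diagonal terms equal to 0.5
print(X.mahalanobis)  # squared Mahalanobis distance of (1, 2) from the origin
```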
```py
def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
    '''
    Test whether 2-D array `M` is symmetric and positive semidefinite.
    '''
    ev = _np.linalg.eigvals(M)
    return (
        _np.allclose(M, M.T)  # M is symmetric
        and _np.all(
            (ev > 0) | _np.isclose(ev, 0)
        )  # all eigenvalues are either real and strictly positive or close to zero
    )
```

Test whether 2-D array `M` is symmetric and positive semidefinite.
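For instance (a small sketch; this function is also used internally by `read_data()` when `validate_covar` is true):

```py
import numpy as np
from correldata import is_symmetric_positive_semidefinite

A = np.array([[4.0, 1.0], [1.0, 4.0]])  # symmetric, eigenvalues 3 and 5
B = np.array([[1.0, 2.0], [2.0, 1.0]])  # symmetric, but one eigenvalue is -1

print(is_symmetric_positive_semidefinite(A))  # True
print(is_symmetric_positive_semidefinite(B))  # False
```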
```py
def smart_type(s: str) -> (int | float | str):
    '''
    Try to convert string `s` to an `int`, or to a `float` if that fails.
    If both fail, return the original string unchanged.
    '''
    try: return int(s)
    except: pass
    try: return float(s)
    except: pass
    return s
```

Try to convert string `s` to an `int`, or to a `float` if that fails. If both fail, return the original string unchanged.
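A quick illustration of the fallback order:

```py
from correldata import smart_type

print(smart_type('42'))     # 42 (int)
print(smart_type('4.2e1'))  # 42.0 (float)
print(smart_type('42 kg'))  # 42 kg (returned unchanged as a string)
```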
````py
def read_data(data: str, sep: str = ',', validate_covar: bool = True):
    '''
    Read correlated data from a CSV-like string.

    Column names are interpreted in the following way:
    * In most cases, each column is converted to a dict value, with the corresponding
    dict key being the column's label.
    * Columns whose label starts with `SE` are interpreted as specifying the standard
    error for the latest preceding data column.
    * Columns whose label starts with `correl` are interpreted as specifying the
    correlation matrix for the latest preceding data column. In that case, column labels
    are ignored for the rest of the columns belonging to this matrix.
    * Columns whose label starts with `covar` are interpreted as specifying the
    covariance matrix for the latest preceding data column. In that case, column labels
    are ignored for the rest of the columns belonging to this matrix.
    * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
    the latest preceding data column, by adding an underscore followed by the variable's
    label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
    * `correl` and `covar` may also be specified for any pair of variables, by adding an
    underscore followed by the two variable labels, joined by a second underscore
    (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
    correspond, respectively, to the lines and columns of this matrix.
    * For any given variable, an exception will be raised:
        - when specifying both `covar` and any combination of (`SE`, `correl`)
        - when specifying `correl` without `SE`

    **Arguments**
    - `data`: a CSV-like string
    - `sep`: the CSV separator
    - `validate_covar`: whether to check that the overall covariance matrix
    is symmetric and positive semidefinite. Specifying `validate_covar = False`
    bypasses this computationally expensive step.

    **Example**
    ```py
    import correldata
    data = """
    Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48
    FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0
    BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0
    BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5
    """[1:-1]
    print(correldata.read_data(data))

    # yields:
    #
    # > {
    # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
    # 'Tacid': array([90., 90., 90.]),
    # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
    # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
    # }
    ```
    '''

    # split the input into rows of cells, converting each cell to int or float when possible
    data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
    N = len(data) - 1  # number of data rows

    # first pass: sort columns into values, standard errors, correlation and covariance matrices
    values, se, correl, covar = {}, {}, {}, {}
    j = 0
    while j < len(data[0]):
        field = data[0][j]
        if not (
            field.startswith('SE_')
            or field.startswith('correl_')
            or field.startswith('covar_')
            or field == 'SE'
            or field == 'correl'
            or field == 'covar'
            or len(field) == 0
        ):
            values[field] = _np.array([l[j] for l in data[1:]])
            j += 1
            oldfield = field
        elif field.startswith('SE_'):
            se[field[3:]] = _np.array([l[j] for l in data[1:]])
            j += 1
        elif field == 'SE':
            se[oldfield] = _np.array([l[j] for l in data[1:]])
            j += 1
        elif field.startswith('correl_'):
            correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
            j += N
        elif field == 'correl':
            correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
            j += N
        elif field.startswith('covar_'):
            covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
            j += N
        elif field == 'covar':
            covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
            j += N

    # variables with no associated uncertainties are returned as plain arrays
    nakedvalues = {}
    for k in [_ for _ in values]:
        if (
            k not in se
            and k not in correl
            and k not in covar
        ):
            nakedvalues[k] = values.pop(k)

    # consistency checks
    for x in values:
        if x in covar:
            if x in se:
                raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
            if x in correl:
                raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
        if x in correl:
            if x not in se:
                raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')

    # convert correlation matrices (including cross-correlations) to covariances
    for x in correl:
        if x in values:
            covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
        else:
            for x1 in values:
                for x2 in values:
                    if x == f'{x1}_{x2}':
                        if x1 in se:
                            se1 = se[x1]
                        else:
                            if x1 in covar:
                                se1 = _np.diag(covar[x1])**0.5
                            else:
                                raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
                        if x2 in se:
                            se2 = se[x2]
                        else:
                            if x2 in covar:
                                se2 = _np.diag(covar[x2])**0.5
                            else:
                                raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x2}".')

                        covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)

    # convert standard errors without correlations to diagonal covariances
    for x in se:
        if x in values and x not in correl:
            covar[x] = _np.diag(se[x]**2)

    # make cross-covariance blocks available in both orderings
    for k in [_ for _ in covar]:
        if k not in values:
            for j1 in values:
                for j2 in values:
                    if k == f'{j1}_{j2}':
                        covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T

    # assemble the full covariance matrix for all correlated variables
    X = _np.array([_ for k in values for _ in values[k]])
    CM = _np.zeros((X.size, X.size))
    for i, vi in enumerate(values):
        for j, vj in enumerate(values):
            if vi == vj:
                if vi in covar:
                    CM[N*i:N*i+N, N*j:N*j+N] = covar[vi]
            else:
                if f'{vi}_{vj}' in covar:
                    CM[N*i:N*i+N, N*j:N*j+N] = covar[f'{vi}_{vj}']

    s = _np.diag(CM)**.5
    s[s == 0] = 1.
    invs = _np.diag(s**-1)

    # optionally check that the full covariance matrix is symmetric positive semidefinite,
    # either directly or after rescaling it to a correlation-like matrix
    if (
        validate_covar
        and not (
            is_symmetric_positive_semidefinite(CM)
            or is_symmetric_positive_semidefinite(invs @ CM @ invs)
        )
    ):
        raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')

    # build correlated values and slice them back into one uarray per variable
    corvalues = uarray(_uc.correlated_values(X, CM))

    allvalues = nakedvalues

    for i, x in enumerate(values):
        allvalues[x] = corvalues[i*N:i*N+N]

    return allvalues
````
Read correlated data from a CSV-like string.

Column names are interpreted in the following way:

- In most cases, each column is converted to a dict value, with the corresponding dict key being the column's label.
- Columns whose label starts with `SE` are interpreted as specifying the standard error for the latest preceding data column.
- Columns whose label starts with `correl` are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
- Columns whose label starts with `covar` are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
- `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
- `correl` and `covar` may also be specified for any pair of variables, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.
- For any given variable, an exception will be raised:
    - when specifying both `covar` and any combination of (`SE`, `correl`)
    - when specifying `correl` without `SE`

**Arguments**

- `data`: a CSV-like string
- `sep`: the CSV separator
- `validate_covar`: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying `validate_covar = False` bypasses this computationally expensive step.

**Example**

```py
import correldata
data = """
Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48
FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0
BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0
BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5
"""[1:-1]
print(correldata.read_data(data))

# yields:
#
# > {
# 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
# 'Tacid': array([90., 90., 90.]),
# 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
# 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
# }
```
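The correlations specified in the input are carried by the returned `uarray` values. Continuing the example above (an illustrative sketch), the correlation matrix supplied for `D47` should be recovered from the output:

```py
D47 = correldata.read_data(data)['D47']

print(D47.correl)
# expected, up to floating-point noise:
# [[1.  0.5 0.5]
#  [0.5 1.  0.5]
#  [0.5 0.5 1. ]]
```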
```py
def read_data_from_file(filename: str | _os.PathLike, **kwargs):
    '''
    Read correlated data from a CSV file.

    **Arguments**
    - `filename`: `str` or path to the file to read from
    - `kwargs`: passed to correldata.read_data()
    '''
    with open(filename) as fid:
        return read_data(fid.read(), **kwargs)
```

Read correlated data from a CSV file.

**Arguments**

- `filename`: `str` or path to the file to read from
- `kwargs`: passed to `correldata.read_data()`
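A short sketch of reading from disk, assuming a file `data.csv` formatted like the `read_data()` example above:

```py
import correldata

data = correldata.read_data_from_file('data.csv')
print(data['D47'].nv, data['D47'].se)
```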
```py
def f2s(
    x: Any,
    f: (str | Callable | dict),
    k: Hashable = None,
    fb: (str | Callable) = 'z.6g',
) -> str:
    '''
    Format `x` according to format `f`

    * If `f` is a string, return `f'{x:{f}}'`
    * If `f` is a callable, return `f(x)`
    * If `f` is a dict and optional argument `k` is a hashable,
      return f2s(x, f[k]), otherwise return f2s(x, fb)
    '''
    if isinstance(x, str):
        return x
    if isinstance(f, str):
        return f'{x:{f}}'
    if isinstance(f, Callable):
        return f(x)
    if isinstance(f, dict):
        if k in f:
            return f2s(x, f[k])
        if isinstance(fb, str):
            return f'{x:{fb}}'
        if isinstance(fb, Callable):
            return fb(x)
    raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')
```

Format `x` according to format `f`:

- If `f` is a string, return `f'{x:{f}}'`
- If `f` is a callable, return `f(x)`
- If `f` is a dict and optional argument `k` is a hashable key of `f`, return `f2s(x, f[k])`; otherwise return `f2s(x, fb)`
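For illustration (note that the `'z'` option used in the default format specs, which normalizes negative zero, requires Python 3.11 or later):

```py
from correldata import f2s

print(f2s(0.123456789, '.3f'))                      # 0.123
print(f2s(2.5, lambda x: f'{x:.1e}'))               # 2.5e+00
print(f2s(2.5, {'foo': '.2f'}, k='foo'))            # 2.50
print(f2s(2.5, {'foo': '.2f'}, k='bar', fb='.4f'))  # 2.5000 (falls back to fb)
print(f2s('already a string', '.3f'))               # strings are passed through unchanged
```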
````py
def data_string(
    data: dict,
    sep: str = ',',
    include_fields: list = None,
    exclude_fields: list = [],
    float_format: (str | dict | Callable) = 'z.6g',
    correl_format: (str | dict | Callable) = 'z.6f',
    default_float_format: (str | Callable) = 'z.6g',
    default_correl_format: (str | Callable) = 'z.6f',
    show_nv: bool = True,
    show_se: bool = True,
    show_correl: bool = True,
    show_mixed_correl: bool = True,
    align: str = '>',
    atol: float = 1e-12,
    rtol: float = 1e-12,
):
    '''
    Generate CSV-like string from correlated data

    **Arguments**
    - `data`: dict of arrays with strings, floats or correlated data
    - `sep`: the CSV separator
    - `include_fields`: subset of fields to write; if `None`, write all fields
    - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
    to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
    - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
    (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys
    corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
    - `correl_format`: same as `float_format`, but applies to correlation matrix elements
    - `default_float_format`: only used when `float_format` is a dict; in that case, fields
    missing from `float_format.keys()` will use `default_float_format` instead.
    - `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
    - `show_nv`: show nominal values
    - `show_se`: show standard errors
    - `show_correl`: show correlations for any given field (ex: `correl_X`)
    - `show_mixed_correl`: show correlations between different fields (ex: `correl_X_Y`)
    - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
    - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
    when deciding whether a matrix is equal to the identity matrix or to the zero matrix
    - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
    when deciding whether a matrix is equal to the identity matrix or to the zero matrix


    **Example**

    ```py
    from correldata import _uc
    from correldata import _np
    from correldata import *

    X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
    Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

    data = dict(X=X, Y=Y, Z=X+Y)

    print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

    # yields:
    #
    # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, ,
    # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0
    # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0
    # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8
    ```
    '''
    if include_fields is None:
        include_fields = [_ for _ in data]
    cols, ufields = [], []
    # one column (or block of columns) per field
    for f in include_fields:
        if f in exclude_fields:
            continue
        if isinstance(data[f], uarray):
            ufields.append(f)
            N = data[f].size
            if show_nv:
                cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
            if show_se and (f'SE_{f}' not in exclude_fields):
                cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
            if show_correl and (f'correl_{f}' not in exclude_fields):
                CM = _uc.correlation_matrix(data[f])
                if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
                    for i in range(N):
                        cols.append(
                            ['' if i else f'correl_{f}']
                            + [
                                f2s(
                                    CM[i, j],
                                    correl_format,
                                    f,
                                    default_correl_format,
                                )
                                for j in range(N)
                            ]
                        )
        elif show_nv:
            cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])

    if show_mixed_correl:
        # correlation blocks between pairs of distinct uarray fields
        for i in range(len(ufields)):
            for j in range(i):
                if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
                    continue
                CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
                if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
                    for k in range(N):
                        cols.append(
                            ['' if k else f'correl_{ufields[j]}_{ufields[i]}']
                            + [
                                f2s(
                                    CM[k, l],
                                    correl_format,
                                    f,
                                    default_correl_format,
                                )
                                for l in range(N)
                            ]
                        )

    # transpose columns into lines and optionally align the cells
    lines = list(map(list, zip(*cols)))

    if align:
        lengths = [max([len(e) for e in l]) for l in cols]
        for l in lines:
            for k, ln in enumerate(lengths):
                l[k] = f'{l[k]:{align}{ln}s}'
        return '\n'.join([(sep + ' ').join(l) for l in lines])

    return '\n'.join([sep.join(l) for l in lines])
````
Generate a CSV-like string from correlated data.

**Arguments**

- `data`: dict of arrays with strings, floats or correlated data
- `sep`: the CSV separator
- `include_fields`: subset of fields to write; if `None`, write all fields
- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
- `correl_format`: same as `float_format`, but applies to correlation matrix elements
- `default_float_format`: only used when `float_format` is a dict; in that case, fields missing from `float_format.keys()` will use `default_float_format` instead.
- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
- `show_nv`: show nominal values
- `show_se`: show standard errors
- `show_correl`: show correlations for any given field (ex: `correl_X`)
- `show_mixed_correl`: show correlations between different fields (ex: `correl_X_Y`)
- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) when deciding whether a matrix is equal to the identity matrix or to the zero matrix
- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) when deciding whether a matrix is equal to the identity matrix or to the zero matrix

**Example**

```py
from correldata import _uc
from correldata import _np
from correldata import *

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

data = dict(X=X, Y=Y, Z=X+Y)

print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

# yields:
#
# X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, ,
# 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0
# 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0
# 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8
```
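As a further illustrative sketch, per-field formats and field exclusion might be combined as follows (reusing `data` from the example above; the exact output is not shown here):

```py
# two decimal places for X, default format elsewhere,
# and no correlation columns between Y and Z
print(
    data_string(
        data,
        float_format = {'X': 'z.2f'},
        exclude_fields = ['correl_Y_Z'],
    )
)
```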
```py
def save_data_to_file(data, filename, **kwargs):
    '''
    Write correlated data to a CSV file.

    **Arguments**
    - `data`: dict of arrays with strings, floats or correlated data
    - `filename`: `str` or path to the file to write to
    - `kwargs`: passed to correldata.data_string()
    '''
    with open(filename, 'w') as fid:
        return fid.write(data_string(data, **kwargs))
```

Write correlated data to a CSV file.

**Arguments**

- `data`: dict of arrays with strings, floats or correlated data
- `filename`: `str` or path to the file to write to
- `kwargs`: passed to `correldata.data_string()`
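A final sketch of a round trip through the file system (the file names are arbitrary; `align = ''` is simply passed through to `data_string()`):

```py
import correldata

data = correldata.read_data_from_file('data.csv')
correldata.save_data_to_file(data, 'data_copy.csv', align = '')
```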