correldata
Read/write vectors of correlated data from/to a CSV file.
These data are stored in a dictionary whose values are numpy arrays, with elements that may be strings, floats, or floats with associated uncertainties as defined in the [uncertainties](https://pypi.org/project/uncertainties) library.
1""" 2Read/write vectors of correlated data from/to a csv file. 3 4These data are stored in a dictionary, whose values are numpy arrays 5with elements which may be strings, floats, or floats with associated uncertainties 6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library. 7""" 8 9 10__author__ = 'Mathieu Daëron' 11__contact__ = 'mathieu@daeron.fr' 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron' 13__license__ = 'MIT License - https://opensource.org/licenses/MIT' 14__date__ = '2024-10-15' 15__version__ = '1.3.0' 16 17 18import os as _os 19import numpy as _np 20import uncertainties as _uc 21 22from typing import Callable, Hashable, Any 23 24class uarray(_np.ndarray): 25 26 __doc__ = """ 27 1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) 28 of [ufloat](https://pypi.org/project/uncertainties) values 29 """ 30 31 def __new__(cls, a): 32 obj = _np.asarray(a).view(cls) 33 return obj 34 35 @property 36 def nv(self): 37 """Return the array of nominal values (read-only).""" 38 return _uc.unumpy.nominal_values(_np.array(self)) 39 40 @property 41 def se(self): 42 """Return the array of standard errors (read-only)""" 43 return _uc.unumpy.std_devs(_np.array(self)) 44 45 @property 46 def correl(self): 47 """Return the correlation matrix of the array elements (read-only)""" 48 return _np.array(_uc.correlation_matrix(self)) 49 50 @property 51 def covar(self): 52 """Return the covariance matrix of the array elements (read-only)""" 53 return _np.array(_uc.covariance_matrix(self)) 54 55 @property 56 def mahalanobis(self): 57 """Return the squared Mahalanobis distance from zero of the array (read-only)""" 58 flatself = self.n.flatten().reshape((1, self.size)) 59 return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0] 60 61 n = nv 62 "Alias for `uarray.nv`" 63 64 s = se 65 "Alias for `uarray.se`" 66 67 cor = correl 68 "Alias for `uarray.correl`" 69 70 cov = covar 71 "Alias for `uarray.covar`" 72 73 m = mahalanobis 74 "Alias for `uarray.mahalanobis`" 75 76 77def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool: 78 ''' 79 Test whether 2-D array `M` is symmetric and positive semidefinite. 80 ''' 81 ev = _np.linalg.eigvals(M) 82 return ( 83 _np.allclose(M, M.T) # M is symmetric 84 and _np.all( 85 (ev > 0) | _np.isclose(ev, 0) 86 ) # all eignevalues are either real and strictly positive or close to zero 87 ) 88 89 90def smart_type(s: str) -> (int | float | str): 91 ''' 92 Tries to convert string `s` to an `int`, or to an `float` if that fails. 93 If both fail, return the original string unchanged. 94 ''' 95 try: return int(s) 96 except: pass 97 try: return float(s) 98 except: pass 99 return s 100 101 102def read_data(data: str, sep: str = ',', validate_covar: bool = True): 103 ''' 104 Read correlated data from a CSV-like string. 105 106 Column names are interpreted in the following way: 107 * In most cases, each columns is converted to a dict value, with the corresponding 108 dict key being the column's label. 109 * Columns whose label starts with `SE` are interpreted as specifying the standard 110 error for the latest preceding data column. 111 * Columns whose label starts with `correl` are interpreted as specifying the 112 correlation matrix for the latest preceding data column. In that case, column labels 113 are ignored for the rest of the columns belonging to this matrix. 114 * Columns whose label starts with `covar` are interpreted as specifying the 115 covariance matrix for the latest preceding data column. 
In that case, column labels 116 are ignored for the rest of the columns belonging to this matrix. 117 * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than 118 the latest preceding data column, by adding an underscore followed by the variable's 119 label (ex: `SE_foo`, `correl_bar`, `covar_baz`). 120 * `correl`, and `covar` may also be specified for any pair of variable, by adding an 121 underscore followed by the two variable labels, joined by a second underscore 122 (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables 123 correspond, respectively, to the lines and columns of this matrix. 124 * Exceptions will be raised, for any given variable: 125 - when specifying both `covar` and any combination of (`SE`, `correl`) 126 - when specifying `correl` without `SE` 127 128 **Arguments** 129 - `data`: a CSV-like string 130 - `sep`: the CSV separator 131 - `validate_covar`: whether to check that the overall covariance matrix 132 is symmetric and positive semidefinite. Specifying `validate_covar = False` 133 bypasses this computationally expensive step. 134 135 **Example** 136 ```py 137 import correldata 138 data = """ 139 Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48 140 FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0 141 BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0 142 BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5 143 """[1:-1] 144 print(correldata.read_data(data)) 145 146 # yields: 147 # 148 # > { 149 # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'), 150 # 'Tacid': array([90., 90., 90.]), 151 # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object), 152 # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object) 153 # } 154 ``` 155 ''' 156 157 data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')] 158 N = len(data) - 1 159 160 values, se, correl, covar = {}, {}, {}, {} 161 j = 0 162 while j < len(data[0]): 163 field = data[0][j] 164 if not ( 165 field.startswith('SE_') 166 or field.startswith('correl_') 167 or field.startswith('covar_') 168 or field == 'SE' 169 or field == 'correl' 170 or field == 'covar' 171 or len(field) == 0 172 ): 173 values[field] = _np.array([l[j] for l in data[1:]]) 174 j += 1 175 oldfield = field 176 elif field.startswith('SE_'): 177 se[field[3:]] = _np.array([l[j] for l in data[1:]]) 178 j += 1 179 elif field == 'SE': 180 se[oldfield] = _np.array([l[j] for l in data[1:]]) 181 j += 1 182 elif field.startswith('correl_'): 183 correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]]) 184 j += N 185 elif field == 'correl': 186 correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 187 j += N 188 elif field.startswith('covar_'): 189 covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]]) 190 j += N 191 elif field == 'covar': 192 covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 193 j += N 194 195 nakedvalues = {} 196 for k in [_ for _ in values]: 197 if ( 198 k not in se 199 and k not in correl 200 and k not in covar 201 ): 202 nakedvalues[k] = values.pop(k) 203 204 for x in values: 205 if x in covar: 206 if x in se: 207 raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".') 208 if x in correl: 209 raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".') 210 if x in correl: 211 if x not in se: 
212 raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".') 213 214 for x in correl: 215 if x in values: 216 covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x]) 217 else: 218 for x1 in values: 219 for x2 in values: 220 if x == f'{x1}_{x2}': 221 if x1 in se: 222 se1 = se[x1] 223 else: 224 if x1 in covar: 225 se1 = _np.diag(covar[x1])**0.5 226 else: 227 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 228 if x2 in se: 229 se2 = se[x2] 230 else: 231 if x2 in covar: 232 se2 = _np.diag(covar[x2])**0.5 233 else: 234 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 235 236 covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2) 237 238 for x in se: 239 if x in values and x not in correl: 240 covar[x] = _np.diag(se[x]**2) 241 242 for k in [_ for _ in covar]: 243 if k not in values: 244 for j1 in values: 245 for j2 in values: 246 if k == f'{j1}_{j2}': 247 covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T 248 249 X = _np.array([_ for k in values for _ in values[k]]) 250 CM = _np.zeros((X.size, X.size)) 251 for i, vi in enumerate(values): 252 for j, vj in enumerate(values): 253 if vi == vj: 254 if vi in covar: 255 CM[N*i:N*i+N,N*j:N*j+N] = covar[vi] 256 else: 257 if f'{vi}_{vj}' in covar: 258 CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}'] 259 260 s = _np.diag(CM)**.5 261 s[s==0] = 1. 262 invs = _np.diag(s**-1) 263 264 if ( 265 validate_covar 266 and not ( 267 is_symmetric_positive_semidefinite(CM) 268 or is_symmetric_positive_semidefinite(invs @ CM @ invs) 269 ) 270 ): 271 raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.') 272 273 corvalues = uarray(_uc.correlated_values(X, CM)) 274 275 allvalues = nakedvalues 276 277 for i, x in enumerate(values): 278 allvalues[x] = corvalues[i*N:i*N+N] 279 280 return allvalues 281 282 283def read_data_from_file(filename: str | _os.PathLike, **kwargs): 284 ''' 285 Read correlated data from a CSV file. 
286 287 **Arguments** 288 - `filename`: `str` or path to the file to read from 289 - `kwargs`: passed to correldata.read_data() 290 ''' 291 with open(filename) as fid: 292 return read_data(fid.read(), **kwargs) 293 294 295def f2s( 296 x: Any, 297 f: (str | Callable | dict), 298 k: Hashable = None, 299 fb: (str | Callable) = 'z.6g', 300) -> str: 301 ''' 302 Format `x` according to format `f` 303 304 * If `f` is a string, return `f'{x:{f}}'` 305 * If `f` is a callable, return `f(x)` 306 * If `f` is a dict and optional argument `k` is a hashable, 307 return f2s(x, f[k]), otherwise return f2s(x, fb) 308 ''' 309 310 if isinstance (x, str): 311 return x 312 if isinstance (f, str): 313 return f'{x:{f}}' 314 if isinstance (f, Callable): 315 return f(x) 316 if isinstance (f, dict): 317 if k in f: 318 return f2s(x, f[k]) 319 if isinstance (fb, str): 320 return f'{x:{fb}}' 321 if isinstance (fb, Callable): 322 return fb(x) 323 raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.') 324 325 326 327def data_string( 328 data: dict, 329 sep: str = ',', 330 include_fields: list = None, 331 exclude_fields: list = [], 332 float_format: (str | dict | Callable) = 'z.6g', 333 correl_format: (str | dict | Callable) = 'z.6f', 334 default_float_format: (str | Callable) = 'z.6g', 335 default_correl_format: (str | Callable) = 'z.6f', 336 align: str = '>', 337 atol: float = 1e-12, 338 rtol: float = 1e-12, 339): 340 ''' 341 Generate CSV-like string from correlated data 342 343 **Arguments** 344 - `data`: dict of arrays with strings, floats or correlated data 345 - `sep`: the CSV separator 346 - `include_fields`: subset of fields to write; if `None`, write all fields 347 - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); 348 to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo` 349 - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable 350 (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys 351 corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`). 352 - `correl_format`: same as `float_format`, but applies to correlation matrix elements 353 - `default_float_format`: only used when `float_format` is a dict; in that case, fields 354 missing from `float_format.keys()` will use `default_float_format` instead. 355 corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`). 
356 - `default_correl_format`: same as `default_float_format`, but applies to `correl_format` 357 - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values 358 - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 359 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 360 - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 361 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 362 363 364 **Example** 365 366 ```py 367 from correldata import _uc 368 from correldata import _np 369 from correldata import * 370 371 X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09)) 372 Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16)) 373 374 data = dict(X=X, Y=Y, Z=X+Y) 375 376 print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f')) 377 378 # yields: 379 # 380 # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, , 381 # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0 382 # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0 383 # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8 384 ``` 385 ''' 386 if include_fields is None: 387 include_fields = [_ for _ in data] 388 cols, ufields = [], [] 389 for f in include_fields: 390 if f in exclude_fields: 391 continue 392 if isinstance(data[f], uarray): 393 ufields.append(f) 394 N = data[f].size 395 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n]) 396 if f'SE_{f}' not in exclude_fields: 397 cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s]) 398 if f'correl_{f}' not in exclude_fields: 399 CM = _uc.correlation_matrix(data[f]) 400 if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol): 401 for i in range(N): 402 cols.append( 403 ['' if i else f'correl_{f}'] 404 + [ 405 f2s( 406 CM[i,j], 407 correl_format, 408 f, 409 default_correl_format, 410 ) 411 for j in range(N) 412 ] 413 ) 414 415 else: 416 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]]) 417 418 for i in range(len(ufields)): 419 for j in range(i): 420 if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields: 421 continue 422 CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:] 423 if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol): 424 for k in range(N): 425 cols.append( 426 ['' if k else f'correl_{ufields[j]}_{ufields[i]}'] 427 + [ 428 f2s( 429 CM[k,l], 430 correl_format, 431 f, 432 default_correl_format, 433 ) 434 for l in range(N) 435 ] 436 ) 437 438 lines = list(map(list, zip(*cols))) 439 440 if align: 441 lengths = [max([len(e) for e in l]) for l in cols] 442 for l in lines: 443 for k,ln in enumerate(lengths): 444 l[k] = f'{l[k]:{align}{ln}s}' 445 return '\n'.join([(sep+' ').join(l) for l in lines]) 446 447 return '\n'.join([sep.join(l) for l in lines]) 448 449 450 451def save_data_to_file(data, filename, **kwargs): 452 ''' 453 Write correlated data to a CSV file. 454 455 **Arguments** 456 - `data`: dict of arrays with strings, floats or correlated data 457 - `filename`: `str` or path to the file to read from 458 - `kwargs`: passed to correldata.data_string() 459 ''' 460 with open(filename, 'w') as fid: 461 return fid.write(data_string(data, **kwargs))
`class uarray(_np.ndarray)`

1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) of [ufloat](https://pypi.org/project/uncertainties) values.

Read-only properties:
- `uarray.nv`: the array of nominal values
- `uarray.se`: the array of standard errors
- `uarray.correl`: the correlation matrix of the array elements
- `uarray.covar`: the covariance matrix of the array elements
- `uarray.mahalanobis`: the squared Mahalanobis distance from zero of the array
- `uarray.n`, `uarray.s`, `uarray.cor`, `uarray.cov`, `uarray.m`: aliases for `nv`, `se`, `correl`, `covar`, and `mahalanobis`, respectively
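As a minimal sketch (the two-element covariance matrix below is hypothetical, chosen only for illustration), a `uarray` can be built from the output of `uncertainties.correlated_values()` and queried through the properties above:

```py
import uncertainties as uc
from correldata import uarray

# Hypothetical inputs: two correlated values with variances 0.04 and 0.09
# and covariance 0.02
u = uarray(uc.correlated_values([1.0, 2.0], [[0.04, 0.02], [0.02, 0.09]]))

print(u.n)            # nominal values:  [1. 2.]
print(u.s)            # standard errors: [0.2 0.3]
print(u.correl)       # 2x2 correlation matrix, off-diagonal = 0.02 / (0.2 * 0.3)
print(u.mahalanobis)  # squared Mahalanobis distance of (1., 2.) from the origin
```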
`is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool`

Test whether 2-D array `M` is symmetric and positive semidefinite.
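For instance (illustrative matrices only):

```py
import numpy as np
from correldata import is_symmetric_positive_semidefinite

is_symmetric_positive_semidefinite(np.array([[1.0, 0.5], [0.5, 1.0]]))  # True
is_symmetric_positive_semidefinite(np.array([[1.0, 2.0], [2.0, 1.0]]))  # False: one eigenvalue is -1
is_symmetric_positive_semidefinite(np.array([[1.0, 1.0], [0.0, 1.0]]))  # False: not symmetric
```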
`smart_type(s: str) -> (int | float | str)`

Tries to convert string `s` to an `int`, or to a `float` if that fails. If both fail, return the original string unchanged.
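For instance:

```py
from correldata import smart_type

smart_type('42')      # 42 (int)
smart_type('.245')    # 0.245 (float)
smart_type('4e-4')    # 0.0004 (float)
smart_type('FOO')     # 'FOO' (returned unchanged)
```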
`read_data(data: str, sep: str = ',', validate_covar: bool = True)`

Read correlated data from a CSV-like string.

Column names are interpreted in the following way:
- In most cases, each column is converted to a dict value, with the corresponding dict key being the column's label.
- Columns whose label starts with `SE` are interpreted as specifying the standard error for the latest preceding data column.
- Columns whose label starts with `correl` are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
- Columns whose label starts with `covar` are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
- `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
- `correl` and `covar` may also be specified for any pair of variables, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.
- Exceptions will be raised, for any given variable:
    - when specifying both `covar` and any combination of (`SE`, `correl`)
    - when specifying `correl` without `SE`

**Arguments**
- `data`: a CSV-like string
- `sep`: the CSV separator
- `validate_covar`: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying `validate_covar = False` bypasses this computationally expensive step.

**Example**
```py
import correldata
data = """
Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48
FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0
BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0
BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5
"""[1:-1]
print(correldata.read_data(data))

# yields:
#
# > {
#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
#     'Tacid': array([90., 90., 90.]),
#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
#   }
```
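A second, smaller sketch (with made-up values) using the `SE_foo` and `correl_foo_bar` column conventions instead of the bare `SE`, `correl`, and `covar` labels; here each row of `X` is correlated (r = 0.5) with the corresponding row of `Y`:

```py
import correldata

data = """
Sample, X, SE_X, Y, SE_Y, correl_X_Y,
A, 1.0, 0.1, 2.0, 0.2, 0.5, 0
B, 1.1, 0.1, 2.1, 0.2, 0, 0.5
"""[1:-1]

d = correldata.read_data(data)
# d['Sample'] is a plain array; d['X'] and d['Y'] are uarrays with standard errors
# of 0.1 and 0.2, and a correlation of 0.5 between X and Y within each row.
```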
`read_data_from_file(filename: str | _os.PathLike, **kwargs)`

Read correlated data from a CSV file.

**Arguments**
- `filename`: `str` or path to the file to read from
- `kwargs`: passed to `correldata.read_data()`
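For example (the file name is hypothetical):

```py
import correldata

# 'measurements.csv' is assumed to follow the column conventions of read_data()
data = correldata.read_data_from_file('measurements.csv')
```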
`f2s(x: Any, f: (str | Callable | dict), k: Hashable = None, fb: (str | Callable) = 'z.6g') -> str`

Format `x` according to format `f`. If `x` is a string, it is returned unchanged.

- If `f` is a string, return `f'{x:{f}}'`
- If `f` is a callable, return `f(x)`
- If `f` is a dict and optional argument `k` is a hashable, return `f2s(x, f[k])`; otherwise return `f2s(x, fb)`
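A few illustrative calls (the field name `'D47'` and the formats shown are arbitrary):

```py
from correldata import f2s

f2s(0.1234, '.3f')                            # '0.123'
f2s(0.1234, lambda x: f'{x*1000:.0f} ppm')    # '123 ppm'
f2s(0.1234, {'D47': '.2e'}, 'D47')            # '1.23e-01' (key found in the dict)
f2s(0.1234, {'D47': '.2e'}, 'D48', fb='.2f')  # '0.12' (falls back to fb)
```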
`data_string(data: dict, sep: str = ',', include_fields: list = None, exclude_fields: list = [], float_format: (str | dict | Callable) = 'z.6g', correl_format: (str | dict | Callable) = 'z.6f', default_float_format: (str | Callable) = 'z.6g', default_correl_format: (str | Callable) = 'z.6f', align: str = '>', atol: float = 1e-12, rtol: float = 1e-12)`

Generate a CSV-like string from correlated data.

**Arguments**
- `data`: dict of arrays with strings, floats or correlated data
- `sep`: the CSV separator
- `include_fields`: subset of fields to write; if `None`, write all fields
- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`)
- `correl_format`: same as `float_format`, but applies to correlation matrix elements
- `default_float_format`: only used when `float_format` is a dict; in that case, fields missing from `float_format.keys()` will use `default_float_format` instead
- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) when deciding whether a matrix is equal to the identity matrix or to the zero matrix
- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) when deciding whether a matrix is equal to the identity matrix or to the zero matrix

**Example**
```py
from correldata import _uc
from correldata import _np
from correldata import *

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

data = dict(X=X, Y=Y, Z=X+Y)

print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

# yields:
#
# X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, ,
# 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0
# 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0
# 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8
```
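Continuing the example above, a sketch of the per-field formatting and exclusion options (the field names are those defined above; the formats chosen are arbitrary):

```py
print(data_string(
    data,
    float_format = {'X': '.2f'},              # only X uses '.2f'...
    default_float_format = '.1f',             # ...other fields fall back to '.1f'
    correl_format = '.2f',                    # applies to all correlation matrix elements
    exclude_fields = ['SE_Y', 'correl_X_Z'],  # omit Y's SE column and the X-Z correlation block
))
```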
`save_data_to_file(data, filename, **kwargs)`

Write correlated data to a CSV file.

**Arguments**
- `data`: dict of arrays with strings, floats or correlated data
- `filename`: `str` or path to the file to write to
- `kwargs`: passed to `correldata.data_string()`
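A round-trip sketch combining the two I/O helpers, using the `data` dict from the `data_string()` example (the file name is hypothetical):

```py
import correldata

# Write the dict of (possibly correlated) arrays to disk, then read it back;
# uncertainties and correlations survive the CSV round trip up to formatting precision.
correldata.save_data_to_file(data, 'out.csv')
data2 = correldata.read_data_from_file('out.csv')
```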