correldata

Read/write vectors of correlated data from/to a CSV file.

These data are stored in a dictionary whose values are numpy arrays; array elements may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.
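A quick-start sketch (assuming the package is installed and importable as correldata; the column names X and SE_X below are arbitrary examples, not part of the library):

import correldata

csv_data = """
X, SE_X
1.0, 0.1
2.0, 0.2
"""[1:-1]

data = correldata.read_data(csv_data)
print(data['X'].n)       # nominal values: [1. 2.]
print(data['X'].s)       # standard errors: [0.1 0.2]
print(data['X'].correl)  # 2x2 correlation matrix (identity here, since no correlations were given)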

  1"""
  2Read/write vectors of correlated data from/to a csv file.
  3
  4These data are stored in a dictionary, whose values are numpy arrays
  5with elements which may be strings, floats, or floats with associated uncertainties
  6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library.
  7"""
  8
  9
 10__author__    = 'Mathieu Daëron'
 11__contact__   = 'mathieu@daeron.fr'
 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron'
 13__license__   = 'MIT License - https://opensource.org/licenses/MIT'
 14__date__      = '2024-10-11'
 15__version__   = '1.2.0'
 16
 17
 18import os as _os
 19import numpy as _np
 20import uncertainties as _uc
 21
 22from typing import Callable, Hashable, Any
 23
class uarray(_np.ndarray):

    __doc__ = """
    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
    of [ufloat](https://pypi.org/project/uncertainties) values
    """

    def __new__(cls, a):
        obj = _np.asarray(a).view(cls)
        return obj

    n = property(fget = _np.vectorize(lambda x : x.n))
    """Return the array of nominal values (read-only)."""

    s = property(fget = _np.vectorize(lambda x : x.s))
    """Return the array of standard errors (read-only)."""

    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
    """Return the correlation matrix of the array elements (read-only)."""

    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
    """Return the covariance matrix of the array elements (read-only)."""

    nv = n
    "Alias for `uarray.n`"

    se = s
    "Alias for `uarray.s`"

    cor = correl
    "Alias for `uarray.correl`"

    cov = covar
    "Alias for `uarray.covar`"


def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
	'''
	Test whether 2-D array `M` is symmetric and positive semidefinite.
	'''
	return _np.all(M - M.T == 0) and _np.all(_np.linalg.eigvals(M) >= 0)


def smart_type(x: str):
	'''
	Try to convert string `x` to an integer if it looks like one (no decimal
	point, integer value), or to a float otherwise. If the conversion fails,
	return the original string unchanged.
	'''
	try:
		y = float(x)
	except ValueError:
		return x
	if y % 1 == 0 and '.' not in x:
		return int(y)
	return y


def read_data(data: str, sep: str = ',', validate_covar: bool = True):
	'''
	Read correlated data from a CSV-like string.

	Column names are interpreted in the following way:
	* In most cases, each column is converted to a dict value, with the corresponding
	dict key being the column's label.
	* Columns whose label starts with `SE` are interpreted as specifying the standard
	error for the latest preceding data column.
	* Columns whose label starts with `correl` are interpreted as specifying the
	correlation matrix for the latest preceding data column. In that case, column labels
	are ignored for the rest of the columns belonging to this matrix.
	* Columns whose label starts with `covar` are interpreted as specifying the
	covariance matrix for the latest preceding data column. In that case, column labels
	are ignored for the rest of the columns belonging to this matrix.
	* `SE`, `correl`, and `covar` may be specified for any variable other than
	the latest preceding data column, by adding an underscore followed by the variable's
	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
	* `correl` and `covar` may also be specified for any pair of variables, by adding an
	underscore followed by the two variable labels, joined by a second underscore
	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
	correspond, respectively, to the lines and columns of this matrix.
	* Exceptions will be raised, for any given variable:
		- when specifying both `covar` and any combination of (`SE`, `correl`)
		- when specifying `correl` without `SE`

	**Arguments**
	- `data`: a CSV-like string
	- `sep`: the CSV separator
	- `validate_covar`: whether to check that the overall covariance matrix
	is symmetric and positive semidefinite. Specifying `validate_covar = False`
	bypasses this computationally expensive step.

	**Example**
	```py
	import correldata
	data  = """
	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
	"""[1:-1]
	print(correldata.read_data(data))

	# yields:
	# 
	# > {
	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
	#     'Tacid': array([90., 90., 90.]),
	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
	#   }
	```
	'''

	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
	N = len(data) - 1

	values, se, correl, covar = {}, {}, {}, {}
	j = 0
	while j < len(data[0]):
		field = data[0][j]
		if not (
			field.startswith('SE_')
			or field.startswith('correl_')
			or field.startswith('covar_')
			or field == 'SE'
			or field == 'correl'
			or field == 'covar'
			or len(field) == 0
		):
			values[field] = _np.array([l[j] for l in data[1:]])
			j += 1
			oldfield = field
		elif field.startswith('SE_'):
			se[field[3:]] = _np.array([l[j] for l in data[1:]])
			j += 1
		elif field == 'SE':
			se[oldfield] = _np.array([l[j] for l in data[1:]])
			j += 1
		elif field.startswith('correl_'):
			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field == 'correl':
			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field.startswith('covar_'):
			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field == 'covar':
			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
			j += N

	nakedvalues = {}
	for k in [_ for _ in values]:
		if (
			k not in se
			and k not in correl
			and k not in covar
		):
			nakedvalues[k] = values.pop(k)

	for x in values:
		if x in covar:
			if x in se:
				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
			if x in correl:
				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
		if x in correl:
			if x not in se:
				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')

	for x in correl:
		if x in values:
			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
		else:
			for x1 in values:
				for x2 in values:
					if x == f'{x1}_{x2}':
						if x1 in se:
							se1 = se[x1]
						else:
							if x1 in covar:
								se1 = _np.diag(covar[x1])**0.5
							else:
								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
						if x2 in se:
							se2 = se[x2]
						else:
							if x2 in covar:
								se2 = _np.diag(covar[x2])**0.5
							else:
								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x2}".')

						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)

	for x in se:
		if x in values and x not in correl:
			covar[x] = _np.diag(se[x]**2)

	for k in [_ for _ in covar]:
		if k not in values:
			for j1 in values:
				for j2 in values:
					if k == f'{j1}_{j2}':
						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T

	X = _np.array([_ for k in values for _ in values[k]])
	CM = _np.zeros((X.size, X.size))
	for i, vi in enumerate(values):
		for j, vj in enumerate(values):
			if vi == vj:
				if vi in covar:
					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
			else:
				if f'{vi}_{vj}' in covar:
					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']

	if validate_covar and not is_symmetric_positive_semidefinite(CM):
		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')

	corvalues = uarray(_uc.correlated_values(X, CM))

	allvalues = nakedvalues

	for i, x in enumerate(values):
		allvalues[x] = corvalues[i*N:i*N+N]

	return allvalues


def read_data_from_file(filename: str | _os.PathLike, **kwargs):
	'''
	Read correlated data from a CSV file.

	**Arguments**
	- `filename`: `str` or path to the file to read from
	- `kwargs`: passed to correldata.read_data()
	'''
	with open(filename) as fid:
		return read_data(fid.read(), **kwargs)


def f2s(
	x: Any,
	f: (str | Callable | dict),
	k: Hashable = None,
	fb: (str | Callable) = 'z.6g',
) -> str:
	'''
	Format `x` according to format `f`.

	* If `f` is a string, return `f'{x:{f}}'`
	* If `f` is a callable, return `f(x)`
	* If `f` is a dict and `k` is one of its keys, return `f2s(x, f[k])`;
	  otherwise format `x` using the fallback format `fb`.
	'''

	if isinstance(x, str):
		return x
	if isinstance(f, str):
		return f'{x:{f}}'
	if isinstance(f, Callable):
		return f(x)
	if isinstance(f, dict):
		if k in f:
			return f2s(x, f[k])
		if isinstance(fb, str):
			return f'{x:{fb}}'
		if isinstance(fb, Callable):
			return fb(x)
	raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')


def data_string(
	data: dict,
	sep: str = ',',
	include_fields: list = None,
	exclude_fields: list = [],
	float_format: (str | dict | Callable) = 'z.6g',
	correl_format: (str | dict | Callable) = 'z.6f',
	default_float_format: (str | Callable) = 'z.6g',
	default_correl_format: (str | Callable) = 'z.6f',
	align: str = '>',
	atol: float = 1e-12,
	rtol: float = 1e-12,
):
	'''
	Generate CSV-like string from correlated data

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `sep`: the CSV separator
	- `include_fields`: subset of fields to write; if `None`, write all fields
	- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
	  to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
	- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
	  returning the formatted string (ex: `lambda x: f'{x:.2f}' if x else '0'`), or a dictionary
	  of strings and/or callables, with dict keys corresponding to different fields
	  (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
	- `correl_format`: same as `float_format`, but applies to correlation matrix elements
	- `default_float_format`: only used when `float_format` is a dict; in that case, fields
	  missing from `float_format.keys()` will use `default_float_format` instead.
	- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
	- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix


	**Example**

	```py
	from correldata import _uc
	from correldata import _np
	from correldata import *

	X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
	Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

	data = dict(X=X, Y=Y, Z=X+Y)

	print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

	# yields:
	# 
	#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
	# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
	# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
	# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
	```
	'''
	if include_fields is None:
		include_fields = [_ for _ in data]
	cols, ufields = [], []
	for f in include_fields:
		if f in exclude_fields:
			continue
		if isinstance(data[f], uarray):
			ufields.append(f)
			N = data[f].size
			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
			if f'SE_{f}' not in exclude_fields:
				cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
			if f'correl_{f}' not in exclude_fields:
				CM = _uc.correlation_matrix(data[f])
				if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
					for i in range(N):
						cols.append(
							['' if i else f'correl_{f}']
							+ [
								f2s(
									CM[i,j],
									correl_format,
									f,
									default_correl_format,
								)
								for j in range(N)
							]
						)

		else:
			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])

	for i in range(len(ufields)):
		for j in range(i):
			if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
				continue
			CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
			if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
				for k in range(N):
					cols.append(
						['' if k else f'correl_{ufields[j]}_{ufields[i]}']
						+ [
							f2s(
								CM[k,l],
								correl_format,
								f,
								default_correl_format,
							)
							for l in range(N)
						]
					)

	lines = list(map(list, zip(*cols)))

	if align:
		lengths = [max([len(e) for e in l]) for l in cols]
		for l in lines:
			for k,ln in enumerate(lengths):
				l[k] = f'{l[k]:{align}{ln}s}'
		return '\n'.join([(sep+' ').join(l) for l in lines])

	return '\n'.join([sep.join(l) for l in lines])


def save_data_to_file(data, filename, **kwargs):
	'''
	Write correlated data to a CSV file.

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `filename`: `str` or path to the file to write to
	- `kwargs`: passed to correldata.data_string()
	'''
	with open(filename, 'w') as fid:
		return fid.write(data_string(data, **kwargs))

class uarray(numpy.ndarray):

1-D ndarray of ufloat values

n

Return the array of nominal values (read-only).

s

Return the array of standard errors (read-only)

correl

Return the correlation matrix of the array elements (read-only)

covar

Return the covariance matrix of the array elements (read-only)

nv

Alias for uarray.n

se

Alias for uarray.s

cor

Alias for uarray.correl

cov

Alias for uarray.covar
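A short illustrative sketch: a uarray can be built directly from the uncertainties package (this is how correldata.read_data builds them internally); the read-only properties and their aliases then give back the pieces.

import uncertainties as uc
from correldata import uarray

u = uarray(uc.correlated_values([1.0, 2.0], [[0.04, 0.01], [0.01, 0.09]]))

print(u.n)       # nominal values -> [1. 2.]
print(u.s)       # standard errors -> [0.2 0.3]
print(u.correl)  # 2x2 correlation matrix
print(u.covar)   # 2x2 covariance matrix
# nv, se, cor and cov are aliases for n, s, correl and covar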

def is_symmetric_positive_semidefinite(M: numpy.ndarray) -> bool:

Test whether 2-D array M is symmetric and positive semidefinite.
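For instance (a small illustrative check):

import numpy as np
from correldata import is_symmetric_positive_semidefinite

is_symmetric_positive_semidefinite(np.array([[1.0, 0.5], [0.5, 1.0]]))  # True
is_symmetric_positive_semidefinite(np.array([[1.0, 2.0], [2.0, 1.0]]))  # False (one eigenvalue is negative)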

def smart_type(x: str):

Try to convert string x to an integer if it looks like one (no decimal point, integer value), or to a float otherwise. If the conversion fails, return the original string unchanged.
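For instance:

from correldata import smart_type

smart_type('90')     # -> 90 (int: no decimal point, integer value)
smart_type('90.')    # -> 90.0 (float: decimal point present)
smart_type('4e-4')   # -> 0.0004 (float: not integer-valued)
smart_type('FOO')    # -> 'FOO' (not a number, returned unchanged)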

def read_data(data: str, sep: str = ',', validate_covar: bool = True):

Read correlated data from a CSV-like string.

Column names are interpreted in the following way:

  • In most cases, each column is converted to a dict value, with the corresponding dict key being the column's label.
  • Columns whose label starts with SE are interpreted as specifying the standard error for the latest preceding data column.
  • Columns whose label starts with correl are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • Columns whose label starts with covar are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • SE, correl, and covar may be specified for any variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: SE_foo, correl_bar, covar_baz).
  • correl and covar may also be specified for any pair of variables, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: correl_foo_bar, covar_X_Y). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.
  • Exceptions will be raised for any given variable:
    • when specifying both covar and any combination of (SE, correl)
    • when specifying correl without SE

Arguments

  • data: a CSV-like string
  • sep: the CSV separator
  • validate_covar: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying validate_covar = False bypasses this computationally expensive step.

Example

import correldata
data  = """
Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
"""[1:-1]
print(correldata.read_data(data))

# yields:
# 
# > {
#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
#     'Tacid': array([90., 90., 90.]),
#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
#   }
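Because the returned D47 and D48 values are correlated objects, downstream arithmetic propagates the full covariance rather than just the standard errors. A follow-up sketch, reusing the data string from the example above:

d = correldata.read_data(data)
diff = correldata.uarray(d['D47'] - d['D48'])
print(diff.n)  # nominal D47 - D48 differences
print(diff.s)  # propagated standard errors, accounting for the correl_D47_D48 block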
def read_data_from_file(filename: str | os.PathLike, **kwargs):

Read correlated data from a CSV file.

Arguments

  • filename: str or path to the file to read from
  • kwargs: passed to read_data()
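For instance (the file name below is hypothetical; sep and validate_covar are simply forwarded to read_data):

import correldata

data = correldata.read_data_from_file('measurements.csv', sep = ',', validate_covar = False)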
def f2s( x: Any, f: Union[str, Callable, dict], k: Hashable = None, fb: Union[str, Callable] = 'z.6g') -> str:

Format x according to format f.

  • If f is a string, return f'{x:{f}}'
  • If f is a callable, return f(x)
  • If f is a dict and k is one of its keys, return f2s(x, f[k]); otherwise format x using the fallback format fb
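A few illustrative calls (note that the 'z' option in the default fb format requires Python 3.11 or later):

from correldata import f2s

f2s(0.12345, '.3f')                        # -> '0.123'
f2s(0.12345, lambda x: f'{100*x:.1f} %')   # -> '12.3 %'
f2s(0.12345, {'foo': '.2e'}, k = 'foo')    # -> '1.23e-01'
f2s(0.12345, {'foo': '.2e'}, k = 'bar')    # 'bar' not in the dict: falls back to fb ('z.6g' by default)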
def data_string( data: dict, sep: str = ',', include_fields: list = None, exclude_fields: list = [], float_format: Union[str, dict, Callable] = 'z.6g', correl_format: Union[str, dict, Callable] = 'z.6f', default_float_format: Union[str, Callable] = 'z.6g', default_correl_format: Union[str, Callable] = 'z.6f', align: str = '>', atol: float = 1e-12, rtol: float = 1e-12):

Generate CSV-like string from correlated data

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • sep: the CSV separator
  • include_fields: subset of fields to write; if None, write all fields
  • exclude_fields: subset of fields to ignore (takes precedence over include_fields); to exclude only the SE for field foo, include SE_foo; same goes for correl_foo
  • float_format: formatting for float values. May be a string (ex: 'z.3f'), a callable returning the formatted string (ex: lambda x: f'{x:.2f}' if x else '0'), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex: {'foo': '.2e', 'bar': (lambda x: str(x))}).
  • correl_format: same as float_format, but applies to correlation matrix elements
  • default_float_format: only used when float_format is a dict; in that case, fields missing from float_format.keys() will use default_float_format instead.
  • default_correl_format: same as default_float_format, but applies to correl_format
  • align: right-align (>), left-align (<), or don't align (empty string) CSV values
  • atol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
  • rtol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix

Example

from correldata import _uc
from correldata import _np
from correldata import *

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

data = dict(X=X, Y=Y, Z=X+Y)

print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

# yields:
# 
#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
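A follow-up sketch using the same data dict, with per-field formats and excluded columns (the field names X, Y, Z are those defined above):

print(data_string(
    data,
    float_format = {'X': 'z.2f', 'Y': 'z.3f'},  # Z is not listed, so it falls back to default_float_format
    exclude_fields = ['SE_Y', 'correl_X_Z'],    # omit the SE_Y column and the X/Z correlation block
))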
def save_data_to_file(data, filename, **kwargs):

Write correlated data to a CSV file.

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • filename: str or path to the file to write to
  • kwargs: passed to data_string()
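A round-trip sketch, assuming data is a dict of arrays like the ones built in the examples above (the file name out.csv is arbitrary). Note that values are written with the chosen float_format, so reading them back reproduces the data only to that precision.

import correldata

correldata.save_data_to_file(data, 'out.csv', float_format = 'z.6g')
data2 = correldata.read_data_from_file('out.csv')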