correldata

Read/write vectors of correlated data from/to a CSV file.

These data are stored in a dictionary whose values are numpy arrays, with elements that may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.
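
For orientation, here is a minimal round-trip sketch with made-up values (see read_data() and data_string() below for the full column-naming rules):

```py
import correldata

csv = """
Sample,   X, SE_X
   FOO, 1.0,  0.1
   BAR, 2.0,  0.2
"""[1:-1]

data = correldata.read_data(csv)     # 'Sample' stays a string array; 'X' becomes a uarray of ufloats
print(data['X'].n, data['X'].s)      # nominal values and standard errors
print(correldata.data_string(data))  # back to a CSV-like string
```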

  1"""
  2Read/write vectors of correlated data from/to a csv file.
  3
  4These data are stored in a dictionary, whose values are numpy arrays
  5with elements which may be strings, floats, or floats with associated uncertainties
  6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library.
  7"""
  8
  9
 10__author__    = 'Mathieu Daëron'
 11__contact__   = 'mathieu@daeron.fr'
 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron'
 13__license__   = 'MIT License - https://opensource.org/licenses/MIT'
 14__date__      = '2024-10-13'
 15__version__   = '1.2.2'
 16
 17
 18import os as _os
 19import numpy as _np
 20import uncertainties as _uc
 21
 22from typing import Callable, Hashable, Any
 23
class uarray(_np.ndarray):

    __doc__ = """
    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
    of [ufloat](https://pypi.org/project/uncertainties) values
    """

    def __new__(cls, a):
        obj = _np.asarray(a).view(cls)
        return obj

    n = property(fget = _np.vectorize(lambda x : x.n))
    """Return the array of nominal values (read-only)."""

    s = property(fget = _np.vectorize(lambda x : x.s))
    """Return the array of standard errors (read-only)"""

    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
    """Return the correlation matrix of the array elements (read-only)"""

    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
    """Return the covariance matrix of the array elements (read-only)"""

    nv = n
    "Alias for `uarray.n`"

    se = s
    "Alias for `uarray.s`"

    cor = correl
    "Alias for `uarray.correl`"

    cov = covar
    "Alias for `uarray.covar`"


def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
	'''
	Test whether 2-D array `M` is symmetric and positive semidefinite.
	'''
	ev = _np.linalg.eigvals(M)
	return (
		_np.allclose(M, M.T) # M is symmetric
		and _np.all(
			(ev > 0) | _np.isclose(ev, 0)
		) # all eigenvalues are either real and strictly positive or close to zero
	)


def smart_type(x: str):
	'''
	Convert string `x` to an int if it parses as a whole number written without
	a decimal point, to a float if it parses as any other number, and return
	the original string unchanged if it does not parse as a number.
	'''
	try:
		y = float(x)
	except ValueError:
		return x
	if y % 1 == 0 and '.' not in x:
		return int(y)
	return y


def read_data(data: str, sep: str = ',', validate_covar: bool = True):
	'''
	Read correlated data from a CSV-like string.
	
	Column names are interpreted in the following way:
	* In most cases, each column is converted to a dict value, with the corresponding
	dict key being the column's label.
	* Columns whose label starts with `SE` are interpreted as specifying the standard
	error for the latest preceding data column.
	* Columns whose label starts with `correl` are interpreted as specifying the
	correlation matrix for the latest preceding data column. In that case, column labels
	are ignored for the rest of the columns belonging to this matrix.
	* Columns whose label starts with `covar` are interpreted as specifying the
	covariance matrix for the latest preceding data column. In that case, column labels
	are ignored for the rest of the columns belonging to this matrix.
	* `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
	the latest preceding data column, by adding an underscore followed by the variable's
	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
	* `correl` and `covar` may also be specified for any pair of variables, by adding an
	underscore followed by the two variable labels, joined by a second underscore
	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
	correspond, respectively, to the lines and columns of this matrix.
	* Exceptions will be raised, for any given variable:
		- when specifying both `covar` and any combination of (`SE`, `correl`)
		- when specifying `correl` without `SE`

	**Arguments**
	- `data`: a CSV-like string
	- `sep`: the CSV separator
	- `validate_covar`: whether to check that the overall covariance matrix
	is symmetric and positive semidefinite. Specifying `validate_covar = False`
	bypasses this computationally expensive step.
	
	**Example**
	```py
	import correldata
	data  = """
	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
	"""[1:-1]
	print(correldata.read_data(data))
	
	# yields:
	# 
	# > {
	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
	#     'Tacid': array([90., 90., 90.]),
	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
	#   }
	```
	'''

	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
	N = len(data) - 1

	values, se, correl, covar = {}, {}, {}, {}
	j = 0
	while j < len(data[0]):
		field = data[0][j]
		if not (
			field.startswith('SE_')
			or field.startswith('correl_')
			or field.startswith('covar_')
			or field == 'SE'
			or field == 'correl'
			or field == 'covar'
			or len(field) == 0
		):
			values[field] = _np.array([l[j] for l in data[1:]])
			j += 1
			oldfield = field
		elif field.startswith('SE_'):
			se[field[3:]] = _np.array([l[j] for l in data[1:]])
			j += 1
		elif field == 'SE':
			se[oldfield] = _np.array([l[j] for l in data[1:]])
			j += 1
		elif field.startswith('correl_'):
			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field == 'correl':
			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field.startswith('covar_'):
			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field == 'covar':
			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
			j += N

	nakedvalues = {}
	for k in [_ for _ in values]:
		if (
			k not in se
			and k not in correl
			and k not in covar
		):
			nakedvalues[k] = values.pop(k)

	for x in values:
		if x in covar:
			if x in se:
				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
			if x in correl:
				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
		if x in correl:
			if x not in se:
				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')

	for x in correl:
		if x in values:
			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
		else:
			for x1 in values:
				for x2 in values:
					if x == f'{x1}_{x2}':
						if x1 in se:
							se1 = se[x1]
						else:
							if x1 in covar:
								se1 = _np.diag(covar[x1])**0.5
							else:
								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
						if x2 in se:
							se2 = se[x2]
						else:
							if x2 in covar:
								se2 = _np.diag(covar[x2])**0.5
							else:
								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x2}".')

						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)

	for x in se:
		if x in values and x not in correl:
			covar[x] = _np.diag(se[x]**2)

	for k in [_ for _ in covar]:
		if k not in values:
			for j1 in values:
				for j2 in values:
					if k == f'{j1}_{j2}':
						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T

	X = _np.array([_ for k in values for _ in values[k]])
	CM = _np.zeros((X.size, X.size))
	for i, vi in enumerate(values):
		for j, vj in enumerate(values):
			if vi == vj:
				if vi in covar:
					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
			else:
				if f'{vi}_{vj}' in covar:
					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']

	if validate_covar and not is_symmetric_positive_semidefinite(CM):
		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')

	corvalues = uarray(_uc.correlated_values(X, CM))

	allvalues = nakedvalues

	for i, x in enumerate(values):
		allvalues[x] = corvalues[i*N:i*N+N]

	return allvalues


def read_data_from_file(filename: str | _os.PathLike, **kwargs):
	'''
	Read correlated data from a CSV file.

	**Arguments**
	- `filename`: `str` or path to the file to read from
	- `kwargs`: passed to correldata.read_data()
	'''
	with open(filename) as fid:
		return read_data(fid.read(), **kwargs)


def f2s(
	x: Any,
	f: (str | Callable | dict),
	k: Hashable = None,
	fb: (str | Callable) = 'z.6g',
) -> str:
	'''
	Format `x` according to format `f`
	
	* If `x` is a string, return it unchanged
	* If `f` is a string, return `f'{x:{f}}'`
	* If `f` is a callable, return `f(x)`
	* If `f` is a dict and `k` is one of its keys, return `f2s(x, f[k])`;
	  otherwise fall back to the format `fb`
	'''

	if isinstance (x, str):
		return x
	if isinstance (f, str):
		return f'{x:{f}}'
	if isinstance (f, Callable):
		return f(x)
	if isinstance (f, dict):
		if k in f:
			return f2s(x, f[k])
		if isinstance (fb, str):
			return f'{x:{fb}}'
		if isinstance (fb, Callable):
			return fb(x)
	raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')


def data_string(
	data: dict,
	sep: str = ',',
	include_fields: list = None,
	exclude_fields: list = [],
	float_format: (str | dict | Callable) = 'z.6g',
	correl_format: (str | dict | Callable) = 'z.6f',
	default_float_format: (str | Callable) = 'z.6g',
	default_correl_format: (str | Callable) = 'z.6f',
	align: str = '>',
	atol: float = 1e-12,
	rtol: float = 1e-12,
):
	'''
	Generate CSV-like string from correlated data

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `sep`: the CSV separator
	- `include_fields`: subset of fields to write; if `None`, write all fields
	- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
	  to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
	- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
	  (ex: `lambda x: f'{x:.2f}' if x else '0'`), or a dictionary of strings and/or callables, with dict keys
	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
	- `correl_format`: same as `float_format`, but applies to correlation matrix elements
	- `default_float_format`: only used when `float_format` is a dict; in that case, fields
	  missing from `float_format.keys()` will use `default_float_format` instead.
	- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
	- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
	
	
	**Example**
	
	```py
	from correldata import _uc
	from correldata import _np
	from correldata import *
	
	X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
	Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
	
	data = dict(X=X, Y=Y, Z=X+Y)
	
	print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
	
	# yields:
	# 
	#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
	# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
	# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
	# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
	```
	'''
	if include_fields is None:
		include_fields = [_ for _ in data]
	cols, ufields = [], []
	for f in include_fields:
		if f in exclude_fields:
			continue
		if isinstance(data[f], uarray):
			ufields.append(f)
			N = data[f].size
			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
			if f'SE_{f}' not in exclude_fields:
				cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
			if f'correl_{f}' not in exclude_fields:
				CM = _uc.correlation_matrix(data[f])
				if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
					for i in range(N):
						cols.append(
							['' if i else f'correl_{f}']
							+ [
								f2s(
									CM[i,j],
									correl_format,
									f,
									default_correl_format,
								)
								for j in range(N)
							]
						)

		else:
			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])

	for i in range(len(ufields)):
		for j in range(i):
			if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
				continue
			CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
			if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
				for k in range(N):
					cols.append(
						['' if k else f'correl_{ufields[j]}_{ufields[i]}']
						+ [
							f2s(
								CM[k,l],
								correl_format,
								f,
								default_correl_format,
							)
							for l in range(N)
						]
					)

	lines = list(map(list, zip(*cols)))

	if align:
		lengths = [max([len(e) for e in l]) for l in cols]
		for l in lines:
			for k,ln in enumerate(lengths):
				l[k] = f'{l[k]:{align}{ln}s}'
		return '\n'.join([(sep+' ').join(l) for l in lines])

	return '\n'.join([sep.join(l) for l in lines])


def save_data_to_file(data, filename, **kwargs):
	'''
	Write correlated data to a CSV file.

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `filename`: `str` or path to the file to write to
	- `kwargs`: passed to correldata.data_string()
	'''
	with open(filename, 'w') as fid:
		return fid.write(data_string(data, **kwargs))
class uarray(numpy.ndarray):

1-D ndarray of ufloat values

n

Return the array of nominal values (read-only).

s

Return the array of standard errors (read-only)

correl

Return the correlation matrix of the array elements (read-only)

covar

Return the covariance matrix of the array elements (read-only)

nv

Alias for uarray.n

se

Alias for uarray.s

cor

Alias for uarray.correl

cov

Alias for uarray.covar
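
As an illustration, here is a minimal sketch of building a uarray from values produced by the uncertainties package and reading its properties (the values and covariance matrix below are made up):

```py
import numpy as np
import uncertainties

from correldata import uarray

# three correlated values with a made-up diagonal covariance matrix
x = uarray(uncertainties.correlated_values([1.0, 2.0, 3.0], np.eye(3) * 0.04))

print(x.n)       # nominal values: [1. 2. 3.]
print(x.s)       # standard errors: [0.2 0.2 0.2]
print(x.correl)  # 3x3 correlation matrix (identity here, since the covariance is diagonal)
print(x.covar)   # 3x3 covariance matrix (0.04 on the diagonal)
```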

def is_symmetric_positive_semidefinite(M: numpy.ndarray) -> bool:

Test whether 2-D array M is symmetric and positive semidefinite.
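
For instance, a quick sketch with two made-up symmetric matrices, one positive semidefinite and one not:

```py
import numpy as np

from correldata import is_symmetric_positive_semidefinite

C = np.array([[0.04, 0.01], [0.01, 0.04]])  # eigenvalues 0.05 and 0.03: positive semidefinite
D = np.array([[0.04, 0.10], [0.10, 0.04]])  # eigenvalues 0.14 and -0.06: indefinite

print(is_symmetric_positive_semidefinite(C))  # True
print(is_symmetric_positive_semidefinite(D))  # False
```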

def smart_type(x: str):

Convert string x to an int if it parses as a whole number written without a decimal point, to a float if it parses as any other number, and return the original string unchanged if it does not parse as a number.
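
A few illustrative conversions (made-up inputs):

```py
from correldata import smart_type

print(smart_type('90'))    # 90 (int: whole number written without a decimal point)
print(smart_type('.245'))  # 0.245 (float)
print(smart_type('90.'))   # 90.0 (float: decimal point present)
print(smart_type('FOO'))   # 'FOO' (not a number: returned unchanged)
```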

def read_data(data: str, sep: str = ',', validate_covar: bool = True):

Read correlated data from a CSV-like string.

Column names are interpreted in the following way:

  • In most cases, each column is converted to a dict value, with the corresponding dict key being the column's label.
  • Columns whose label starts with SE are interpreted as specifying the standard error for the latest preceding data column.
  • Columns whose label starts with correl are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • Columns whose label starts with covar are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • SE, correl, and covar may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: SE_foo, correl_bar, covar_baz).
  • correl and covar may also be specified for any pair of variables, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: correl_foo_bar, covar_X_Y). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.
  • Exceptions will be raised, for any given variable:
    • when specifying both covar and any combination of (SE, correl)
    • when specifying correl without SE

Arguments

  • data: a CSV-like string
  • sep: the CSV separator
  • validate_covar: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying validate_covar = False bypasses this computationally expensive step.

Example

import correldata
data  = """
Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
"""[1:-1]
print(correldata.read_data(data))

# yields:
# 
# > {
#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
#     'Tacid': array([90., 90., 90.]),
#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
#   }
def read_data_from_file(filename: str | os.PathLike, **kwargs):

Read correlated data from a CSV file.

Arguments

  • filename: str or path to the file to read from
  • kwargs: passed to read_data()
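
A minimal usage sketch; the file name samples.csv and the D47 column are hypothetical, assuming a file laid out as in the read_data() example above:

```py
from correldata import read_data_from_file

data = read_data_from_file('samples.csv')  # hypothetical CSV file
print(data['D47'].n, data['D47'].s)        # nominal values and standard errors of a hypothetical column
```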
def f2s( x: Any, f: Union[str, Callable, dict], k: Hashable = None, fb: Union[str, Callable] = 'z.6g') -> str:

Format x according to format f

  • If x is a string, return it unchanged
  • If f is a string, return f'{x:{f}}'
  • If f is a callable, return f(x)
  • If f is a dict and k is one of its keys, return f2s(x, f[k]); otherwise fall back to the format fb
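
A few illustrative calls (the values, dict keys, and formats are made up):

```py
from correldata import f2s

print(f2s(0.1234567, '.3f'))                         # '0.123'   (string format spec)
print(f2s(0.1234567, lambda x: f'{x:.1e}'))          # '1.2e-01' (callable)
print(f2s(0.1234567, {'foo': '.2f'}, 'foo'))         # '0.12'    (dict, key found)
print(f2s(0.1234567, {'foo': '.2f'}, 'bar', '.4f'))  # '0.1235'  (dict, key missing: falls back to fb)
```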
def data_string( data: dict, sep: str = ',', include_fields: list = None, exclude_fields: list = [], float_format: Union[str, dict, Callable] = 'z.6g', correl_format: Union[str, dict, Callable] = 'z.6f', default_float_format: Union[str, Callable] = 'z.6g', default_correl_format: Union[str, Callable] = 'z.6f', align: str = '>', atol: float = 1e-12, rtol: float = 1e-12):

Generate CSV-like string from correlated data

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • sep: the CSV separator
  • include_fields: subset of fields to write; if None, write all fields
  • exclude_fields: subset of fields to ignore (takes precedence over include_fields); to exclude only the SE for field foo, include SE_foo; same goes for correl_foo
  • float_format: formatting for float values. May be a string (ex: 'z.3f'), a callable (ex: lambda x: f'{x:.2f}' if x else '0'), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex: {'foo': '.2e', 'bar': (lambda x: str(x))}).
  • correl_format: same as float_format, but applies to correlation matrix elements
  • default_float_format: only used when float_format is a dict; in that case, fields missing from float_format.keys() will use default_float_format instead.
  • default_correl_format: same as default_float_format, but applies to correl_format
  • align: right-align (>), left-align (<), or don't align (empty string) CSV values
  • atol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
  • rtol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix

Example

from correldata import _uc
from correldata import _np
from correldata import *

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

data = dict(X=X, Y=Y, Z=X+Y)

print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

# yields:
# 
#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
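
As a further sketch of exclude_fields (continuing the example above; output not shown here), individual uncertainty columns can be dropped by name:

```py
# drop X's SE column and the X/Z correlation block from the output
print(data_string(data, exclude_fields = ['SE_X', 'correl_X_Z'], float_format = 'z.1f', correl_format = 'z.1f'))
```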
def save_data_to_file(data, filename, **kwargs):

Write correlated data to a CSV file.

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • filename: str or path to the file to write to
  • kwargs: passed to data_string()
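
A minimal sketch, reusing the data dict from the data_string() example above; the file name output.csv is hypothetical:

```py
from correldata import save_data_to_file

# keyword arguments are forwarded to data_string()
save_data_to_file(data, 'output.csv', float_format = 'z.1f', correl_format = 'z.1f')
```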