correldata

Read/write vectors of correlated data from/to a csv file.

These data are stored in a dictionary, whose values are numpy arrays with elements which may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.

  1"""
  2Read/write vectors of correlated data from/to a csv file.
  3
  4These data are stored in a dictionary, whose values are numpy arrays
  5with elements which may be strings, floats, or floats with associated uncertainties
  6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library.
  7"""
  8
  9
 10__author__    = 'Mathieu Daëron'
 11__contact__   = 'mathieu@daeron.fr'
 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron'
 13__license__   = 'MIT License - https://opensource.org/licenses/MIT'
 14__date__      = '2024-10-17'
 15__version__   = '1.4.0'
 16
 17
 18import os as _os
 19import numpy as _np
 20import uncertainties as _uc
 21
 22from typing import Callable, Hashable, Any
 23
 24class uarray(_np.ndarray):
 25
 26	__doc__ = """
 27	1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
 28	of [ufloat](https://pypi.org/project/uncertainties) values
 29	"""
 30
 31	def __new__(cls, a):
 32		obj = _np.asarray(a).view(cls)
 33		return obj
 34	
 35	@property
 36	def nv(self):
 37		"""Return the array of nominal values (read-only)."""
 38		return _uc.unumpy.nominal_values(_np.array(self))
 39
 40	@property
 41	def se(self):
 42		"""Return the array of standard errors (read-only)"""
 43		return _uc.unumpy.std_devs(_np.array(self))
 44
 45	@property
 46	def correl(self):
 47		"""Return the correlation matrix of the array elements (read-only)"""
 48		return _np.array(_uc.correlation_matrix(self))
 49
 50	@property
 51	def covar(self):
 52		"""Return the covariance matrix of the array elements (read-only)"""
 53		return _np.array(_uc.covariance_matrix(self))
 54	
 55	@property
 56	def mahalanobis(self):
 57		"""Return the squared Mahalanobis distance from zero of the array (read-only)"""
 58		flatself = self.n.flatten().reshape((1, self.size))
 59		return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0]
 60	
 61	n = nv
 62	"Alias for `uarray.nv`"
 63	
 64	s = se
 65	"Alias for `uarray.se`"
 66	
 67	cor = correl
 68	"Alias for `uarray.correl`"
 69	
 70	cov = covar
 71	"Alias for `uarray.covar`"
 72	
 73	m = mahalanobis
 74	"Alias for `uarray.mahalanobis`"
 75
 76
 77def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
 78	'''
 79	Test whether 2-D array `M` is symmetric and positive semidefinite.
 80	'''
 81	ev = _np.linalg.eigvals(M)
 82	return (
 83		_np.allclose(M, M.T) # M is symmetric
 84		and _np.all(
 85			(ev > 0) | _np.isclose(ev, 0)
 86		) # all eignevalues are either real and strictly positive or close to zero
 87	)
 88
 89
 90def smart_type(s: str) -> (int | float | str):
 91	'''
 92	Tries to convert string `s` to an `int`, or to an `float` if that fails.
 93	If both fail, return the original string unchanged.
 94	'''
 95	try: return int(s)
 96	except: pass
 97	try: return float(s)
 98	except: pass
 99	return s
100
101
def read_data(data: str, sep: str = ',', validate_covar: bool = True):
	'''
	Read correlated data from a CSV-like string.
	
	Column names are interpreted in the following way:
	* In most cases, each column is converted to a dict value, with the corresponding
	dict key being the column's label.
	* Columns whose label starts with `SE` are interpreted as specifying the standard
	error for the latest preceding data column.
	* Columns whose label starts with `correl` are interpreted as specifying the
	correlation matrix for the latest preceding data column. In that case, column labels
	are ignored for the rest of the columns belonging to this matrix.
	* Columns whose label starts with `covar` are interpreted as specifying the
	covariance matrix for the latest preceding data column. In that case, column labels
	are ignored for the rest of the columns belonging to this matrix.
	* `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
	the latest preceding data column, by adding an underscore followed by the variable's
	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
	* `correl`, and `covar` may also be specified for any pair of variables, by adding an
	underscore followed by the two variable labels, joined by a second underscore
	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
	correspond, respectively, to the lines and columns of this matrix.
	* Exceptions will be raised, for any given variable:
		- when specifying both `covar` and any combination of (`SE`, `correl`)
		- when specifying `correl` without `SE`

	**Arguments**
	- `data`: a CSV-like string
	- `sep`: the CSV separator
	- `validate_covar`: whether to check that the overall covariance matrix
	is symmetric and positive semidefinite. Specifying `validate_covar = False`
	bypasses this computationally expensive step.
	
	**Example**
	```py
	import correldata
	data  = """
	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
	"""[1:-1]
	print(correldata.read_data(data))
	
	# yields:
	# 
	# > {
	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
	#     'Tacid': array([90., 90., 90.]),
	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
	#   }
	```
	'''

	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
	N = len(data) - 1  # number of data rows (the first line holds the column labels)

	# Parse columns into plain values, standard errors, correlation matrices,
	# and covariance matrices, keyed by variable label.
	values, se, correl, covar = {}, {}, {}, {}
	j = 0
	while j < len(data[0]):
		field = data[0][j]
		if not (
			field.startswith('SE_')
			or field.startswith('correl_')
			or field.startswith('covar_')
			or field == 'SE'
			or field == 'correl'
			or field == 'covar'
			or len(field) == 0
		):
			values[field] = _np.array([l[j] for l in data[1:]])
			j += 1
			oldfield = field  # remember the latest preceding data column
		elif field.startswith('SE_'):
			se[field[3:]] = _np.array([l[j] for l in data[1:]])
			j += 1
		elif field == 'SE':
			se[oldfield] = _np.array([l[j] for l in data[1:]])
			j += 1
		elif field.startswith('correl_'):
			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field == 'correl':
			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field.startswith('covar_'):
			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
			j += N
		elif field == 'covar':
			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
			j += N

	# Variables without any uncertainty information are returned as plain arrays.
	nakedvalues = {}
	for k in [_ for _ in values]:
		if (
			k not in se
			and k not in correl
			and k not in covar
		):
			nakedvalues[k] = values.pop(k)

	# Check that the uncertainty specifications are neither redundant nor incomplete.
	for x in values:
		if x in covar:
			if x in se:
				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
			if x in correl:
				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
		if x in correl:
			if x not in se:
				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')

	# Convert correlation matrices into covariance matrices.
	for x in correl:
		if x in values:
			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
		else:
			# Cross-correlation between two variables (ex: "correl_foo_bar"):
			# reconstruct each variable's SE, either directly from `se`
			# or from the diagonal of its covariance matrix.
			for x1 in values:
				for x2 in values:
					if x == f'{x1}_{x2}':
						if x1 in se:
							se1 = se[x1]
						else:
							if x1 in covar:
								se1 = _np.diag(covar[x1])**0.5
							else:
								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
						if x2 in se:
							se2 = se[x2]
						else:
							if x2 in covar:
								se2 = _np.diag(covar[x2])**0.5
							else:
								# fixed: report x2 (not x1) as the variable missing its SE
								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x2}".')

						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)

	# Variables with only a SE get a diagonal covariance matrix.
	for x in se:
		if x in values and x not in correl:
			covar[x] = _np.diag(se[x]**2)

	# Cross-covariance blocks apply symmetrically: covar_B_A is the transpose of covar_A_B.
	for k in [_ for _ in covar]:
		if k not in values:
			for j1 in values:
				for j2 in values:
					if k == f'{j1}_{j2}':
						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T

	# Assemble the complete covariance matrix, one N-by-N block per pair of variables.
	X = _np.array([_ for k in values for _ in values[k]])
	CM = _np.zeros((X.size, X.size))
	for i, vi in enumerate(values):
		for j, vj in enumerate(values):
			if vi == vj:
				if vi in covar:
					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
			else:
				if f'{vi}_{vj}' in covar:
					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']

	# Rescale CM to a correlation matrix for the fallback validity test below,
	# guarding against zero variances.
	s = _np.diag(CM)**.5
	s[s==0] = 1.
	invs = _np.diag(s**-1)

	if (
		validate_covar
		and not (
			is_symmetric_positive_semidefinite(CM)
			or is_symmetric_positive_semidefinite(invs @ CM @ invs)
		)
	):
		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')

	corvalues = uarray(_uc.correlated_values(X, CM))

	allvalues = nakedvalues

	for i, x in enumerate(values):
		allvalues[x] = corvalues[i*N:i*N+N]

	return allvalues
281
282
def read_data_from_file(filename: str | _os.PathLike, **kwargs):
	'''
	Read correlated data from a CSV file.

	**Arguments**
	- `filename`: `str` or path to the file to read from
	- `kwargs`: passed to correldata.read_data()
	'''
	with open(filename) as source:
		contents = source.read()
	return read_data(contents, **kwargs)
293
294
295def f2s(
296	x: Any,
297	f: (str | Callable | dict),
298	k: Hashable = None,
299	fb: (str | Callable) = 'z.6g',
300) -> str:
301	'''
302	Format `x` according to format `f`
303	
304	* If `f` is a string, return `f'{x:{f}}'`
305	* If `f` is a callable, return `f(x)`
306	* If `f` is a dict and optional argument `k` is a hashable,
307	  return f2s(x, f[k]), otherwise return f2s(x, fb)
308	'''
309	if isinstance (x, str):
310		return x
311	if isinstance (f, str):
312		return f'{x:{f}}'
313	if isinstance (f, Callable):
314		return f(x)
315	if isinstance (f, dict):
316		if k in f:
317			return f2s(x, f[k])
318		if isinstance (fb, str):
319			return f'{x:{fb}}'
320		if isinstance (fb, Callable):
321			return fb(x)
322	raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')
323	
324
325
def data_string(
	data: dict,
	sep: str = ',',
	include_fields: list = None,
	exclude_fields: (list | tuple) = (),
	float_format: (str | dict | Callable) = 'z.6g',
	correl_format: (str | dict | Callable) = 'z.6f',
	default_float_format: (str | Callable) = 'z.6g',
	default_correl_format: (str | Callable) = 'z.6f',
	show_nv: bool = True,
	show_se: bool = True,
	show_correl: bool = True,
	show_mixed_correl: bool = True,
	align: str = '>',
	atol: float = 1e-12,
	rtol: float = 1e-12,
):
	'''
	Generate CSV-like string from correlated data

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `sep`: the CSV separator
	- `include_fields`: subset of fields to write; if `None`, write all fields
	- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
	  to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
	- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
	  (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys
	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
	- `correl_format`: same as `float_format`, but applies to correlation matrix elements
	- `default_float_format`: only used when `float_format` is a dict; in that case, fields
	  missing from `float_format.keys()` will use `default_float_format` instead.
	- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
	- `show_nv`: show nominal values
	- `show_se`: show standard errors
	- `show_correl`: show correlations for any given field (ex: `correl_X`)
	- `show_mixed_correl`:  show correlations between different fields (ex: `correl_X_Y`)
	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
	- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
	
	
	**Example**
	
	```py
	from correldata import _uc
	from correldata import _np
	from correldata import *
	
	X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
	Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
	
	data = dict(X=X, Y=Y, Z=X+Y)
	
	print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
	
	# yields:
	# 
	#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
	# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
	# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
	# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
	```
	'''
	if include_fields is None:
		include_fields = [_ for _ in data]
	cols, ufields = [], []
	for f in include_fields:
		if f in exclude_fields:
			continue
		if isinstance(data[f], uarray):
			ufields.append(f)
			# NOTE(review): N is also reused by the mixed-correlation loop below,
			# which assumes all uarray fields have the same length — confirm.
			N = data[f].size
			if show_nv:
				cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
			if show_se and (f'SE_{f}' not in exclude_fields):
				cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
			if show_correl and (f'correl_{f}' not in exclude_fields):
				CM = _uc.correlation_matrix(data[f])
				# only write the correlation matrix when it differs from the identity
				if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
					for i in range(N):
						cols.append(
							['' if i else f'correl_{f}']
							+ [
								f2s(
									CM[i,j],
									correl_format,
									f,
									default_correl_format,
								)
								for j in range(N)
							]
						)
		elif show_nv:
			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])

	if show_mixed_correl:
		for i in range(len(ufields)):
			for j in range(i):
				if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
					continue
				CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
				# only write the cross-correlation block when it is not all zeros
				if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
					for k in range(N):
						cols.append(
							['' if k else f'correl_{ufields[j]}_{ufields[i]}']
							+ [
								f2s(
									CM[k,l],
									correl_format,
									f,  # NOTE(review): `f` is the last field of the loop above; presumably the intended dict key is the field pair — confirm.
									default_correl_format,
								)
								for l in range(N)
							]
						)

	# transpose the list of columns into a list of CSV lines
	lines = list(map(list, zip(*cols)))

	if align:
		# pad every cell to its column's width
		lengths = [max([len(e) for e in l]) for l in cols]
		for l in lines:
			for k,ln in enumerate(lengths):
				l[k] = f'{l[k]:{align}{ln}s}'
		return '\n'.join([(sep+' ').join(l) for l in lines])

	return '\n'.join([sep.join(l) for l in lines])
456
457
458
def save_data_to_file(data, filename, **kwargs):
	'''
	Write correlated data to a CSV file.

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `filename`: `str` or path to the file to write to
	- `kwargs`: passed to correldata.data_string()
	'''
	contents = data_string(data, **kwargs)
	with open(filename, 'w') as output:
		return output.write(contents)
class uarray(numpy.ndarray):
25class uarray(_np.ndarray):
26
27	__doc__ = """
28	1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
29	of [ufloat](https://pypi.org/project/uncertainties) values
30	"""
31
32	def __new__(cls, a):
33		obj = _np.asarray(a).view(cls)
34		return obj
35	
36	@property
37	def nv(self):
38		"""Return the array of nominal values (read-only)."""
39		return _uc.unumpy.nominal_values(_np.array(self))
40
41	@property
42	def se(self):
43		"""Return the array of standard errors (read-only)"""
44		return _uc.unumpy.std_devs(_np.array(self))
45
46	@property
47	def correl(self):
48		"""Return the correlation matrix of the array elements (read-only)"""
49		return _np.array(_uc.correlation_matrix(self))
50
51	@property
52	def covar(self):
53		"""Return the covariance matrix of the array elements (read-only)"""
54		return _np.array(_uc.covariance_matrix(self))
55	
56	@property
57	def mahalanobis(self):
58		"""Return the squared Mahalanobis distance from zero of the array (read-only)"""
59		flatself = self.n.flatten().reshape((1, self.size))
60		return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0]
61	
62	n = nv
63	"Alias for `uarray.nv`"
64	
65	s = se
66	"Alias for `uarray.se`"
67	
68	cor = correl
69	"Alias for `uarray.correl`"
70	
71	cov = covar
72	"Alias for `uarray.covar`"
73	
74	m = mahalanobis
75	"Alias for `uarray.mahalanobis`"

1-D ndarray of ufloat values

nv
36	@property
37	def nv(self):
38		"""Return the array of nominal values (read-only)."""
39		return _uc.unumpy.nominal_values(_np.array(self))

Return the array of nominal values (read-only).

se
41	@property
42	def se(self):
43		"""Return the array of standard errors (read-only)"""
44		return _uc.unumpy.std_devs(_np.array(self))

Return the array of standard errors (read-only)

correl
46	@property
47	def correl(self):
48		"""Return the correlation matrix of the array elements (read-only)"""
49		return _np.array(_uc.correlation_matrix(self))

Return the correlation matrix of the array elements (read-only)

covar
51	@property
52	def covar(self):
53		"""Return the covariance matrix of the array elements (read-only)"""
54		return _np.array(_uc.covariance_matrix(self))

Return the covariance matrix of the array elements (read-only)

mahalanobis
56	@property
57	def mahalanobis(self):
58		"""Return the squared Mahalanobis distance from zero of the array (read-only)"""
59		flatself = self.n.flatten().reshape((1, self.size))
60		return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0]

Return the squared Mahalanobis distance from zero of the array (read-only)

n
36	@property
37	def nv(self):
38		"""Return the array of nominal values (read-only)."""
39		return _uc.unumpy.nominal_values(_np.array(self))

Alias for uarray.nv

s
41	@property
42	def se(self):
43		"""Return the array of standard errors (read-only)"""
44		return _uc.unumpy.std_devs(_np.array(self))

Alias for uarray.se

cor
46	@property
47	def correl(self):
48		"""Return the correlation matrix of the array elements (read-only)"""
49		return _np.array(_uc.correlation_matrix(self))

Alias for uarray.correl

cov
51	@property
52	def covar(self):
53		"""Return the covariance matrix of the array elements (read-only)"""
54		return _np.array(_uc.covariance_matrix(self))

Alias for uarray.covar

m
56	@property
57	def mahalanobis(self):
58		"""Return the squared Mahalanobis distance from zero of the array (read-only)"""
59		flatself = self.n.flatten().reshape((1, self.size))
60		return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0]
Inherited Members
numpy.ndarray
dumps
dump
all
any
argmax
argmin
argpartition
argsort
astype
byteswap
choose
clip
compress
conj
conjugate
copy
cumprod
cumsum
diagonal
dot
fill
flatten
getfield
item
max
mean
min
nonzero
partition
prod
put
ravel
repeat
reshape
resize
round
searchsorted
setfield
setflags
sort
squeeze
std
sum
swapaxes
take
tobytes
tofile
tolist
tostring
trace
transpose
var
view
to_device
ndim
flags
shape
strides
data
itemsize
size
nbytes
base
dtype
real
imag
flat
ctypes
T
mT
ptp
newbyteorder
itemset
device
def is_symmetric_positive_semidefinite(M: numpy.ndarray) -> bool:
78def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
79	'''
80	Test whether 2-D array `M` is symmetric and positive semidefinite.
81	'''
82	ev = _np.linalg.eigvals(M)
83	return (
84		_np.allclose(M, M.T) # M is symmetric
85		and _np.all(
86			(ev > 0) | _np.isclose(ev, 0)
87		) # all eigenvalues are either real and strictly positive or close to zero
88	)

Test whether 2-D array M is symmetric and positive semidefinite.

def smart_type(s: str) -> int | float | str:
 91def smart_type(s: str) -> (int | float | str):
 92	'''
 93	Tries to convert string `s` to an `int`, or to a `float` if that fails.
 94	If both fail, return the original string unchanged.
 95	'''
 96	try: return int(s)
 97	except: pass
 98	try: return float(s)
 99	except: pass
100	return s

Tries to convert string s to an int, or to a float if that fails. If both fail, return the original string unchanged.

def read_data(data: str, sep: str = ',', validate_covar: bool = True):
103def read_data(data: str, sep: str = ',', validate_covar: bool = True):
104	'''
105	Read correlated data from a CSV-like string.
106	
107	Column names are interpreted in the following way:
108	* In most cases, each column is converted to a dict value, with the corresponding
109	dict key being the column's label.
110	* Columns whose label starts with `SE` are interpreted as specifying the standard
111	error for the latest preceding data column.
112	* Columns whose label starts with `correl` are interpreted as specifying the
113	correlation matrix for the latest preceding data column. In that case, column labels
114	are ignored for the rest of the columns belonging to this matrix.
115	* Columns whose label starts with `covar` are interpreted as specifying the
116	covariance matrix for the latest preceding data column. In that case, column labels
117	are ignored for the rest of the columns belonging to this matrix.
118	* `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
119	the latest preceding data column, by adding an underscore followed by the variable's
120	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
121	* `correl`, and `covar` may also be specified for any pair of variables, by adding an
122	underscore followed by the two variable labels, joined by a second underscore
123	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
124	correspond, respectively, to the lines and columns of this matrix.
125	* Exceptions will be raised, for any given variable:
126		- when specifying both `covar` and any combination of (`SE`, `correl`)
127		- when specifying `correl` without `SE`
128
129	**Arguments**
130	- `data`: a CSV-like string
131	- `sep`: the CSV separator
132	- `validate_covar`: whether to check that the overall covariance matrix
133	is symmetric and positive semidefinite. Specifying `validate_covar = False`
134	bypasses this computationally expensive step.
135	
136	**Example**
137	```py
138	import correldata
139	data  = """
140	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
141	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
142	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
143	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
144	"""[1:-1]
145	print(correldata.read_data(data))
146	
147	# yields:
148	# 
149	# > {
150	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
151	#     'Tacid': array([90., 90., 90.]),
152	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
153	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
154	#   }
155	```
156	'''
157
158	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
159	N = len(data) - 1
160
161	values, se, correl, covar = {}, {}, {}, {}
162	j = 0
163	while j < len(data[0]):
164		field = data[0][j]
165		if not (
166			field.startswith('SE_')
167			or field.startswith('correl_')
168			or field.startswith('covar_')
169			or field == 'SE'
170			or field == 'correl'
171			or field == 'covar'
172			or len(field) == 0
173		):
174			values[field] = _np.array([l[j] for l in data[1:]])
175			j += 1
176			oldfield = field
177		elif field.startswith('SE_'):
178			se[field[3:]] = _np.array([l[j] for l in data[1:]])
179			j += 1
180		elif field == 'SE':
181			se[oldfield] = _np.array([l[j] for l in data[1:]])
182			j += 1
183		elif field.startswith('correl_'):
184			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
185			j += N
186		elif field == 'correl':
187			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
188			j += N
189		elif field.startswith('covar_'):
190			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
191			j += N
192		elif field == 'covar':
193			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
194			j += N
195
196	nakedvalues = {}
197	for k in [_ for _ in values]:
198		if (
199			k not in se
200			and k not in correl
201			and k not in covar
202		):
203			nakedvalues[k] = values.pop(k)
204
205	for x in values:
206		if x in covar:
207			if x in se:
208				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
209			if x in correl:
210				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
211		if x in correl:
212			if x not in se:
213				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')
214
215	for x in correl:
216		if x in values:
217			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
218		else:
219			for x1 in values:
220				for x2 in values:
221					if x == f'{x1}_{x2}':
222						if x1 in se:
223							se1 = se[x1]
224						else:
225							if x1 in covar:
226								se1 = _np.diag(covar[x1])**0.5
227							else:
228								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
229						if x2 in se:
230							se2 = se[x2]
231						else:
232							if x2 in covar:
233								se2 = _np.diag(covar[x2])**0.5
234							else:
235								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
236
237						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)
238
239	for x in se:
240		if x in values and x not in correl:
241			covar[x] = _np.diag(se[x]**2)
242
243	for k in [_ for _ in covar]:
244		if k not in values:
245			for j1 in values:
246				for j2 in values:
247					if k == f'{j1}_{j2}':
248						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T
249
250	X = _np.array([_ for k in values for _ in values[k]])
251	CM = _np.zeros((X.size, X.size))
252	for i, vi in enumerate(values):
253		for j, vj in enumerate(values):
254			if vi == vj:
255				if vi in covar:
256					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
257			else:
258				if f'{vi}_{vj}' in covar:
259					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']
260
261	s = _np.diag(CM)**.5
262	s[s==0] = 1.
263	invs = _np.diag(s**-1)
264
265	if (
266		validate_covar
267		and not (
268			is_symmetric_positive_semidefinite(CM)
269			or is_symmetric_positive_semidefinite(invs @ CM @ invs)
270		)
271	):
272		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')
273
274	corvalues = uarray(_uc.correlated_values(X, CM))
275
276	allvalues = nakedvalues
277
278	for i, x in enumerate(values):
279		allvalues[x] = corvalues[i*N:i*N+N]
280
281	return allvalues

Read correlated data from a CSV-like string.

Column names are interpreted in the following way:

  • In most cases, each column is converted to a dict value, with the corresponding dict key being the column's label.
  • Columns whose label starts with SE are interpreted as specifying the standard error for the latest preceding data column.
  • Columns whose label starts with correl are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • Columns whose label starts with covar are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • SE, correl, and covar may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: SE_foo, correl_bar, covar_baz).
  • correl, and covar may also be specified for any pair of variables, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: correl_foo_bar, covar_X_Y). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.
  • Exceptions will be raised, for any given variable:
    • when specifying both covar and any combination of (SE, correl)
    • when specifying correl without SE

Arguments

  • data: a CSV-like string
  • sep: the CSV separator
  • validate_covar: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying validate_covar = False bypasses this computationally expensive step.

Example

import correldata
data  = """
Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
"""[1:-1]
print(read_data(data))

# yields:
# 
# > {
#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
#     'Tacid': array([90., 90., 90.]),
#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
#   }
def read_data_from_file(filename: str | os.PathLike, **kwargs):
284def read_data_from_file(filename: str | _os.PathLike, **kwargs):
285	'''
286	Read correlated data from a CSV file.
287
288	**Arguments**
289	- `filename`: `str` or path to the file to read from
290	- `kwargs`: passed to correldata.read_data()
291	'''
292	with open(filename) as fid:
293		return read_data(fid.read(), **kwargs)

Read correlated data from a CSV file.

Arguments

  • filename: str or path to the file to read from
  • kwargs: passed to read_data()
def f2s( x: Any, f: Union[str, Callable, dict], k: Hashable = None, fb: Union[str, Callable] = 'z.6g') -> str:
296def f2s(
297	x: Any,
298	f: (str | Callable | dict),
299	k: Hashable = None,
300	fb: (str | Callable) = 'z.6g',
301) -> str:
302	'''
303	Format `x` according to format `f`
304	
305	* If `f` is a string, return `f'{x:{f}}'`
306	* If `f` is a callable, return `f(x)`
307	* If `f` is a dict and optional argument `k` is a hashable,
308	  return f2s(x, f[k]), otherwise return f2s(x, fb)
309	'''
310	if isinstance (x, str):
311		return x
312	if isinstance (f, str):
313		return f'{x:{f}}'
314	if isinstance (f, Callable):
315		return f(x)
316	if isinstance (f, dict):
317		if k in f:
318			return f2s(x, f[k])
319		if isinstance (fb, str):
320			return f'{x:{fb}}'
321		if isinstance (fb, Callable):
322			return fb(x)
323	raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')

Format x according to format f

  • If f is a string, return f'{x:{f}}'
  • If f is a callable, return f(x)
  • If f is a dict and optional argument k is a hashable, return f2s(x, f[k]), otherwise return f2s(x, fb)
def data_string(
	data: dict,
	sep: str = ',',
	include_fields: list = None,
	exclude_fields: list = [],  # NOTE(review): mutable default — safe here because it is only membership-tested, never mutated
	float_format: (str | dict | Callable) = 'z.6g',
	correl_format: (str | dict | Callable) = 'z.6f',
	default_float_format: (str | Callable) = 'z.6g',
	default_correl_format: (str | Callable) = 'z.6f',
	show_nv: bool = True,
	show_se: bool = True,
	show_correl: bool = True,
	show_mixed_correl: bool = True,
	align: str = '>',
	atol: float = 1e-12,
	rtol: float = 1e-12,
):
	'''
	Generate CSV-like string from correlated data

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `sep`: the CSV separator
	- `include_fields`: subset of fields to write; if `None`, write all fields
	- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
	  to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
	- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
	  (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys
	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
	- `correl_format`: same as `float_format`, but applies to correlation matrix elements
	- `default_float_format`: only used when `float_format` is a dict; in that case, fields
	  missing from `float_format.keys()` will use `default_float_format` instead.
	- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
	- `show_nv`: show nominal values
	- `show_se`: show standard errors
	- `show_correl`: show correlations for any given field (ex: `correl_X`)
	- `show_mixed_correl`:  show correlations between different fields (ex: `correl_X_Y`)
	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
	- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
	
	
	**Example**
	
	```py
	from correldata import _uc
	from correldata import _np
	from correldata import *
	
	X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
	Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
	
	data = dict(X=X, Y=Y, Z=X+Y)
	
	print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
	
	# yields:
	# 
	#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
	# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
	# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
	# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
	```
	'''
	if include_fields is None:
		include_fields = [_ for _ in data]
	# cols collects one CSV column at a time (header first, then values);
	# ufields remembers which fields hold correlated (uarray) data.
	cols, ufields = [], []
	for f in include_fields:
		if f in exclude_fields:
			continue
		if isinstance(data[f], uarray):
			ufields.append(f)
			N = data[f].size
			if show_nv:
				# NOTE(review): relies on uarray `.n` / `.s` accessors, which are not
				# visible in this excerpt (only `.nv` / `.se` are) — confirm they exist.
				cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
			if show_se and (f'SE_{f}' not in exclude_fields):
				cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
			if show_correl and (f'correl_{f}' not in exclude_fields):
				CM = _uc.correlation_matrix(data[f])
				# Skip the correlation block entirely when it is (numerically) the identity.
				if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
					for i in range(N):
						cols.append(
							# Only the first column of the matrix carries the header label.
							['' if i else f'correl_{f}']
							+ [
								f2s(
									CM[i,j],
									correl_format,
									f,
									default_correl_format,
								)
								for j in range(N)
							]
						)
		elif show_nv:
				# Plain (non-uarray) columns: nominal values only, no SE/correl.
				cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])

	if show_mixed_correl:
		# Cross-field correlation blocks for every unordered pair of uarray fields.
		for i in range(len(ufields)):
			for j in range(i):
				if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
					continue
				# NOTE(review): `N` (and, in the f2s calls below, `f`) leak in from the
				# loop above, i.e. they refer to the *last* uarray field processed.
				# This assumes all uarray fields have the same size — confirm.
				CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
				# Skip the block when the two fields are (numerically) uncorrelated.
				if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
					for k in range(N):
						cols.append(
							['' if k else f'correl_{ufields[j]}_{ufields[i]}']
							+ [
								f2s(
									CM[k,l],
									correl_format,
									f,
									default_correl_format,
								)
								for l in range(N)
							]
						)

	# Transpose the list of columns into a list of CSV rows.
	lines = list(map(list, zip(*cols)))

	if align:
		# Pad every cell to its column width, then join with "sep + space".
		lengths = [max([len(e) for e in l]) for l in cols]
		for l in lines:
			for k,ln in enumerate(lengths):
				l[k] = f'{l[k]:{align}{ln}s}'
		return '\n'.join([(sep+' ').join(l) for l in lines])

	return '\n'.join([sep.join(l) for l in lines])

Generate CSV-like string from correlated data

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • sep: the CSV separator
  • include_fields: subset of fields to write; if None, write all fields
  • exclude_fields: subset of fields to ignore (takes precedence over include_fields); to exclude only the SE for field foo, include SE_foo; same goes for correl_foo
  • float_format: formatting for float values. May be a string (ex: 'z.3f'), a callable (ex: lambda x: '.2f' if x else '0'), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex: {'foo': '.2e', 'bar': (lambda x: str(x))}).
  • correl_format: same as float_format, but applies to correlation matrix elements
  • default_float_format: only used when float_format is a dict; in that case, fields missing from float_format.keys() will use default_float_format instead.
  • default_correl_format: same as default_float_format, but applies to correl_format
  • show_nv: show nominal values
  • show_se: show standard errors
  • show_correl: show correlations for any given field (ex: correl_X)
  • show_mixed_correl: show correlations between different fields (ex: correl_X_Y)
  • align: right-align (>), left-align (<), or don't align (empty string) CSV values
  • atol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
  • rtol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix

Example

from correldata import _uc
from correldata import _np
from correldata import *

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

data = dict(X=X, Y=Y, Z=X+Y)

print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

# yields:
# 
#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
def save_data_to_file(data, filename, **kwargs):
	'''
	Write correlated data to a CSV file.

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `filename`: `str` or path to the file to write to
	- `kwargs`: passed to correldata.data_string()
	'''
	# Build the CSV text first, then write it out in one call.
	txt = data_string(data, **kwargs)
	with open(filename, 'w') as out:
		return out.write(txt)

Write correlated data to a CSV file.

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • filename: str or path to the file to write to
  • kwargs: passed to data_string()