correldata

Read/write vectors of correlated data from/to a csv file.

These data are stored in a dictionary, whose values are numpy arrays with elements which may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.
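For instance (a minimal sketch, not taken from the package itself), such a dictionary might combine plain arrays and correlated values as follows:

import numpy as np
import uncertainties
import correldata

data = dict(
    Sample = np.array(['FOO', 'BAR']),          # strings
    Tacid  = np.array([90., 90.]),              # plain floats
    D47    = correldata.uarray(                 # floats with correlated uncertainties
        uncertainties.correlated_values([0.245, 0.246], [[25e-6, 10e-6], [10e-6, 25e-6]])
    ),
)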

  1"""
  2Read/write vectors of correlated data from/to a csv file.
  3
  4These data are stored in a dictionary, whose values are numpy arrays
  5with elements which may be strings, floats, or floats with associated uncertainties
  6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library.
  7"""
  8
  9
 10__author__    = 'Mathieu Daëron'
 11__contact__   = 'mathieu@daeron.fr'
 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron'
 13__license__   = 'MIT License - https://opensource.org/licenses/MIT'
 14__date__      = '2024-10-15'
 15__version__   = '1.3.0'
 16
 17
 18import os as _os
 19import numpy as _np
 20import uncertainties as _uc
 21
 22from typing import Callable, Hashable, Any
 23
 24class uarray(_np.ndarray):
 25
 26	__doc__ = """
 27	1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
 28	of [ufloat](https://pypi.org/project/uncertainties) values
 29	"""
 30
 31	def __new__(cls, a):
 32		obj = _np.asarray(a).view(cls)
 33		return obj
 34	
 35	@property
 36	def nv(self):
 37		"""Return the array of nominal values (read-only)."""
 38		return _uc.unumpy.nominal_values(_np.array(self))
 39
 40	@property
 41	def se(self):
 42		"""Return the array of standard errors (read-only)"""
 43		return _uc.unumpy.std_devs(_np.array(self))
 44
 45	@property
 46	def correl(self):
 47		"""Return the correlation matrix of the array elements (read-only)"""
 48		return _np.array(_uc.correlation_matrix(self))
 49
 50	@property
 51	def covar(self):
 52		"""Return the covariance matrix of the array elements (read-only)"""
 53		return _np.array(_uc.covariance_matrix(self))
 54	
 55	@property
 56	def mahalanobis(self):
 57		"""Return the squared Mahalanobis distance from zero of the array (read-only)"""
 58		flatself = self.n.flatten().reshape((1, self.size))
 59		return (flatself @ _np.linalg.inv(self.covar) @ flatself.T)[0,0]
 60	
 61	n = nv
 62	"Alias for `uarray.nv`"
 63	
 64	s = se
 65	"Alias for `uarray.se`"
 66	
 67	cor = correl
 68	"Alias for `uarray.correl`"
 69	
 70	cov = covar
 71	"Alias for `uarray.covar`"
 72	
 73	m = mahalanobis
 74	"Alias for `uarray.mahalanobis`"
 75
 76
 77def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
 78	'''
 79	Test whether 2-D array `M` is symmetric and positive semidefinite.
 80	'''
 81	ev = _np.linalg.eigvals(M)
 82	return (
 83		_np.allclose(M, M.T) # M is symmetric
 84		and _np.all(
 85			(ev > 0) | _np.isclose(ev, 0)
 86		) # all eigenvalues are either real and strictly positive or close to zero
 87	)
 88
 89
 90def smart_type(s: str) -> (int | float | str):
 91	'''
 92	Try to convert string `s` to an `int`, or to a `float` if that fails.
 93	If both fail, return the original string unchanged.
 94	'''
 95	try: return int(s)
 96	except: pass
 97	try: return float(s)
 98	except: pass
 99	return s
100
101
102def read_data(data: str, sep: str = ',', validate_covar: bool = True):
103	'''
104	Read correlated data from a CSV-like string.
105	
106	Column names are interpreted in the following way:
107	* In most cases, each column is converted to a dict value, with the corresponding
108	dict key being the column's label.
109	* Columns whose label starts with `SE` are interpreted as specifying the standard
110	error for the latest preceding data column.
111	* Columns whose label starts with `correl` are interpreted as specifying the
112	correlation matrix for the latest preceding data column. In that case, column labels
113	are ignored for the rest of the columns belonging to this matrix.
114	* Columns whose label starts with `covar` are interpreted as specifying the
115	covariance matrix for the latest preceding data column. In that case, column labels
116	are ignored for the rest of the columns belonging to this matrix.
117	* `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
118	the latest preceding data column, by adding an underscore followed by the variable's
119	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
120	* `correl`, and `covar` may also be specified for any pair of variable, by adding an
121	underscore followed by the two variable labels, joined by a second underscore
122	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
123	correspond, respectively, to the lines and columns of this matrix.
124	* Exceptions will be raised, for any given variable:
125		- when specifying both `covar` and any combination of (`SE`, `correl`)
126		- when specifying `correl` without `SE`
127
128	**Arguments**
129	- `data`: a CSV-like string
130	- `sep`: the CSV separator
131	- `validate_covar`: whether to check that the overall covariance matrix
132	is symmetric and positive semidefinite. Specifying `validate_covar = False`
133	bypasses this computationally expensive step.
134	
135	**Example**
136	```py
137	import correldata
138	data  = """
139	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
140	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
141	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
142	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
143	"""[1:-1]
144	print(correldata.read_data(data))
145	
146	# yields:
147	# 
148	# > {
149	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
150	#     'Tacid': array([90., 90., 90.]),
151	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
152	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
153	#   }
154	```
155	'''
156
157	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
158	N = len(data) - 1
159
160	values, se, correl, covar = {}, {}, {}, {}
161	j = 0
162	while j < len(data[0]):
163		field = data[0][j]
164		if not (
165			field.startswith('SE_')
166			or field.startswith('correl_')
167			or field.startswith('covar_')
168			or field == 'SE'
169			or field == 'correl'
170			or field == 'covar'
171			or len(field) == 0
172		):
173			values[field] = _np.array([l[j] for l in data[1:]])
174			j += 1
175			oldfield = field
176		elif field.startswith('SE_'):
177			se[field[3:]] = _np.array([l[j] for l in data[1:]])
178			j += 1
179		elif field == 'SE':
180			se[oldfield] = _np.array([l[j] for l in data[1:]])
181			j += 1
182		elif field.startswith('correl_'):
183			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
184			j += N
185		elif field == 'correl':
186			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
187			j += N
188		elif field.startswith('covar_'):
189			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
190			j += N
191		elif field == 'covar':
192			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
193			j += N
194
195	nakedvalues = {}
196	for k in [_ for _ in values]:
197		if (
198			k not in se
199			and k not in correl
200			and k not in covar
201		):
202			nakedvalues[k] = values.pop(k)
203
204	for x in values:
205		if x in covar:
206			if x in se:
207				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
208			if x in correl:
209				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
210		if x in correl:
211			if x not in se:
212				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')
213
214	for x in correl:
215		if x in values:
216			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
217		else:
218			for x1 in values:
219				for x2 in values:
220					if x == f'{x1}_{x2}':
221						if x1 in se:
222							se1 = se[x1]
223						else:
224							if x1 in covar:
225								se1 = _np.diag(covar[x1])**0.5
226							else:
227								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
228						if x2 in se:
229							se2 = se[x2]
230						else:
231							if x2 in covar:
232								se2 = _np.diag(covar[x2])**0.5
233							else:
234								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x2}".')
235
236						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)
237
238	for x in se:
239		if x in values and x not in correl:
240			covar[x] = _np.diag(se[x]**2)
241
242	for k in [_ for _ in covar]:
243		if k not in values:
244			for j1 in values:
245				for j2 in values:
246					if k == f'{j1}_{j2}':
247						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T
248
249	X = _np.array([_ for k in values for _ in values[k]])
250	CM = _np.zeros((X.size, X.size))
251	for i, vi in enumerate(values):
252		for j, vj in enumerate(values):
253			if vi == vj:
254				if vi in covar:
255					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
256			else:
257				if f'{vi}_{vj}' in covar:
258					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']
259
260	s = _np.diag(CM)**.5
261	s[s==0] = 1.
262	invs = _np.diag(s**-1)
263
264	if (
265		validate_covar
266		and not (
267			is_symmetric_positive_semidefinite(CM)
268			or is_symmetric_positive_semidefinite(invs @ CM @ invs)
269		)
270	):
271		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')
272
273	corvalues = uarray(_uc.correlated_values(X, CM))
274
275	allvalues = nakedvalues
276
277	for i, x in enumerate(values):
278		allvalues[x] = corvalues[i*N:i*N+N]
279
280	return allvalues
281
282
283def read_data_from_file(filename: str | _os.PathLike, **kwargs):
284	'''
285	Read correlated data from a CSV file.
286
287	**Arguments**
288	- `filename`: `str` or path to the file to read from
289	- `kwargs`: passed to correldata.read_data()
290	'''
291	with open(filename) as fid:
292		return read_data(fid.read(), **kwargs)
293
294
295def f2s(
296	x: Any,
297	f: (str | Callable | dict),
298	k: Hashable = None,
299	fb: (str | Callable) = 'z.6g',
300) -> str:
301	'''
302	Format `x` according to format `f`
303	
304	* If `f` is a string, return `f'{x:{f}}'`
305	* If `f` is a callable, return `f(x)`
306	* If `f` is a dict and `k` is one of its keys, return `f2s(x, f[k])`;
307	  otherwise format `x` according to the fallback format `fb`
308	'''
309
310	if isinstance (x, str):
311		return x
312	if isinstance (f, str):
313		return f'{x:{f}}'
314	if isinstance (f, Callable):
315		return f(x)
316	if isinstance (f, dict):
317		if k in f:
318			return f2s(x, f[k])
319		if isinstance (fb, str):
320			return f'{x:{fb}}'
321		if isinstance (fb, Callable):
322			return fb(x)
323	raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')
324	
325
326
327def data_string(
328	data: dict,
329	sep: str = ',',
330	include_fields: list = None,
331	exclude_fields: list = [],
332	float_format: (str | dict | Callable) = 'z.6g',
333	correl_format: (str | dict | Callable) = 'z.6f',
334	default_float_format: (str | Callable) = 'z.6g',
335	default_correl_format: (str | Callable) = 'z.6f',
336	align: str = '>',
337	atol: float = 1e-12,
338	rtol: float = 1e-12,
339):
340	'''
341	Generate CSV-like string from correlated data
342
343	**Arguments**
344	- `data`: dict of arrays with strings, floats or correlated data
345	- `sep`: the CSV separator
346	- `include_fields`: subset of fields to write; if `None`, write all fields
347	- `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`);
348	  to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo`
349	- `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable
350	  (ex: `lambda x: f'{x:.2f}' if x else '0'`), or a dictionary of strings and/or callables, with dict keys
351	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`).
352	- `correl_format`: same as `float_format`, but applies to correlation matrix elements
353	- `default_float_format`: only used when `float_format` is a dict; in that case, fields
354	  missing from `float_format.keys()` will use `default_float_format` instead.
355	  corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`).
356	- `default_correl_format`: same as `default_float_format`, but applies to `correl_format`
357	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
358	- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
359	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
360	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
361	  when deciding whether a matrix is equal to the identity matrix or to the zero matrix
362	
363	
364	**Example**
365	
366	```py
367	from correldata import _uc
368	from correldata import _np
369	from correldata import *
370	
371	X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
372	Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
373	
374	data = dict(X=X, Y=Y, Z=X+Y)
375	
376	print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
377	
378	# yields:
379	# 
380	#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
381	# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
382	# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
383	# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
384	```
385	'''
386	if include_fields is None:
387		include_fields = [_ for _ in data]
388	cols, ufields = [], []
389	for f in include_fields:
390		if f in exclude_fields:
391			continue
392		if isinstance(data[f], uarray):
393			ufields.append(f)
394			N = data[f].size
395			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n])
396			if f'SE_{f}' not in exclude_fields:
397				cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s])
398			if f'correl_{f}' not in exclude_fields:
399				CM = _uc.correlation_matrix(data[f])
400				if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
401					for i in range(N):
402						cols.append(
403							['' if i else f'correl_{f}']
404							+ [
405								f2s(
406									CM[i,j],
407									correl_format,
408									f,
409									default_correl_format,
410								)
411								for j in range(N)
412							]
413						)
414
415		else:
416			cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]])
417
418	for i in range(len(ufields)):
419		for j in range(i):
420			if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields:
421				continue
422			CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:]
423			if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
424				for k in range(N):
425					cols.append(
426						['' if k else f'correl_{ufields[j]}_{ufields[i]}']
427						+ [
428							f2s(
429								CM[k,l],
430								correl_format,
431								f,
432								default_correl_format,
433							)
434							for l in range(N)
435						]
436					)
437
438	lines = list(map(list, zip(*cols)))
439
440	if align:
441		lengths = [max([len(e) for e in l]) for l in cols]
442		for l in lines:
443			for k,ln in enumerate(lengths):
444				l[k] = f'{l[k]:{align}{ln}s}'
445		return '\n'.join([(sep+' ').join(l) for l in lines])
446
447	return '\n'.join([sep.join(l) for l in lines])
448
449
450
451def save_data_to_file(data, filename, **kwargs):
452	'''
453	Write correlated data to a CSV file.
454
455	**Arguments**
456	- `data`: dict of arrays with strings, floats or correlated data
457	- `filename`: `str` or path to the file to write to
458	- `kwargs`: passed to correldata.data_string()
459	'''
460	with open(filename, 'w') as fid:
461		return fid.write(data_string(data, **kwargs))
class uarray(numpy.ndarray):

1-D ndarray of ufloat values
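A minimal sketch (assuming the uncertainties package is installed) of building a uarray and reading its properties:

import uncertainties as _uc
from correldata import uarray

X = uarray(_uc.correlated_values([1., 2.], [[0.04, 0.01], [0.01, 0.09]]))

print(X.nv)      # nominal values: [1. 2.]
print(X.se)      # standard errors: [0.2 0.3]
print(X.correl)  # 2x2 correlation matrix
print(X.covar)   # 2x2 covariance matrix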

nv

Return the array of nominal values (read-only).

se

Return the array of standard errors (read-only)

correl

Return the correlation matrix of the array elements (read-only)

covar

Return the covariance matrix of the array elements (read-only)

mahalanobis

Return the squared Mahalanobis distance from zero of the array (read-only)
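For independent elements this reduces to the sum of squared standardized values (a small sketch, not from the original docs):

import numpy as _np
import uncertainties as _uc
from correldata import uarray

X = uarray(_uc.correlated_values([1., 2.], _np.diag([0.25, 0.25])))
print(X.mahalanobis)                      # (1/0.5)**2 + (2/0.5)**2 = 20
print(X.n @ _np.linalg.inv(X.cov) @ X.n)  # same value, computed by hand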

n

Alias for uarray.nv

s

Alias for uarray.se

cor

Alias for uarray.correl

cov

Alias for uarray.covar

m

Alias for uarray.mahalanobis
def is_symmetric_positive_semidefinite(M: numpy.ndarray) -> bool:

Test whether 2-D array M is symmetric and positive semidefinite.
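A quick sketch of what this test accepts and rejects:

import numpy as _np
from correldata import is_symmetric_positive_semidefinite

print(is_symmetric_positive_semidefinite(_np.array([[4., 1.], [1., 4.]])))  # True (eigenvalues 5 and 3)
print(is_symmetric_positive_semidefinite(_np.array([[1., 1.], [1., 1.]])))  # True (eigenvalues 2 and 0)
print(is_symmetric_positive_semidefinite(_np.array([[1., 2.], [2., 1.]])))  # False (one eigenvalue is -1)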

def smart_type(s: str) -> int | float | str:

Try to convert string s to an int, or to a float if that fails. If both fail, return the original string unchanged.
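For example:

from correldata import smart_type

print(repr(smart_type('42')))     # 42 (int)
print(repr(smart_type('0.245')))  # 0.245 (float)
print(repr(smart_type('FOO')))    # 'FOO' (str)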

def read_data(data: str, sep: str = ',', validate_covar: bool = True):

Read correlated data from a CSV-like string.

Column names are interpreted in the following way:

  • In most cases, each column is converted to a dict value, with the corresponding dict key being the column's label.
  • Columns whose label starts with SE are interpreted as specifying the standard error for the latest preceding data column.
  • Columns whose label starts with correl are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • Columns whose label starts with covar are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • SE, correl, and covar may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: SE_foo, correl_bar, covar_baz).
  • correl and covar may also be specified for any pair of variables, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: correl_foo_bar, covar_X_Y). The elements of the first and second variables correspond, respectively, to the rows and columns of this matrix.
  • Exceptions will be raised for any given variable:
    • when specifying both covar and any combination of (SE, correl)
    • when specifying correl without SE

Arguments

  • data: a CSV-like string
  • sep: the CSV separator
  • validate_covar: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying validate_covar = False bypasses this computationally expensive step.

Example

import correldata
data  = """
Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
"""[1:-1]
print(read_data(data))

# yields:
# 
# > {
#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
#     'Tacid': array([90., 90., 90.]),
#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
#   }
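As a complement to the example above (a hedged sketch, not part of the original documentation), uncertainties may also be attached to columns other than the latest preceding one, using the SE_<label> form:

import correldata

data = """
Sample,   X,   Y, SE_X, SE_Y
   FOO, 1.0, 4.0,  0.1,  0.2
   BAR, 2.0, 5.0,  0.1,  0.2
"""[1:-1]

d = correldata.read_data(data)
print(d['X'].se)  # [0.1 0.1]
print(d['Y'].se)  # [0.2 0.2]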
def read_data_from_file(filename: str | os.PathLike, **kwargs):

Read correlated data from a CSV file.

Arguments

  • filename: str or path to the file to read from
  • kwargs: passed to read_data()
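A minimal usage sketch (the file name is hypothetical):

import correldata

# 'rawdata.csv' is a hypothetical file following the column conventions of read_data()
data = correldata.read_data_from_file('rawdata.csv')
print(list(data))  # column labels found in the file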
def f2s( x: Any, f: Union[str, Callable, dict], k: Hashable = None, fb: Union[str, Callable] = 'z.6g') -> str:

Format x according to format f

  • If f is a string, return f'{x:{f}}'
  • If f is a callable, return f(x)
  • If f is a dict and k is one of its keys, return f2s(x, f[k]); otherwise format x according to the fallback format fb
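For instance (a short sketch, not from the original documentation):

from correldata import f2s

print(f2s(0.5, '.3f'))                      # '0.500'
print(f2s(0.5, lambda x: f'<{x}>'))         # '<0.5>'
print(f2s(0.5, {'foo': '.1e'}, k = 'foo'))  # '5.0e-01'
print(f2s(0.5, {'foo': '.1e'}, k = 'bar'))  # key not found, falls back to fb = 'z.6g': '0.5'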
def data_string( data: dict, sep: str = ',', include_fields: list = None, exclude_fields: list = [], float_format: Union[str, dict, Callable] = 'z.6g', correl_format: Union[str, dict, Callable] = 'z.6f', default_float_format: Union[str, Callable] = 'z.6g', default_correl_format: Union[str, Callable] = 'z.6f', align: str = '>', atol: float = 1e-12, rtol: float = 1e-12):

Generate CSV-like string from correlated data

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • sep: the CSV separator
  • include_fields: subset of fields to write; if None, write all fields
  • exclude_fields: subset of fields to ignore (takes precedence over include_fields); to exclude only the SE for field foo, include SE_foo; same goes for correl_foo
  • float_format: formatting for float values. May be a string (ex: 'z.3f'), a callable (ex: lambda x: f'{x:.2f}' if x else '0'), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex: {'foo': '.2e', 'bar': (lambda x: str(x))}).
  • correl_format: same as float_format, but applies to correlation matrix elements
  • default_float_format: only used when float_format is a dict; in that case, fields missing from float_format.keys() will use default_float_format instead.
  • default_correl_format: same as default_float_format, but applies to correl_format
  • align: right-align (>), left-align (<), or don't align (empty string) CSV values
  • atol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
  • rtol: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix

Example

from correldata import _uc
from correldata import _np
from correldata import *

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))

data = dict(X=X, Y=Y, Z=X+Y)

print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))

# yields:
# 
#   X, SE_X,   Y, SE_Y,   Z, SE_Z, correl_X_Z,    ,    , correl_Y_Z,    ,    
# 1.0,  0.3, 4.0,  0.4, 5.0,  0.5,        0.6, 0.0, 0.0,        0.8, 0.0, 0.0
# 2.0,  0.3, 5.0,  0.4, 7.0,  0.5,        0.0, 0.6, 0.0,        0.0, 0.8, 0.0
# 3.0,  0.3, 6.0,  0.4, 9.0,  0.5,        0.0, 0.0, 0.6,        0.0, 0.0, 0.8
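Building on the example above (a hedged sketch, not from the original documentation), per-field formats and excluded fields can be combined like this:

from correldata import _uc, _np, uarray, data_string

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3) * 0.09))
data = dict(Sample = _np.array(['FOO', 'BAR', 'BAZ']), X = X)

print(data_string(
    data,
    float_format = {'X': 'z.2f'},   # 'X' uses 'z.2f'; other fields fall back to default_float_format
    exclude_fields = ['SE_X'],      # omit the SE_X column
    align = '',                     # no padding
))

# yields:
#
# Sample,X
# FOO,1.00
# BAR,2.00
# BAZ,3.00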
def save_data_to_file(data, filename, **kwargs):

Write correlated data to a CSV file.

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • filename: str or path to the file to write to
  • kwargs: passed to data_string()
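A minimal round-trip sketch (the file name is hypothetical):

import correldata
from correldata import _uc, _np, uarray

X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3) * 0.04))
data = dict(Sample = _np.array(['FOO', 'BAR', 'BAZ']), X = X)

correldata.save_data_to_file(data, 'out.csv')   # 'out.csv' is a hypothetical path
data2 = correldata.read_data_from_file('out.csv')
print(data2['X'].nv, data2['X'].se)             # nominal values and standard errors survive the round trip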