correldata

Read/write vectors of correlated data from/to a csv file.

These data are stored in a dictionary, whose values are numpy arrays with elements which may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.

  1"""
  2Read/write vectors of correlated data from/to a csv file.
  3
  4These data are stored in a dictionary, whose values are numpy arrays
  5with elements which may be strings, floats, or floats with associated uncertainties
  6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library.
  7"""
  8
  9
 10__author__    = 'Mathieu Daëron'
 11__contact__   = 'mathieu@daeron.fr'
 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron'
 13__license__   = 'MIT License - https://opensource.org/licenses/MIT'
 14__date__      = '2024-10-09'
 15__version__   = '1.1.0'
 16
 17
 18import os as _os
 19import numpy as _np
 20import uncertainties as _uc
 21
 22
 23class uarray(_np.ndarray):
 24
 25    __doc__ = """
 26    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
 27    of [ufloat](https://pypi.org/project/uncertainties) values
 28    """
 29
 30    def __new__(cls, a):
 31        obj = _np.asarray(a).view(cls)
 32        return obj
 33    
 34    n = property(fget = _np.vectorize(lambda x : x.n))
 35    """Return the array of nominal values (read-only)."""
 36
 37    s = property(fget = _np.vectorize(lambda x : x.s))
 38    """Return the array of standard errors (read-only)"""
 39
 40    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
 41    """Return the correlation matrix of the array elements (read-only)"""
 42
 43    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
 44    """Return the covariance matrix of the array elements (read-only)"""
 45
 46    nv = n
 47    "Alias for `uarray.nv`"
 48
 49    se = s
 50    "Alias for `uarray.s`"
 51
 52    cor = correl
 53    "Alias for `uarray.correl`"
 54
 55    cov = covar
 56    "Alias for `uarray.covar`"    
 57
 58
 59def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
 60	'''
 61	Test whether 2-D array `M` is symmetric and positive semidefinite.
 62	'''
 63	return _np.all(_np.linalg.eigvals(M) >= 0) and _np.all(M - M.T == 0)
 64
 65
 66def smart_type(x: str):
 67	'''
 68	Tries to convert string `x` to a float if it includes a decimal point, or
 69	to an integer if it does not. If both attempts fail, return the original
 70	string unchanged.
 71	'''
 72	try:
 73		y = float(x)
 74	except ValueError:
 75		return x
 76	if y % 1 == 0 and '.' not in x:
 77		return int(y)
 78	return y
 79
 80
 81def read_data(data: str, sep: str = ',', validate_covar: bool = True):
 82	'''
 83	Read correlated data from a CSV-like string.
 84	
 85	Column names are interpreted in the following way:
 86	* In most cases, each columns is converted to a dict value, with the corresponding
 87	dict key being the column's label.
 88	* Columns whose label starts with `SE` are interpreted as specifying the standard
 89	error for the latest preceding data column.
 90	* Columns whose label starts with `correl` are interpreted as specifying the
 91	correlation matrix for the latest preceding data column. In that case, column labels
 92	are ignored for the rest of the columns belonging to this matrix.
 93	* Columns whose label starts with `covar` are interpreted as specifying the
 94	covariance matrix for the latest preceding data column. In that case, column labels
 95	are ignored for the rest of the columns belonging to this matrix.
 96	* `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
 97	the latest preceding data column, by adding an underscore followed by the variable's
 98	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
 99	* `correl`, and `covar` may also be specified for any pair of variable, by adding an
100	underscore followed by the two variable labels, joined by a second underscore
101	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
102	correspond, respectively, to the lines and columns of this matrix.
103	* Exceptions will be raised, for any given variable:
104		- when specifying both `covar` and any combination of (`SE`, `correl`)
105		- when specifying `correl` without `SE`
106
107	**Arguments**
108	- `data`: a CSV-like string
109	- `sep`: the CSV separator
110	- `validate_covar`: whether to check that the overall covariance matrix
111	is symmetric and positive semidefinite. Specifying `validate_covar = False`
112	bypasses this computationally expensive step.
113	
114	**Example**
115	```py
116	import correldata
117	data  = """
118	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
119	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
120	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
121	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
122	"""[1:-1]
123	print(correldata.read_data(data))
124	
125	# yields:
126	# 
127	# > {
128	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
129	#     'Tacid': array([90., 90., 90.]),
130	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
131	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
132	#   }
133	```
134	'''
135
136	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
137	N = len(data) - 1
138
139	values, se, correl, covar = {}, {}, {}, {}
140	j = 0
141	while j < len(data[0]):
142		field = data[0][j]
143		if not (
144			field.startswith('SE_')
145			or field.startswith('correl_')
146			or field.startswith('covar_')
147			or field == 'SE'
148			or field == 'correl'
149			or field == 'covar'
150			or len(field) == 0
151		):
152			values[field] = _np.array([l[j] for l in data[1:]])
153			j += 1
154			oldfield = field
155		elif field.startswith('SE_'):
156			se[field[3:]] = _np.array([l[j] for l in data[1:]])
157			j += 1
158		elif field == 'SE':
159			se[oldfield] = _np.array([l[j] for l in data[1:]])
160			j += 1
161		elif field.startswith('correl_'):
162			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
163			j += N
164		elif field == 'correl':
165			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
166			j += N
167		elif field.startswith('covar_'):
168			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
169			j += N
170		elif field == 'covar':
171			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
172			j += N
173
174	nakedvalues = {}
175	for k in [_ for _ in values]:
176		if (
177			k not in se
178			and k not in correl
179			and k not in covar
180		):
181			nakedvalues[k] = values.pop(k)
182
183	for x in values:
184		if x in covar:
185			if x in se:
186				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
187			if x in correl:
188				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
189		if x in correl:
190			if x not in se:
191				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')
192
193	for x in correl:
194		if x in values:
195			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
196		else:
197			for x1 in values:
198				for x2 in values:
199					if x == f'{x1}_{x2}':
200						if x1 in se:
201							se1 = se[x1]
202						else:
203							if x1 in covar:
204								se1 = _np.diag(covar[x1])**0.5
205							else:
206								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
207						if x2 in se:
208							se2 = se[x2]
209						else:
210							if x2 in covar:
211								se2 = _np.diag(covar[x2])**0.5
212							else:
213								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
214
215						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)
216
217	for x in se:
218		if x in values and x not in correl:
219			covar[x] = _np.diag(se[x]**2)
220
221	for k in [_ for _ in covar]:
222		if k not in values:
223			for j1 in values:
224				for j2 in values:
225					if k == f'{j1}_{j2}':
226						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T
227
228	X = _np.array([_ for k in values for _ in values[k]])
229	CM = _np.zeros((X.size, X.size))
230	for i, vi in enumerate(values):
231		for j, vj in enumerate(values):
232			if vi == vj:
233				if vi in covar:
234					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
235			else:
236				if f'{vi}_{vj}' in covar:
237					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']
238
239	if validate_covar and not is_symmetric_positive_semidefinite(CM):
240		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')
241
242	corvalues = uarray(_uc.correlated_values(X, CM))
243
244	allvalues = nakedvalues
245
246	for i, x in enumerate(values):
247		allvalues[x] = corvalues[i*N:i*N+N]
248
249	return allvalues
250
251
def read_data_from_file(filename: str | _os.PathLike, **kwargs):
	'''
	Read correlated data from a CSV file.

	**Arguments**
	- `filename`: `str` or path to the file to read from
	- `kwargs`: passed to correldata.read_data()
	'''
	with open(filename) as fid:
		contents = fid.read()
	return read_data(contents, **kwargs)
262
def data_string(
	data: dict,
	sep: str = ',',
	float_fmt: str = 'zg',
	max_correl_precision: int = 9,
	fields: list = None,
	align: str = '>',
	atol: float = 1e-12,
	rtol: float = 1e-12,
):
	'''
	Generate CSV-like string from correlated data

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `sep`: the CSV separator
	- `float_fmt`: formatting string for float values
	- `max_correl_precision`: number of post-decimal digits for correlation values
	- `fields`: subset of fields to write; if `None`, write all fields
	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
	- `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html)
	'''
	if fields is None:
		fields = [_ for _ in data]
	cols, ufields = [], []
	for f in fields:
		if isinstance(data[f], uarray):
			ufields.append(f)
			# NOTE(review): all uarray fields are assumed to share this size N
			N = data[f].size
			cols.append([f] + [f'{_.n:{float_fmt}}' for _ in data[f]])
			cols.append([f'SE_{f}'] + [f'{_.s:{float_fmt}}' for _ in data[f]])
			CM = _uc.correlation_matrix(data[f])
			# A self-correlation matrix carries no information when it equals identity
			if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
				for i in range(N):
					cols.append(['' if i else f'correl_{f}'] + [f'{CM[i,j] if abs(CM[i,j]) > atol else 0:z.{max_correl_precision}f}'.rstrip('0') for j in range(N)])

		else:
			cols.append([f] + [str(_) for _ in data[f]])

	# Cross-correlation blocks between pairs of correlated variables
	for i in range(len(ufields)):
		for j in range(i):
			CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N,N:]
			# Bug fix: a cross-correlation block carries no information when it
			# is all zeros (not when it equals the identity). Compare against
			# zero so that independent pairs are omitted and (perfectly)
			# correlated pairs are actually written out.
			if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol):
				for k in range(N):
					cols.append(['' if k else f'correl_{ufields[i]}_{ufields[j]}'] + [f'{CM[k,l] if abs(CM[k,l]) > atol else 0:z.{max_correl_precision}f}'.rstrip('0') for l in range(N)])

	# Transpose columns into CSV lines
	lines = list(map(list, zip(*cols)))

	if align:
		# Pad every cell to its column width for readable, aligned output
		lengths = [max([len(e) for e in l]) for l in cols]
		for l in lines:
			for k,ln in enumerate(lengths):
				l[k] = f'{l[k]:{align}{ln}s}'
		return '\n'.join([(sep+' ').join(l) for l in lines])

	return '\n'.join([sep.join(l) for l in lines])
320
321
322
def save_data_to_file(data, filename, **kwargs):
	'''
	Write correlated data to a CSV file.

	**Arguments**
	- `data`: dict of arrays with strings, floats or correlated data
	- `filename`: `str` or path to the file to write to
	- `kwargs`: passed to correldata.data_string()
	'''
	# Serialize first so that a formatting error does not truncate the target file
	csv_text = data_string(data, **kwargs)
	with open(filename, 'w') as fid:
		return fid.write(csv_text)
class uarray(numpy.ndarray):
24class uarray(_np.ndarray):
25
26    __doc__ = """
27    1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)
28    of [ufloat](https://pypi.org/project/uncertainties) values
29    """
30
31    def __new__(cls, a):
32        obj = _np.asarray(a).view(cls)
33        return obj
34    
35    n = property(fget = _np.vectorize(lambda x : x.n))
36    """Return the array of nominal values (read-only)."""
37
38    s = property(fget = _np.vectorize(lambda x : x.s))
39    """Return the array of standard errors (read-only)"""
40
41    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
42    """Return the correlation matrix of the array elements (read-only)"""
43
44    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
45    """Return the covariance matrix of the array elements (read-only)"""
46
47    nv = n
48    "Alias for `uarray.n`"
49
50    se = s
51    "Alias for `uarray.s`"
52
53    cor = correl
54    "Alias for `uarray.correl`"
55
56    cov = covar
57    "Alias for `uarray.covar`"    

1-D ndarray of ufloat values

n

Return the array of nominal values (read-only).

s

Return the array of standard errors (read-only)

correl
41    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))

Return the correlation matrix of the array elements (read-only)

covar
44    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))

Return the covariance matrix of the array elements (read-only)

nv

Alias for uarray.n

se

Alias for uarray.s

cor
41    correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))

Alias for uarray.correl

cov
44    covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))

Alias for uarray.covar

Inherited Members
numpy.ndarray
dumps
dump
all
any
argmax
argmin
argpartition
argsort
astype
byteswap
choose
clip
compress
conj
conjugate
copy
cumprod
cumsum
diagonal
dot
fill
flatten
getfield
item
max
mean
min
nonzero
partition
prod
put
ravel
repeat
reshape
resize
round
searchsorted
setfield
setflags
sort
squeeze
std
sum
swapaxes
take
tobytes
tofile
tolist
tostring
trace
transpose
var
view
to_device
ndim
flags
shape
strides
data
itemsize
size
nbytes
base
dtype
real
imag
flat
ctypes
T
mT
ptp
newbyteorder
itemset
device
def is_symmetric_positive_semidefinite(M: numpy.ndarray) -> bool:
60def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool:
61	'''
62	Test whether 2-D array `M` is symmetric and positive semidefinite.
63	'''
64	return _np.all(_np.linalg.eigvals(M) >= 0) and _np.all(M - M.T == 0)

Test whether 2-D array M is symmetric and positive semidefinite.

def smart_type(x: str):
67def smart_type(x: str):
68	'''
69	Tries to convert string `x` to a float if it includes a decimal point, or
70	to an integer if it does not. If both attempts fail, return the original
71	string unchanged.
72	'''
73	try:
74		y = float(x)
75	except ValueError:
76		return x
77	if y % 1 == 0 and '.' not in x:
78		return int(y)
79	return y

Tries to convert string x to a float if it includes a decimal point, or to an integer if it does not. If both attempts fail, return the original string unchanged.

def read_data(data: str, sep: str = ',', validate_covar: bool = True):
 82def read_data(data: str, sep: str = ',', validate_covar: bool = True):
 83	'''
 84	Read correlated data from a CSV-like string.
 85	
 86	Column names are interpreted in the following way:
 87	* In most cases, each columns is converted to a dict value, with the corresponding
 88	dict key being the column's label.
 89	* Columns whose label starts with `SE` are interpreted as specifying the standard
 90	error for the latest preceding data column.
 91	* Columns whose label starts with `correl` are interpreted as specifying the
 92	correlation matrix for the latest preceding data column. In that case, column labels
 93	are ignored for the rest of the columns belonging to this matrix.
 94	* Columns whose label starts with `covar` are interpreted as specifying the
 95	covariance matrix for the latest preceding data column. In that case, column labels
 96	are ignored for the rest of the columns belonging to this matrix.
 97	* `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than
 98	the latest preceding data column, by adding an underscore followed by the variable's
 99	label (ex: `SE_foo`, `correl_bar`, `covar_baz`).
100	* `correl`, and `covar` may also be specified for any pair of variable, by adding an
101	underscore followed by the two variable labels, joined by a second underscore
102	(ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables
103	correspond, respectively, to the lines and columns of this matrix.
104	* Exceptions will be raised, for any given variable:
105		- when specifying both `covar` and any combination of (`SE`, `correl`)
106		- when specifying `correl` without `SE`
107
108	**Arguments**
109	- `data`: a CSV-like string
110	- `sep`: the CSV separator
111	- `validate_covar`: whether to check that the overall covariance matrix
112	is symmetric and positive semidefinite. Specifying `validate_covar = False`
113	bypasses this computationally expensive step.
114	
115	**Example**
116	```py
117	import correldata
118	data  = """
119	Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
120	   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
121	   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
122	   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
123	"""[1:-1]
124	print(correldata.read_data(data))
125	
126	# yields:
127	# 
128	# > {
129	#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
130	#     'Tacid': array([90., 90., 90.]),
131	#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
132	#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
133	#   }
134	```
135	'''
136
137	data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')]
138	N = len(data) - 1
139
140	values, se, correl, covar = {}, {}, {}, {}
141	j = 0
142	while j < len(data[0]):
143		field = data[0][j]
144		if not (
145			field.startswith('SE_')
146			or field.startswith('correl_')
147			or field.startswith('covar_')
148			or field == 'SE'
149			or field == 'correl'
150			or field == 'covar'
151			or len(field) == 0
152		):
153			values[field] = _np.array([l[j] for l in data[1:]])
154			j += 1
155			oldfield = field
156		elif field.startswith('SE_'):
157			se[field[3:]] = _np.array([l[j] for l in data[1:]])
158			j += 1
159		elif field == 'SE':
160			se[oldfield] = _np.array([l[j] for l in data[1:]])
161			j += 1
162		elif field.startswith('correl_'):
163			correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]])
164			j += N
165		elif field == 'correl':
166			correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
167			j += N
168		elif field.startswith('covar_'):
169			covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]])
170			j += N
171		elif field == 'covar':
172			covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]])
173			j += N
174
175	nakedvalues = {}
176	for k in [_ for _ in values]:
177		if (
178			k not in se
179			and k not in correl
180			and k not in covar
181		):
182			nakedvalues[k] = values.pop(k)
183
184	for x in values:
185		if x in covar:
186			if x in se:
187				raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".')
188			if x in correl:
189				raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".')
190		if x in correl:
191			if x not in se:
192				raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".')
193
194	for x in correl:
195		if x in values:
196			covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x])
197		else:
198			for x1 in values:
199				for x2 in values:
200					if x == f'{x1}_{x2}':
201						if x1 in se:
202							se1 = se[x1]
203						else:
204							if x1 in covar:
205								se1 = _np.diag(covar[x1])**0.5
206							else:
207								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
208						if x2 in se:
209							se2 = se[x2]
210						else:
211							if x2 in covar:
212								se2 = _np.diag(covar[x2])**0.5
213							else:
214								raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".')
215
216						covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2)
217
218	for x in se:
219		if x in values and x not in correl:
220			covar[x] = _np.diag(se[x]**2)
221
222	for k in [_ for _ in covar]:
223		if k not in values:
224			for j1 in values:
225				for j2 in values:
226					if k == f'{j1}_{j2}':
227						covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T
228
229	X = _np.array([_ for k in values for _ in values[k]])
230	CM = _np.zeros((X.size, X.size))
231	for i, vi in enumerate(values):
232		for j, vj in enumerate(values):
233			if vi == vj:
234				if vi in covar:
235					CM[N*i:N*i+N,N*j:N*j+N] = covar[vi]
236			else:
237				if f'{vi}_{vj}' in covar:
238					CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}']
239
240	if validate_covar and not is_symmetric_positive_semidefinite(CM):
241		raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.')
242
243	corvalues = uarray(_uc.correlated_values(X, CM))
244
245	allvalues = nakedvalues
246
247	for i, x in enumerate(values):
248		allvalues[x] = corvalues[i*N:i*N+N]
249
250	return allvalues

Read correlated data from a CSV-like string.

Column names are interpreted in the following way:

  • In most cases, each columns is converted to a dict value, with the corresponding dict key being the column's label.
  • Columns whose label starts with SE are interpreted as specifying the standard error for the latest preceding data column.
  • Columns whose label starts with correl are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • Columns whose label starts with covar are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix.
  • SE, correl, and covar may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex: SE_foo, correl_bar, covar_baz).
  • correl, and covar may also be specified for any pair of variable, by adding an underscore followed by the two variable labels, joined by a second underscore (ex: correl_foo_bar, covar_X_Y). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.
  • Exceptions will be raised, for any given variable:
    • when specifying both covar and any combination of (SE, correl)
    • when specifying correl without SE

Arguments

  • data: a CSV-like string
  • sep: the CSV separator
  • validate_covar: whether to check that the overall covariance matrix is symmetric and positive semidefinite. Specifying validate_covar = False bypasses this computationally expensive step.

Example

import correldata
data  = """
Sample, Tacid,  D47,   SE,         correl,,,  D48, covar,,,          correl_D47_D48
   FOO,   90., .245, .005,      1, 0.5, 0.5, .145,  4e-4, 1e-4, 1e-4, 0.5,   0,   0
   BAR,   90., .246, .005,    0.5,   1, 0.5, .146,  1e-4, 4e-4, 1e-4,   0, 0.5,   0
   BAZ,   90., .247, .005,    0.5, 0.5,   1, .147,  1e-4, 1e-4, 4e-4,   0,   0, 0.5
"""[1:-1]
print(read_data(data))

# yields:
# 
# > {
#     'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
#     'Tacid': array([90., 90., 90.]),
#     'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
#     'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
#   }
def read_data_from_file(filename: str | os.PathLike, **kwargs):
253def read_data_from_file(filename: str | _os.PathLike, **kwargs):
254	'''
255	Read correlated data from a CSV file.
256
257	**Arguments**
258	- `filename`: `str` or path to the file to read from
259	- `kwargs`: passed to correldata.read_data()
260	'''
261	with open(filename) as fid:
262		return read_data(fid.read(), **kwargs)

Read correlated data from a CSV file.

Arguments

  • filename: str or path to the file to read from
  • kwargs: passed to read_data()
def data_string( data: dict, sep: str = ',', float_fmt: str = 'zg', max_correl_precision: int = 9, fields: list = None, align: str = '>', atol: float = 1e-12, rtol: float = 1e-12):
264def data_string(
265	data: dict,
266	sep: str = ',',
267	float_fmt: str = 'zg',
268	max_correl_precision: int = 9,
269	fields: list = None,
270	align: str = '>',
271	atol: float = 1e-12,
272	rtol: float = 1e-12,
273):
274	'''
275	Generate CSV-like string from correlated data
276
277	**Arguments**
278	- `data`: dict of arrays with strings, floats or correlated data
279	- `sep`: the CSV separator
280	- `float_fmt`: formatting string for float values
281	- `max_correl_precision`: number of post-decimal digits for correlation values
282	- `fields`: subset of fields to write; if `None`, write all fields
283	- `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values
284	- `atol`: passed to _np.allclose(),
285	- `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html),
286	'''
287	if fields is None:
288		fields = [_ for _ in data]
289	cols, ufields = [], []
290	for f in fields:
291		if isinstance(data[f], uarray):
292			ufields.append(f)
293			N = data[f].size
294			cols.append([f] + [f'{_.n:{float_fmt}}' for _ in data[f]])
295			cols.append([f'SE_{f}'] + [f'{_.s:{float_fmt}}' for _ in data[f]])
296			CM = _uc.correlation_matrix(data[f])
297			if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
298				for i in range(N):
299					cols.append(['' if i else f'correl_{f}'] + [f'{CM[i,j] if abs(CM[i,j]) > atol else 0:z.{max_correl_precision}f}'.rstrip('0') for j in range(N)])
300
301		else:
302			cols.append([f] + [str(_) for _ in data[f]])
303
304	for i in range(len(ufields)):
305		for j in range(i):
306			CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N,N:]
307			if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol):
308				for k in range(N):
309					cols.append(['' if k else f'correl_{ufields[i]}_{ufields[j]}'] + [f'{CM[k,l] if abs(CM[k,l]) > atol else 0:z.{max_correl_precision}f}'.rstrip('0') for l in range(N)])
310
311	lines = list(map(list, zip(*cols)))
312
313	if align:
314		lengths = [max([len(e) for e in l]) for l in cols]
315		for l in lines:
316			for k,ln in enumerate(lengths):
317				l[k] = f'{l[k]:{align}{ln}s}'
318		return '\n'.join([(sep+' ').join(l) for l in lines])
319
320	return '\n'.join([sep.join(l) for l in lines])

Generate CSV-like string from correlated data

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • sep: the CSV separator
  • float_fmt: formatting string for float values
  • max_correl_precision: number of post-decimal digits for correlation values
  • fields: subset of fields to write; if None, write all fields
  • align: right-align (>), left-align (<), or don't align (empty string) CSV values
  • atol: passed to numpy.allclose(),
  • rtol: passed to numpy.allclose(),
def save_data_to_file(data, filename, **kwargs):
324def save_data_to_file(data, filename, **kwargs):
325	'''
326	Write correlated data to a CSV file.
327
328	**Arguments**
329	- `data`: dict of arrays with strings, floats or correlated data
330	- `filename`: `str` or path to the file to read from
331	- `kwargs`: passed to correldata.data_string()
332	'''
333	with open(filename, 'w') as fid:
334		return fid.write(data_string(data, **kwargs))

Write correlated data to a CSV file.

Arguments

  • data: dict of arrays with strings, floats or correlated data
  • filename: str or path to the file to read from
  • kwargs: passed to data_string()