Coverage for C:\src\imod-python\imod\formats\array_io\reading.py: 98%
171 statements
coverage.py v7.5.1, created at 2024-05-08 14:15 +0200
import collections
import glob
import itertools
import pathlib

import dask
import dask.array  # the submodule is not pulled in by "import dask" alone
import numpy as np
import xarray as xr

from imod.util import nested_dict, spatial, time

def _all_equal(seq, elem):
    """Raise an error if not all elements of a list are equal"""
    if not seq.count(seq[0]) == len(seq):
        raise ValueError(f"All {elem} must be the same, found: {set(seq)}")

def _check_cellsizes(cellsizes):
    """
    Checks if cellsizes match, raises ValueError otherwise

    Parameters
    ----------
    cellsizes : list of tuples
        tuples may contain:
        * two floats, dx and dy, for equidistant files
        * two ndarrays, dx and dy, for nonequidistant files

    Returns
    -------
    None
    """
    msg = "Cellsizes of IDFs do not match"
    if len(cellsizes) == 1:
        return None
    try:
        if not (cellsizes.count(cellsizes[0]) == len(cellsizes)):
            raise ValueError(msg)
    except ValueError:  # contains ndarrays
        try:
            # all ndarrays
            dx0, dy0 = cellsizes[0]
            for dx, dy in cellsizes[1:]:
                if np.allclose(dx0, dx) and np.allclose(dy0, dy):
                    pass
                else:
                    raise ValueError(msg)
        except ValueError:
            # some ndarrays, some floats
            # create floats for comparison with allclose
            try:
                dx = cellsizes[0][0][0]
                dy = cellsizes[0][1][0]
            except TypeError:
                dx = cellsizes[0][0]
                dy = cellsizes[0][1]
            # comparison
            for cellsize in cellsizes:
                # Unfortunately this allocates by broadcasting dx and dy
                if not np.allclose(cellsize[0], dx):
                    raise ValueError(msg)
                if not np.allclose(cellsize[1], dy):
                    raise ValueError(msg)

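# A minimal sketch (not part of the original module) of the two cellsize forms
# _check_cellsizes accepts: plain floats for equidistant files and ndarrays for
# nonequidistant files. All values below are made up for illustration.
def _example_check_cellsizes():
    # Equidistant: identical (dx, dy) floats pass silently.
    _check_cellsizes([(25.0, -25.0), (25.0, -25.0)])
    # Nonequidistant: per-column dx and per-row dy arrays, compared with np.allclose.
    dx = np.array([25.0, 25.0, 50.0])
    dy = np.array([-25.0, -50.0])
    _check_cellsizes([(dx, dy), (dx.copy(), dy.copy())])
    # Mismatching cellsizes raise a ValueError.
    try:
        _check_cellsizes([(25.0, -25.0), (50.0, -50.0)])
    except ValueError:
        pass
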
def _has_dim(seq):
    """
    Check whether either none or all of the entries in seq are None.

    Returns
    -------
    True if seq contains no None, False if all entries are None,
    raises a ValueError otherwise.
    """
    nones = [x is None for x in seq]
    if any(nones):
        if all(nones):
            return False
        else:
            raise ValueError("Either 0 or all None allowed")
    return True

def _to_nan(a, nodata):
    """Change all nodata values in the array to NaN"""
    # it needs to be NaN for xarray to deal with it properly
    # no need to store the nodata value if it is always NaN
    if np.isnan(nodata):
        return a
    else:
        isnodata = np.isclose(a, nodata)
        a[isnodata] = np.nan
        return a

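# A small sketch (not part of the original module): _to_nan replaces the file's
# nodata value with NaN so xarray treats it as missing data. The nodata value
# below is made up.
def _example_to_nan():
    a = np.array([[1.0, -9999.0], [-9999.0, 4.0]], dtype=np.float32)
    b = _to_nan(a, nodata=-9999.0)  # modifies and returns the same array
    return np.isnan(b)  # [[False, True], [True, False]]
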
def _array_z_coord(coords, tops, bots, unique_indices):
    top = np.array(tops)[unique_indices]
    bot = np.array(bots)[unique_indices]
    dz = top - bot
    z = top - 0.5 * dz
    if top[0] > top[1]:  # decreasing coordinate
        dz *= -1.0
    if np.allclose(dz, dz[0]):
        coords["dz"] = dz[0]
    else:
        coords["dz"] = ("layer", dz)
    coords["z"] = ("layer", z)
    return coords

def _scalar_z_coord(coords, tops, bots):
    # They must all be the same to be used, as they cannot be assigned
    # to a layer coordinate.
    top = np.unique(tops)
    bot = np.unique(bots)
    if top.size == bot.size == 1:
        dz = top - bot
        z = top - 0.5 * dz
        coords["dz"] = float(dz)  # cast from array
        coords["z"] = float(z)
    return coords

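# A minimal sketch (not part of the original module) of how the two z-coord
# helpers behave: per-layer tops/bots become a "z" coordinate along "layer",
# while identical tops/bots collapse to scalar "z" and "dz". The numbers are
# made up for illustration.
def _example_z_coords():
    # Layered case: three layers, decreasing z, constant thickness.
    tops = [0.0, -10.0, -20.0]
    bots = [-10.0, -20.0, -30.0]
    unique_indices = np.array([0, 1, 2])
    coords = _array_z_coord({}, tops, bots, unique_indices)
    # coords["dz"] == -10.0, coords["z"] == ("layer", [-5., -15., -25.])

    # Single-plane case: every file shares the same top and bot.
    scalar_coords = _scalar_z_coord({}, tops=[0.0, 0.0], bots=[-10.0, -10.0])
    # scalar_coords == {"dz": 10.0, "z": -5.0}
    return coords, scalar_coords
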
def _initialize_groupby(ndims):
    """
    Generate a data structure of nested defaultdicts to group arrays by
    dimension. The number of dimensions may vary, so the degree of nesting
    varies as well.

    For a single dimension such as layer, it looks like:
    d = {1: da1, 2: da2, etc.}

    For two dimensions, layer and time:
    d = {"2001-01-01": {1: da1, 2: da3}, "2001-01-02": {1: da3, 2: da4}, etc.}

    And so on for more dimensions.

    Defaultdicts are very well suited to this application. An
    itertools.groupby object does not provide any benefits here: it is simply
    a generator, its entries have to come presorted, and it provides no tools
    for these kinds of variably nested groupbys.

    pandas.groupby does provide this functionality. However, pandas dataframes
    do not accept arbitrary field values, whereas these dictionaries do. It
    might be worthwhile to look into if performance becomes an issue.

    Parameters
    ----------
    ndims : int
        Number of dimensions

    Returns
    -------
    groupby : defaultdicts, ndims - 1 times nested
    """
    return nested_dict.initialize_nested_dict(ndims - 1)

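# A small sketch (not part of the original module): for dims
# ("time", "layer", "y", "x") there are two groupby dimensions, so
# _initialize_groupby(2) returns a singly nested defaultdict keyed by time,
# then layer, mirroring how _load fills it. The keys and arrays are made up.
def _example_groupby():
    groupby = _initialize_groupby(2)
    nested_dict.set_nested(groupby, ["2001-01-01", 1], dask.array.zeros((2, 2)))
    nested_dict.set_nested(groupby, ["2001-01-01", 2], dask.array.ones((2, 2)))
    # Sorting by keys yields nested lists ready for _ndconcat,
    # something like [[<zeros>, <ones>]].
    return nested_dict.sorted_nested_dict(groupby)
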
def _ndconcat(arrays, ndim):
    """
    Parameters
    ----------
    arrays : list of lists, n levels deep.
        E.g. [[da1, da2], [da3, da4]] for n = 2.
        (compare with the docstring of _initialize_groupby)
    ndim : int
        number of dimensions over which to concatenate.

    Returns
    -------
    concatenated : dask.array
        Input stacked over ndim dimensions.
    """
    if ndim == 1:  # base case
        return dask.array.stack(arrays, axis=0)
    else:  # recursive case
        ndim = ndim - 1
        out = [_ndconcat(arrays_in, ndim) for arrays_in in arrays]
        return dask.array.stack(out, axis=0)

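# A minimal sketch (not part of the original module): stacking the nested lists
# produced by the groupby above back into a single dask array. The shapes are
# made up; two time steps, two layers, and a 3 x 4 grid.
def _example_ndconcat():
    blocks = [
        [dask.array.zeros((3, 4)), dask.array.ones((3, 4))],  # first time step
        [dask.array.zeros((3, 4)), dask.array.ones((3, 4))],  # second time step
    ]
    stacked = _ndconcat(blocks, ndim=2)
    return stacked.shape  # (2, 2, 3, 4)
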
def _dims_coordinates(header_coords, bounds, cellsizes, tops, bots, use_cftime):
    # create coordinates
    coords = spatial._xycoords(bounds[0], cellsizes[0])
    dims = ["y", "x"]
    # Time and layer have to be special cased:
    # time due to the multitude of possible datetimes,
    # layer because it is required to properly assign top and bot data.
    haslayer = False
    hastime = False
    otherdims = []
    for dim in list(header_coords.keys()):
        if dim == "layer":
            haslayer = True
            coords["layer"], unique_indices = np.unique(
                header_coords["layer"], return_index=True
            )
        elif dim == "time":
            hastime = True
            times, use_cftime = time._convert_datetimes(
                header_coords["time"], use_cftime
            )
            if use_cftime:
                coords["time"] = xr.CFTimeIndex(np.unique(times))
            else:
                coords["time"] = np.unique(times)
        else:
            otherdims.append(dim)
            coords[dim] = np.unique(header_coords[dim])

    # Ensure the right dimension order
    if haslayer:
        dims.insert(0, "layer")
    if hastime:
        dims.insert(0, "time")
    for dim in otherdims:
        dims.insert(0, dim)

    # Deal with voxel idf top and bottom data
    all_have_z = all(v is not None for v in itertools.chain(tops, bots))
    if all_have_z:
        if haslayer and coords["layer"].size > 1:
            coords = _array_z_coord(coords, tops, bots, unique_indices)
        else:
            coords = _scalar_z_coord(coords, tops, bots)

    return dims, coords

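# A small sketch (not part of the original module): building dims and coords
# for two layers of a 2 x 3 grid. The bounds, cellsizes, tops and bots are made
# up; header_coords mirrors what _load collects from the file headers.
def _example_dims_coordinates():
    header_coords = collections.defaultdict(list)
    header_coords["layer"] = [1, 2]
    bounds = [(0.0, 75.0, 0.0, 50.0)] * 2  # (xmin, xmax, ymin, ymax) per file
    cellsizes = [(25.0, -25.0)] * 2        # (dx, dy) per file
    tops = [0.0, -10.0]
    bots = [-10.0, -20.0]
    dims, coords = _dims_coordinates(
        header_coords, bounds, cellsizes, tops, bots, use_cftime=False
    )
    return dims  # ["layer", "y", "x"]
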
def _dask(path, attrs=None, pattern=None, _read=None, header=None):
    """
    Read a single IDF file to a dask.array

    Parameters
    ----------
    path : str or Path
        Path to the IDF file to be read
    attrs : dict, optional
        A dict as returned by imod.idf.header; if not supplied, the header
        function is called. Used to minimize unneeded filesystem calls.
    pattern : str, regex pattern, optional
        If the filenames do not match the default naming convention of
        {name}_{time}_l{layer}, a custom pattern can be defined here, either
        as a string or as a compiled regular expression pattern. Please refer
        to the examples in ``imod.idf.open``.

    Returns
    -------
    dask.array
        A float32 dask.array with shape (nrow, ncol) of the values
        in the IDF file. On opening all nodata values are changed
        to NaN in the dask.array.
    dict
        A dict with all metadata.
    """
    path = pathlib.Path(path)

    if attrs is None:
        attrs = header(path, pattern)
    # If we don't unpack, it seems we run into trouble with the dask array
    # later on, probably because attrs isn't immutable. This works fine instead.
    headersize = attrs.pop("headersize")
    nrow = attrs["nrow"]
    ncol = attrs["ncol"]
    dtype = attrs["dtype"]
    # In case of floating point data, nodata is always represented by nan.
    if "float" in dtype:
        nodata = attrs.pop("nodata")
    else:
        nodata = attrs["nodata"]

    # Dask delayed caches the input arguments. If the working directory changes
    # before .compute(), the file cannot be found if the path is relative.
    abspath = path.resolve()
    # dask.delayed requires currying
    a = dask.delayed(_read)(abspath, headersize, nrow, ncol, nodata, dtype)
    x = dask.array.from_delayed(a, shape=(nrow, ncol), dtype=dtype)
    return x, attrs

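# A minimal sketch (not part of the original module) of how _dask wires a
# format-specific reader into a lazy dask array. The attrs dict and the
# _my_read function below are hypothetical stand-ins for the callables a
# concrete format module (e.g. imod.idf) supplies.
def _example_dask(path):
    def _my_read(path, headersize, nrow, ncol, nodata, dtype):
        # Hypothetical reader: skip the header, read nrow * ncol values.
        with open(path, "rb") as f:
            f.seek(headersize)
            a = np.fromfile(f, dtype=dtype, count=nrow * ncol).reshape((nrow, ncol))
        return _to_nan(a, nodata)

    attrs = {
        "headersize": 52,  # made-up header length
        "nrow": 2,
        "ncol": 3,
        "dtype": "float32",
        "nodata": -9999.0,
    }
    x, attrs = _dask(path, attrs=attrs, _read=_my_read)
    return x  # dask array of shape (2, 3); nothing is read until .compute()
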
def _load(paths, use_cftime, _read, headers):
    """Combine a list of paths to IDFs into a single xarray.DataArray"""
    # this function also works for single IDFs
    names = [h["name"] for h in headers]
    _all_equal(names, "names")

    # Extract data from headers
    bounds = []
    cellsizes = []
    tops = []
    bots = []
    header_coords = collections.defaultdict(list)
    for h in headers:
        bounds.append((h["xmin"], h["xmax"], h["ymin"], h["ymax"]))
        cellsizes.append((h["dx"], h["dy"]))
        tops.append(h.get("top", None))
        bots.append(h.get("bot", None))
        for key in h["dims"]:
            header_coords[key].append(h[key])
    # Do a basic check whether IDFs align in x and y
    _all_equal(bounds, "bounding boxes")
    _check_cellsizes(cellsizes)
    # Generate coordinates
    dims, coords = _dims_coordinates(
        header_coords, bounds, cellsizes, tops, bots, use_cftime
    )
    # This part makes use of recursion to deal with an arbitrary number
    # of dimensions. It may therefore be a little hard to read.
    groupbydims = dims[:-2]  # skip y and x
    ndim = len(groupbydims)
    groupby = _initialize_groupby(ndim)
    if ndim == 0:  # Single idf
        dask_array, _ = _dask(paths[0], headers[0], _read=_read)
    else:
        for path, attrs in zip(paths, headers):
            da, _ = _dask(path, attrs=attrs, _read=_read)
            groupbykeys = [attrs[k] for k in groupbydims]
            nested_dict.set_nested(groupby, groupbykeys, da)
        dask_arrays = nested_dict.sorted_nested_dict(groupby)
        dask_array = _ndconcat(dask_arrays, ndim)

    out = xr.DataArray(dask_array, coords, dims, name=names[0])

    first_attrs = headers[0]

    if "crs" in first_attrs:
        out.attrs["crs"] = first_attrs["crs"]
    if "nodata" in first_attrs:
        out.attrs["nodata"] = first_attrs["nodata"]

    return out

def _open(path, use_cftime, pattern, header, _read):
    if isinstance(path, pathlib.Path):
        path = str(path)

    if isinstance(path, list):
        paths = path
    else:
        paths = [pathlib.Path(p) for p in glob.glob(path)]

    headers = [header(p, pattern) for p in paths]
    n = len(paths)
    if n == 0:
        raise FileNotFoundError(f"Could not find any files matching {path}")
    return _load(paths, use_cftime, _read, headers)

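# A minimal sketch (not part of the original module) of how a format module is
# expected to call _open: it injects its own header parser and low-level
# reader. _my_header and _my_read are hypothetical stubs; in practice a user
# never calls this directly but goes through e.g. imod.idf.open, which wires
# in its IDF-specific equivalents.
def _example_open(globpath):
    def _my_header(path, pattern):
        # Hypothetical: parse the file header and filename into a dict with at
        # least name, nrow, ncol, dtype, nodata, headersize, dx, dy,
        # xmin, xmax, ymin, ymax and a "dims" list.
        raise NotImplementedError

    def _my_read(path, headersize, nrow, ncol, nodata, dtype):
        # Hypothetical: return an (nrow, ncol) ndarray with nodata as NaN.
        raise NotImplementedError

    return _open(
        globpath, use_cftime=False, pattern=None, header=_my_header, _read=_my_read
    )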