Coverage for C:\src\imod-python\imod\prepare\spatial.py: 92%
361 statements
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-08 14:15 +0200
« prev ^ index » next coverage.py v7.5.1, created at 2024-05-08 14:15 +0200
1import pathlib
2from typing import TYPE_CHECKING, Callable, Union
4import dask
5import numba
6import numpy as np
7import pandas as pd
8import scipy.ndimage
9import xarray as xr
10from numpy.typing import DTypeLike
12import imod
13from imod.prepare import common, pcg
14from imod.util.imports import MissingOptionalModule
15from imod.util.spatial import _polygonize
17# since rasterio is a big dependency that is sometimes hard to install
18# and not always required, we made this an optional dependency
19try:
20 import rasterio
21 import rasterio.features
22 import rasterio.warp
23except ImportError:
24 rasterio = MissingOptionalModule("rasterio")
26if TYPE_CHECKING:
27 import geopandas as gpd
30def round_extent(extent, cellsize):
31 """Increases the extent until all sides lie on a coordinate
32 divisible by cellsize."""
33 xmin, ymin, xmax, ymax = extent
34 xmin = np.floor(xmin / cellsize) * cellsize
35 ymin = np.floor(ymin / cellsize) * cellsize
36 xmax = np.ceil(xmax / cellsize) * cellsize
37 ymax = np.ceil(ymax / cellsize) * cellsize
38 return xmin, ymin, xmax, ymax
41def round_z(z_extent, dz):
42 """Increases the extent until all sides lie on a coordinate
43 divisible by dz."""
44 zmin, zmax = z_extent
45 zmin = np.floor(zmin / dz) * dz
46 zmax = np.ceil(zmax / dz) * dz
47 return zmin, zmax
50def _fill_np(data, invalid):
51 """Basic nearest neighbour interpolation"""
52 # see: https://stackoverflow.com/questions/5551286/filling-gaps-in-a-numpy-array
53 ind = scipy.ndimage.distance_transform_edt(
54 invalid, return_distances=False, return_indices=True
55 )
56 return data[tuple(ind)]
59def fill(da, invalid=None, by=None):
60 """
61 Replace the value of invalid ``da`` cells (indicated by ``invalid``)
62 using basic nearest neighbour interpolation.
64 Parameters
65 ----------
66 da: xr.DataArray with gaps
67 array containing missing value
68 by: str, optional
69 dimension over which the array will be filled, one by one.
70 See the examples.
72 invalid: xr.DataArray
73 a binary array of same shape as ``da``.
74 data value are replaced where invalid is True
75 If None (default), uses: `invalid = np.isnan(data)`
77 Returns
78 -------
79 xarray.DataArray
80 with the same coordinates as the input.
82 Examples
83 --------
85 A common use case is filling holes in a DataArray, filling it with the
86 value of its nearest (valid) neighbor:
88 >>> filled = imod.prepare.fill(da)
90 In case of a tie (e.g. neighbors in x and y are both one cell removed), the
91 neighbor in the last dimension is chosen (for rasters, that's generally x).
93 A typical use case is filling a 3D array (layer, y, x), but only in the
94 horizontal dimensions. The ``by`` keyword can be used to do this:
96 >>> filled = imod.prepare.fill(da, by="layer")
98 In this case, the array is filled by one layer at a time.
99 """
101 out = xr.full_like(da, np.nan)
102 if invalid is None:
103 invalid = np.isnan(da)
104 if by:
105 for coordvalue in da[by]:
106 d = {by: coordvalue}
107 out.sel(d)[...] = _fill_np(da.sel(d).values, invalid.sel(d).values)
108 else:
109 out.values = _fill_np(da.values, invalid.values)
111 return out
114def laplace_interpolate(
115 source, ibound=None, close=0.01, mxiter=5, iter1=50, relax=0.98
116):
117 """
118 Fills gaps in `source` by interpolating from existing values using Laplace
119 interpolation.
121 Parameters
122 ----------
123 source : xr.DataArray of floats with dims (y, x)
124 Data values to interpolate.
125 ibound : xr.DataArray of bool with dims (y, x)
126 Precomputed array which marks where to interpolate.
127 close : float
128 Closure criteration of iterative solver. Should be one to two orders
129 of magnitude smaller than desired accuracy.
130 mxiter : int
131 Outer iterations of iterative solver.
132 iter1 : int
133 Inner iterations of iterative solver. Should not exceed 50.
134 relax : float
135 Iterative solver relaxation parameter. Should be between 0 and 1.
137 Returns
138 -------
139 interpolated : xr.DataArray with dims (y, x)
140 source, with interpolated values where ibound equals 1
141 """
142 solver = pcg.PreconditionedConjugateGradientSolver(
143 close, close * 1.0e6, mxiter, iter1, relax
144 )
146 if not source.dims == ("y", "x"):
147 raise ValueError('source dims must be ("y", "x")')
149 # expand dims to make 3d
150 source3d = source.expand_dims("layer")
151 if ibound is None:
152 iboundv = xr.full_like(source3d, 1, dtype=np.int32).values
153 else:
154 if not ibound.dims == ("y", "x"):
155 raise ValueError('ibound dims must be ("y", "x")')
156 if not ibound.shape == source.shape:
157 raise ValueError("ibound and source must have the same shape")
158 iboundv = ibound.expand_dims("layer").astype(np.int32).values
160 has_data = source3d.notnull().values
161 iboundv[has_data] = -1
162 hnew = source3d.fillna(0.0).values.astype(
163 np.float64
164 ) # Set start interpolated estimate to 0.0
166 shape = iboundv.shape
167 nlay, nrow, ncol = shape
168 nodes = nlay * nrow * ncol
169 # Allocate work arrays
170 # Not really used now, but might come in handy to implements weights
171 cc = np.ones(shape)
172 cr = np.ones(shape)
173 cv = np.ones(shape)
174 rhs = np.zeros(shape)
175 hcof = np.zeros(shape)
176 # Solver work arrays
177 res = np.zeros(nodes)
178 cd = np.zeros(nodes)
179 v = np.zeros(nodes)
180 ss = np.zeros(nodes)
181 p = np.zeros(nodes)
183 # Picard iteration
184 converged = False
185 outer_iteration = 0
186 while not converged and outer_iteration < mxiter:
187 # Mutates hnew
188 converged = solver.solve(
189 hnew=hnew,
190 cc=cc,
191 cr=cr,
192 cv=cv,
193 ibound=iboundv,
194 rhs=rhs,
195 hcof=hcof,
196 res=res,
197 cd=cd,
198 v=v,
199 ss=ss,
200 p=p,
201 )
202 outer_iteration += 1
203 else:
204 if not converged:
205 raise RuntimeError("Failed to converge")
207 hnew[iboundv == 0] = np.nan
208 return source.copy(data=hnew[0])
211def rasterize(geodataframe, like, column=None, fill=np.nan, **kwargs):
212 """
213 Rasterize a geopandas GeoDataFrame onto the given
214 xarray coordinates.
216 Parameters
217 ----------
218 geodataframe : geopandas.GeoDataFrame
219 column : str, int, float
220 column name of geodataframe to burn into raster
221 like : xarray.DataArray
222 Example DataArray. The rasterized result will match the shape and
223 coordinates of this DataArray.
224 fill : float, int
225 Fill value for nodata areas. Optional, default value is np.nan.
226 kwargs : additional keyword arguments for rasterio.features.rasterize.
227 See: https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.rasterize
229 Returns
230 -------
231 rasterized : xarray.DataArray
232 Vector data rasterized. Matches shape and coordinates of ``like``.
233 """
235 if column is not None:
236 shapes = list(zip(geodataframe.geometry, geodataframe[column]))
237 else:
238 shapes = list(geodataframe.geometry)
240 # shapes must be an iterable
241 try:
242 iter(shapes)
243 except TypeError:
244 shapes = (shapes,)
246 raster = rasterio.features.rasterize(
247 shapes,
248 out_shape=like.shape,
249 fill=fill,
250 transform=imod.util.spatial.transform(like),
251 **kwargs,
252 )
254 return xr.DataArray(raster, like.coords, like.dims)
257def polygonize(da: xr.DataArray) -> "gpd.GeoDataFrame":
258 """
259 Polygonize a 2D-DataArray into a GeoDataFrame of polygons.
261 Parameters
262 ----------
263 da : xr.DataArray
265 Returns
266 -------
267 polygonized : geopandas.GeoDataFrame
268 """
269 return _polygonize(da)
272def _handle_dtype(dtype, nodata):
273 # Largely taken from rasterio.dtypes
274 # https://github.com/mapbox/rasterio/blob/master/rasterio/dtypes.py
275 # Not supported:
276 # GDT_CInt16 = 8, GDT_CInt32 = 9, GDT_CFloat32 = 10, GDT_CFloat64 = 11
277 dtype_mapping = {
278 "uint8": 1, # GDT_Byte
279 "uint16": 2, # GDT_Uint16
280 "int16": 3, # GDT_Int16
281 "uint32": 4, # GDT_Uint32
282 "int32": 5, # GDT_Int32
283 "float32": 6, # GDT_Float32
284 "float64": 7, # GDT_Float64
285 }
286 dtype_ranges = {
287 "uint8": (0, 255),
288 "uint16": (0, 65535),
289 "int16": (-32768, 32767),
290 "uint32": (0, 4294967295),
291 "int32": (-2147483648, 2147483647),
292 "float32": (-3.4028235e38, 3.4028235e38),
293 "float64": (-1.7976931348623157e308, 1.7976931348623157e308),
294 }
296 def format_invalid(str_dtype):
297 str_dtypes = dtype_mapping.keys()
298 return "Invalid dtype: {0}, must be one of: {1}".format(
299 str_dtype, ", ".join(str_dtypes)
300 )
302 if dtype is np.dtype(np.int64):
303 dtype = np.int32
305 str_dtype = str(np.dtype(dtype))
306 if str_dtype not in dtype_mapping.keys():
307 raise ValueError(format_invalid(str_dtype))
308 gdal_dtype = dtype_mapping[str_dtype]
310 if nodata is None:
311 if np.issubdtype(dtype, np.integer):
312 # Default to lowest value in case of integers
313 nodata = dtype_ranges[str_dtype][0]
314 elif np.issubdtype(dtype, np.floating):
315 # Default to NaN in case of floats
316 nodata = np.nan
317 else:
318 lower, upper = dtype_ranges[str_dtype]
319 if nodata < lower or nodata > upper:
320 raise ValueError(f"Nodata value {nodata} is out of bounds for {str_dtype}")
322 return gdal_dtype, nodata
325def gdal_rasterize(
326 path,
327 column,
328 like=None,
329 nodata=None,
330 dtype=None,
331 spatial_reference=None,
332 all_touched=False,
333):
334 """
335 Use GDAL to rasterize a vector file into an xarray.DataArray.
337 Can be significantly more efficient than rasterize. This doesn't load the
338 vector data into a GeoDataFrame and loops over the individual shapely
339 geometries like rasterio.rasterize does, but loops over the features within
340 GDAL instead.
342 Parameters
343 ----------
344 path : str or pathlib.Path
345 path to OGR supported vector file (e.g. a shapefile)
346 column : str
347 column name of column to burn into raster
348 like : xr.DataArray, optional
349 example of raster
350 nodata : int, float; optional
351 dtype : numpy.dtype, optional
352 spatial_reference : dict, optional
353 Optional dict to avoid allocating the like DataArray. Used if template
354 is None. Dict has keys "bounds" and "cellsizes", with:
356 * bounds = (xmin, xmax, ymin, ymax)
357 * cellsizes = (dx, dy)
358 all_touched : bool
359 If True: all pixels touched by lines or polygons will be updated, not
360 just those on the line render path, or whose center point is within the
361 polygon. Default value is False.
363 Returns
364 -------
365 rasterized : xr.DataArray
366 """
367 from osgeo import gdal, ogr
369 if isinstance(path, pathlib.Path):
370 p = path
371 path = str(p)
372 else:
373 p = pathlib.Path(path)
374 if not p.exists():
375 raise FileNotFoundError(f"No such file: {path}")
377 if dtype is None:
378 if like is None:
379 raise ValueError("If ``like`` is not provided, ``dtype`` has to be given")
380 else:
381 dtype = like.dtype
382 gdal_dtype, nodata = _handle_dtype(dtype, nodata)
384 # Exceptions will get raised on anything >= gdal.CE_Failure
385 gdal.UseExceptions()
387 # An attempt at decent errors
388 class GdalErrorHandler:
389 def __init__(self):
390 self.err_level = gdal.CE_None
391 self.err_no = 0
392 self.err_msg = ""
394 def handler(self, err_level, err_no, err_msg):
395 self.err_level = err_level
396 self.err_no = err_no
397 self.err_msg = err_msg
399 error = GdalErrorHandler()
400 handler = error.handler
401 gdal.PushErrorHandler(handler)
403 # Get spatial data from template
404 if like is not None:
405 dx, xmin, _, dy, _, ymax = imod.util.spatial.spatial_reference(like)
406 nrow, ncol = like.shape
407 else:
408 cellsizes = spatial_reference["cellsizes"]
409 bounds = spatial_reference["bounds"]
410 dx, dy = cellsizes
411 if not isinstance(dx, (int, float)) or not isinstance(dy, (int, float)):
412 raise ValueError("Cannot rasterize to a non-equidistant grid")
413 coords = imod.util.spatial._xycoords(bounds, cellsizes)
414 xmin, _, _, ymax = bounds
415 nrow = coords["y"].size
416 ncol = coords["x"].size
417 dims = ("y", "x")
419 # File will be closed when vector is dereferenced, after return
420 vector = ogr.Open(path)
421 vector_layer = vector.GetLayer()
423 memory_driver = gdal.GetDriverByName("MEM")
424 memory_raster = memory_driver.Create("", ncol, nrow, 1, gdal_dtype)
425 memory_raster.SetGeoTransform([xmin, dx, 0, ymax, 0, dy])
426 memory_band = memory_raster.GetRasterBand(1)
427 memory_band.SetNoDataValue(nodata)
428 memory_band.Fill(nodata)
430 options = [f"ATTRIBUTE={column}", f"ALL_TOUCHED={str(all_touched).upper()}"]
431 gdal.RasterizeLayer(memory_raster, [1], vector_layer, None, None, [1], options)
432 if error.err_level >= gdal.CE_Warning:
433 message = error.err_msg
434 if message.startswith(
435 "Failed to fetch spatial reference on layer"
436 ) and message.endswith("assuming matching coordinate systems."):
437 pass
438 else:
439 raise RuntimeError("GDAL error: " + error.err_msg)
441 if like is not None:
442 rasterized = like.copy(data=memory_raster.ReadAsArray())
443 else:
444 rasterized = xr.DataArray(memory_raster.ReadAsArray(), coords, dims)
446 return rasterized
449@numba.njit(cache=True)
450def _cell_count(src, values, frequencies, nodata, *inds_weights):
451 """
452 numba compiled function to count the number of src cells occuring in the dst
453 cells.
455 Parameters
456 ----------
457 src : np.array
458 values : np.array
459 work array to store the unique values
460 frequencies : np.array
461 work array to store the tallied counts
462 nodata : int, float
463 inds_weights : tuple of np.arrays
464 Contains indices of dst, indices of src, and weights.
466 Returns
467 -------
468 tuple of np.arrays
470 * row_indices
471 * col_indices
472 * values
473 * frequencies
474 """
475 jj, blocks_iy, blocks_weights_y, kk, blocks_ix, blocks_weights_x = inds_weights
477 # Use list for dynamic allocation, since we don't know number of rows in
478 # advance.
479 row_indices = []
480 col_indices = []
481 value_list = []
482 count_list = []
484 # j, k are indices of dst array
485 # block_i contains indices of src array
486 # block_w contains weights of src array
487 for countj, j in enumerate(jj):
488 block_iy = blocks_iy[countj]
489 block_wy = blocks_weights_y[countj]
490 for countk, k in enumerate(kk):
491 block_ix = blocks_ix[countk]
492 block_wx = blocks_weights_x[countk]
494 # TODO: use weights in frequency count, and area sum?
495 # Since src is equidistant, normed weights are easy to calculate.
497 # Add the values and weights per cell in multi-dim block
498 value_count = 0
499 for iy, wy in zip(block_iy, block_wy):
500 if iy < 0:
501 break
502 for ix, wx in zip(block_ix, block_wx):
503 if ix < 0:
504 break
506 v = src[iy, ix]
507 if v == nodata: # Skip nodata cells
508 continue
509 # Work on a single destination cell
510 # Count the number of polygon id's occuring in the cell
511 # a new row per id
512 found = False
513 for i in range(value_count):
514 if v == values[i]:
515 frequencies[i] += 1
516 found = True
517 break
518 if not found:
519 values[value_count] = v
520 frequencies[value_count] = 1
521 value_count += 1
522 # Add a new entry
523 row_indices.append(j)
524 col_indices.append(k)
526 # Store for output
527 value_list.extend(values[:value_count])
528 count_list.extend(frequencies[:value_count])
530 # reset storage
531 values[:value_count] = 0
532 frequencies[:value_count] = 0
534 # Cast to numpy arrays
535 row_i_arr = np.array(row_indices)
536 col_i_arr = np.array(col_indices)
537 value_arr = np.array(value_list)
538 count_arr = np.array(count_list)
540 return row_i_arr, col_i_arr, value_arr, count_arr
543def _celltable(
544 path: str | pathlib.Path,
545 column: str,
546 resolution: int,
547 like: xr.DataArray,
548 dtype: DTypeLike,
549 rowstart: int = 0,
550 colstart: int = 0,
551) -> pd.DataFrame:
552 """
553 Returns a table of cell indices (row, column) with feature ID, and feature
554 area within cell. Essentially returns a COO sparse matrix, but with
555 duplicate values per cell, since more than one geometry may be present.
557 The feature area within the cell is approximated by first rasterizing the
558 feature, and then counting the number of occuring cells. This means the
559 accuracy of the area depends on the resolution of the rasterization step.
561 Parameters
562 ----------
563 path : str or pathlib.Path
564 path to OGR supported vector file (e.g. a shapefile)
565 column : str
566 column name of column to burn into raster
567 resolution : float
568 cellsize at which the rasterization, and determination of area within
569 cellsize occurs. Very small values are recommended (e.g. <= 0.5 m).
570 like : xarray.DataArray
571 Example DataArray of where the cells will be located. Used only for the
572 coordinates.
573 dtype: numpy.dtype
574 datatype of data referred to with "column".
576 Returns
577 -------
578 cell_table : pandas.DataFrame
579 """
580 # Avoid side-effects
581 like = like.copy(deep=False)
582 _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
583 dx = resolution
584 dy = -dx
585 nodata = -1
586 spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}
588 rasterized = gdal_rasterize(
589 path, column, nodata=nodata, dtype=dtype, spatial_reference=spatial_reference
590 )
592 # Make sure the coordinates are increasing.
593 dims = ("y", "x")
594 rasterized, _ = common._increasing_dims(rasterized, dims)
595 like, flip_dst = common._increasing_dims(like, dims)
597 dst_coords = [imod.prepare.common._coord(like, dim) for dim in ("y", "x")]
598 src_coords = [imod.prepare.common._coord(rasterized, dim) for dim in ("y", "x")]
599 # Determine weights for every regrid dimension, and alloc_len,
600 # the maximum number of src cells that may end up in a single dst cell
601 inds_weights = []
602 alloc_len = 1
603 for src_x, dst_x in zip(src_coords, dst_coords):
604 size, i_w = imod.prepare.common._weights_1d(src_x, dst_x)
605 for elem in i_w:
606 inds_weights.append(elem)
607 alloc_len *= size
609 # Pre-allocate work arrays
610 values = np.full(alloc_len, 0, dtype=dtype)
611 frequencies = np.full(alloc_len, 0, dtype=dtype)
612 rows, cols, values, counts = _cell_count(
613 rasterized.values, values, frequencies, nodata, *inds_weights
614 )
616 if "y" in flip_dst:
617 rows = (like["y"].size - 1) - rows
618 if "x" in flip_dst:
619 cols = (like["x"].size - 1) - cols
621 df = pd.DataFrame()
622 df["row_index"] = rows + rowstart
623 df["col_index"] = cols + colstart
624 df[column] = values
625 df["area"] = counts * (dx * dx)
627 return df
630def _create_chunks(like, resolution, chunksize):
631 """
632 Cuts data into chunksize by chunksize.
634 Parameters
635 ----------
636 like : xarray.DataArray
637 resolution : float
638 chunksize : int
640 Returns
641 -------
642 chunks : list of xr.DataArray
643 """
645 _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
646 # Compute how many rows and columns are necessary for fine resolution
647 nrow = int((ymax - ymin) / resolution)
648 ncol = int((xmax - xmin) / resolution)
649 # Find out where to cut
650 x_starts = np.arange(0, ncol, chunksize) * resolution + xmin
651 y_starts = np.arange(0, nrow, chunksize) * resolution + ymin
652 # Searchsorted assumes the arrays are pre-sorted.
653 x = np.sort(like.coords["x"].values)
654 y = np.sort(like.coords["y"].values)
655 # Get the matching indices of like.
656 ix_starts = list(np.searchsorted(x, x_starts))
657 iy_starts = list(np.searchsorted(y, y_starts))
658 # Append None. In python's slice object, None denotes "slice including
659 # first/last element"
660 ix_ends = ix_starts[1:] + [None]
661 iy_ends = iy_starts[1:] + [None]
662 # Use xarray to grab the chunks. The chunks have x and y coordinates.
663 # These will inform GDAL on which part to rasterize.
664 # GDAL will only rasterize within the boundaries of the chunks, so there's
665 # no need to clip the shapefile beforehand.
666 chunks = []
667 rowstarts = []
668 colstarts = []
669 for j0, j1 in zip(iy_starts, iy_ends):
670 for i0, i1 in zip(ix_starts, ix_ends):
671 chunks.append(like.isel(y=slice(j0, j1), x=slice(i0, i1)))
672 rowstarts.append(j0)
673 colstarts.append(i0)
674 return chunks, rowstarts, colstarts
677def celltable(
678 path: str | pathlib.Path,
679 column: str,
680 resolution: int,
681 like: xr.DataArray,
682 dtype: DTypeLike = np.int32,
683 chunksize: int = 10_000,
684) -> pd.DataFrame:
685 r"""
686 Process area of features by rasterizing in a chunkwise manner to limit
687 memory usage.
689 Returns a table of cell indices (row, column) with for example feature ID,
690 and feature area within cell. Essentially returns a COO sparse matrix, but
691 with duplicate values per cell, since more than one geometry may be present.
693 The feature area within the cell is approximated by first rasterizing the
694 feature, and then counting the number of occuring cells. This means the
695 accuracy of the area depends on the cellsize of the rasterization step.
697 A celltable is returned, as a ``pandas.DataFrame``. It has the following
698 columns:
700 1. ``"row_index"``
701 2. ``"col_index"``
702 3. the value of the ``column`` argument
703 4. ``"area"``
705 ``"row_index"`` and ``"col_index"`` are the indices of the like array in
706 which the polygon is located. The ``column`` value holds the rasterized
707 value of the specified column. ``"area"`` contains the area of the
708 polygon within the cell.
710 The most convenient way of using this celltable is by specifying a feature
711 ID as ``column``. After creating a celltable, ``pandas.DataFrame.merge()``
712 can be used to join additional data on this ID. Refer to the examples.
714 Parameters
715 ----------
716 path : str or pathlib.Path
717 path to OGR supported vector file (e.g. a shapefile)
718 column : str
719 column name of column to burn into raster
720 resolution : float
721 cellsize at which the rasterization, and determination of area within
722 cellsize occurs. Very small values are recommended (e.g. <= 0.5 m).
723 like : xarray.DataArray
724 Example DataArray of where the cells will be located. Used only for the
725 coordinates.
726 dtype: DtypeLike, optional
727 datatype of data referred to with "column", defaults to 32-bit integer.
728 chunksize : int, optional
729 The size of the chunksize. Used for both x and y dimension.
731 Returns
732 -------
733 celltable : pandas.DataFrame
735 Examples
736 --------
737 Assume we have a shapefile called ``waterways.shp`` and information on the
738 model discretization is described by a ``like`` DataArray. The feature ID is
739 provided by a column in the shapefile called "ID-code". Additionally, this
740 shapefile also specifies bed hydraulic resistance (c0). For this specific
741 discretization, we wish to calculate a conductance (area divided by
742 hydraulic resistance). To do so, we:
744 1. create a ``celltable``
745 2. join the additional attributes (such as c0)
746 3. compute the conductance per feature
747 4. sum conductances per cell
749 Import the required packages.
751 >>> import imod
752 >>> import geopandas as gpd
754 Generate the celltable.
756 >>> celltable = imod.prepare.celltable(
757 path="waterways.shp",
758 column="ID-code",
759 resolution=0.5,
760 like=like,
761 )
763 Load the shapefile with geopandas into a ``GeoDataFrame``.
765 >>> gdf = gpd.read_file("waterways.shp)
767 Select the relevant columns into a ``pandas.DataFrame`` and merge with the
768 celltable.
770 >>> df = gdf[["ID-code", "c0"]]
771 >>> joined = celltable.merge(gdf, on="ID-code")
773 We compute the conductance, and sum it per cell using ``pandas`` methods:
775 >>> joined["conductance"] = joined["area"] / joined["c0"]
776 >>> summed_conductance = joined.groupby(["row_index", "col_index"], as_index=False)[
777 "conductance"
778 ].sum()
780 Finally, turn the result into a DataArray so it can be used as model input:
782 >>> conductance = imod.prepare.rasterize_celltable(
783 table=summed_conductance,
784 column="conductance",
785 like=like,
786 )
788 """
789 dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
790 if not imod.util.spatial.is_divisor(dx, resolution):
791 raise ValueError("resolution is not an (integer) divisor of dx")
792 if not imod.util.spatial.is_divisor(dy, resolution):
793 raise ValueError("resolution is not an (integer) divisor of dy")
795 like_chunks, rowstarts, colstarts = _create_chunks(like, resolution, chunksize)
796 collection = [
797 dask.delayed(_celltable)(
798 path, column, resolution, chunk, dtype, rowstart, colstart
799 )
800 for chunk, rowstart, colstart in zip(like_chunks, rowstarts, colstarts)
801 ]
802 result = dask.compute(collection)[0]
803 return pd.concat(result)
806@numba.njit
807def _burn_cells(raster, rows, cols, values):
808 """
809 Burn values of sparse COO-matrix into raster.
810 rows, cols, and values form a sparse matrix in coordinate format (COO)
811 (also known as "ijv" or "triplet" format).
813 Parameters
814 ----------
815 raster : np.array
816 raster to burn values into.
817 rows : np.array of integers
818 row indices (i)
819 cols : np.array of integers
820 column indices (j)
821 values : np.array of floats
822 values to burn (v)
823 """
824 for i, j, v in zip(rows, cols, values):
825 raster[i, j] = v
826 return raster
829def rasterize_celltable(table, column, like):
830 """
831 Rasterizes a table, such as produced by ``imod.prepare.spatial.celltable``.
832 Before rasterization, multiple values should be grouped and aggregated per
833 cell. Values will be overwritten otherwise.
835 Parameters
836 ----------
837 like : xr.DataArray
838 table : pandas.DataFrame
839 with columns: "row_index", "col_index"
840 column : str, int, float
841 column name of values to rasterize
843 Returns
844 -------
845 rasterized : xr.DataArray
846 """
847 rows = table["row_index"].values
848 cols = table["col_index"].values
849 area = table[column].values
850 dst = like.copy()
851 dst.values = _burn_cells(dst.values, rows, cols, area)
852 return dst
855def _zonal_aggregate_raster(
856 path: Union[str, pathlib.Path],
857 column: str,
858 resolution: float,
859 raster: xr.DataArray,
860 method: Union[str, Callable],
861) -> pd.DataFrame:
862 """
863 * Rasterize the polygon at given `resolution`
864 * Sample the raster at the rasterized polygon cells
865 * Store both in a dataframe, groupby and aggregate according to `method`
866 """
867 data_dx, xmin, xmax, data_dy, ymin, ymax = imod.util.spatial.spatial_reference(
868 raster
869 )
870 dx = resolution
871 dy = -dx
872 nodata = -1
873 spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}
875 rasterized = gdal_rasterize(
876 path,
877 column,
878 nodata=nodata,
879 dtype=np.int32,
880 spatial_reference=spatial_reference,
881 all_touched=True,
882 )
883 ravelled = rasterized.values.ravel()
885 y = np.arange(ymax + 0.5 * dy, ymin, dy)
886 x = np.arange(xmin + 0.5 * dx, xmax, dx)
887 is_data = ravelled != nodata
888 feature_id = ravelled[is_data]
889 yy, xx = [a.ravel()[is_data] for a in np.meshgrid(y, x, indexing="ij")]
891 dims = ("y", "x")
892 rasterized, _ = common._increasing_dims(rasterized, dims)
893 raster, _ = common._increasing_dims(raster, dims)
894 y_ind = ((yy - ymin) / abs(data_dy)).astype(int)
895 x_ind = ((xx - xmin) / abs(data_dx)).astype(int)
896 sample = raster.values[y_ind, x_ind]
898 df = pd.DataFrame({column: feature_id, "data": sample})
899 # Remove entries where the raster has nodata.
900 # This may result in areas significantly smaller than the polygon geometry,
901 # but should come in handy for weighting later?
902 df = df[df["data"].notnull()]
903 result = df.groupby(column, as_index=False).agg(["count", method])
904 # Reset index to set "column" as column again, make sure index is dropped
905 result = result.reset_index(drop=True)
906 # Compute the area from the counted number of cells
907 result["data", "count"] *= resolution * resolution
908 name = raster.name if raster.name else "aggregated"
909 result.columns = [column, "area", name]
910 return result
913def _zonal_aggregate_polygons(
914 path_a: Union[str, pathlib.Path],
915 path_b: Union[str, pathlib.Path],
916 column_a: str,
917 column_b: str,
918 resolution: float,
919 like: xr.DataArray,
920 method: Union[str, Callable],
921) -> pd.DataFrame:
922 """
923 * Rasterize a, rasterize b for the same domain
924 * Store both in a dataframe, groupby and aggregate according to `method`
925 """
926 _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
927 dx = resolution
928 dy = -dx
929 nodata = -1
930 spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}
932 rasterized_a = gdal_rasterize(
933 path_a,
934 column_a,
935 nodata=nodata,
936 dtype=np.int32,
937 spatial_reference=spatial_reference,
938 all_touched=True,
939 )
940 rasterized_b = gdal_rasterize(
941 path_b,
942 column_b,
943 nodata=np.nan,
944 dtype=np.float64,
945 spatial_reference=spatial_reference,
946 all_touched=True,
947 )
948 is_data = ((rasterized_a != nodata) & (rasterized_b.notnull())).values
949 a = rasterized_a.values[is_data].ravel()
950 b = rasterized_b.values[is_data].ravel()
951 df = pd.DataFrame({column_a: a, column_b: b})
952 # Remove entries where the raster has nodata.
953 # This may result in areas significantly smaller than the polygon geometry,
954 # but should come in handy for weighting later?
955 result = df.groupby(column_a, as_index=False).agg(["count", method])
956 # Reset index to set "column_a" as column again, make sure index is dropped
957 result = result.reset_index(drop=True)
958 # Compute the area from the counted number of cells
959 result[column_b, "count"] *= resolution * resolution
960 result.columns = [column_a, "area", column_b]
961 return result
964def zonal_aggregate_raster(
965 path: Union[pathlib.Path, str],
966 column: str,
967 raster: xr.DataArray,
968 resolution: float,
969 method: Union[str, Callable],
970 chunksize: int = 10_000,
971) -> pd.DataFrame:
972 """
973 Compute a zonal aggregate of raster data for polygon geometries, e.g. a mean,
974 mode, or percentile.
976 Parameters
977 ----------
978 path : str or pathlib.Path
979 path to OGR supported vector file (e.g. a shapefile). Defines zones
980 of aggregation.
981 column : str
982 column name of path, integer IDs defining zones.
983 raster : xarray.DataArray
984 Raster data from which to sample and aggregate data
985 resolution : float
986 cellsize at which the rasterization of polygons and sampling occurs
987 method : Union[str, Callable]
988 Aggregation method.
989 Anything that is acceptable by a pandas groupby aggregate:
990 https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
991 chunksize : int, optional
992 The size of the chunksize. Used for both x and y dimension.
994 Returns
995 -------
996 zonal_aggregates : pandas.DataFrame
998 Examples
999 --------
1001 To compute the mean surface level at polygons of water bodies:
1003 >>> import imod
1004 >>> surface_level = imod.rasterio.open("surface_level.tif")
1005 >>> df = imod.prepare.spatial.zonal_aggregate_raster(
1006 >>> path="water-bodies.shp",
1007 >>> column="id",
1008 >>> raster=surface_level,
1009 >>> resolution=1.0,
1010 >>> method="mean",
1011 >>> )
1013 For some functions, like the mode, a function should be passed instead:
1015 >>> import pandas as pd
1016 >>> df = imod.prepare.spatial.zonal_aggregate_raster(
1017 >>> path="water-bodies.shp",
1018 >>> column="id",
1019 >>> raster=surface_level,
1020 >>> resolution=1.0,
1021 >>> method=pd.Series.mode,
1022 >>> )
1023 """
1024 dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(raster)
1025 if not imod.util.spatial.is_divisor(dx, resolution):
1026 raise ValueError("resolution is not an (integer) divisor of dx")
1027 if not imod.util.spatial.is_divisor(dy, resolution):
1028 raise ValueError("resolution is not an (integer) divisor of dy")
1030 without_chunks = (raster.chunks is None) or (
1031 all(length == 1 for length in map(len, raster.chunks))
1032 )
1033 if without_chunks:
1034 raster = raster.compute()
1036 raster_chunks, _, _ = _create_chunks(raster, resolution, chunksize)
1037 collection = [
1038 dask.delayed(_zonal_aggregate_raster)(path, column, resolution, chunk, method)
1039 for chunk in raster_chunks
1040 ]
1041 result = dask.compute(collection)[0]
1042 return pd.concat(result)
1045def zonal_aggregate_polygons(
1046 path_a: Union[pathlib.Path, str],
1047 path_b: Union[pathlib.Path, str],
1048 column_a: str,
1049 column_b: str,
1050 like: xr.DataArray,
1051 resolution: float,
1052 method: Union[str, Callable],
1053 chunksize: int = 10_000,
1054) -> pd.DataFrame:
1055 """
1056 Compute a zonal aggregate of polygon data for (other) polygon geometries,
1057 e.g. a mean, mode, or percentile.
1059 Parameters
1060 ----------
1061 path_a : str or pathlib.Path
1062 path to OGR supported vector file (e.g. a shapefile)
1063 path_b : str or pathlib.Path
1064 path to OGR supported vector file (e.g. a shapefile)
1065 column_a : str
1066 column name of path_a. Defines zones of aggregation.
1067 column_b : str
1068 column name of path_b. Data to aggregate.
1069 like : xarray.DataArray with dims ("y", "x")
1070 Example DataArray of where the cells will be located. Used only for its
1071 x and y coordinates.
1072 resolution : float
1073 cellsize at which the rasterization, sampling, and area measurement occur.
1074 method: Union[str, Callable]
1075 Aggregation method.
1076 Anything that is acceptable by a pandas groupby aggregate:
1077 https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
1078 chunksize : int, optional
1079 The size of the chunksize. Used for both x and y dimension.
1081 Returns
1082 -------
1083 zonal_aggregates: pandas.DataFrame
1084 """
1085 dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
1086 if not imod.util.spatial.is_divisor(dx, resolution):
1087 raise ValueError("resolution is not an (integer) divisor of dx")
1088 if not imod.util.spatial.is_divisor(dy, resolution):
1089 raise ValueError("resolution is not an (integer) divisor of dy")
1091 like_chunks, _, _ = _create_chunks(like, resolution, chunksize)
1092 collection = [
1093 dask.delayed(_zonal_aggregate_polygons)(
1094 path_a, path_b, column_a, column_b, resolution, chunk, method
1095 )
1096 for chunk in like_chunks
1097 ]
1098 result = dask.compute(collection)[0]
1099 return pd.concat(result)