# imod/prepare/spatial.py (coverage: 92%, 360 statements)
import pathlib
from typing import TYPE_CHECKING, Callable, Union

import dask
import numba
import numpy as np
import pandas as pd
import scipy.ndimage
import xarray as xr

import imod
from imod.prepare import common, pcg
from imod.util.imports import MissingOptionalModule
from imod.util.spatial import _polygonize

# Since rasterio is a big dependency that is sometimes hard to install
# and not always required, we made it an optional dependency.
try:
    import rasterio
    import rasterio.features
    import rasterio.warp
except ImportError:
    rasterio = MissingOptionalModule("rasterio")

if TYPE_CHECKING:
    import geopandas as gpd


def round_extent(extent, cellsize):
    """Expands the extent until all sides lie on a coordinate
    divisible by cellsize."""
    xmin, ymin, xmax, ymax = extent
    xmin = np.floor(xmin / cellsize) * cellsize
    ymin = np.floor(ymin / cellsize) * cellsize
    xmax = np.ceil(xmax / cellsize) * cellsize
    ymax = np.ceil(ymax / cellsize) * cellsize
    return xmin, ymin, xmax, ymax


def round_z(z_extent, dz):
    """Expands the z extent until both bounds lie on a coordinate
    divisible by dz."""
    zmin, zmax = z_extent
    zmin = np.floor(zmin / dz) * dz
    zmax = np.ceil(zmax / dz) * dz
    return zmin, zmax
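
# A minimal doctest-style sketch of round_extent (hypothetical numbers, not
# taken from the source): the extent is snapped outward to multiples of the
# cellsize, so the rounded extent always contains the original one.
#
#   >>> round_extent((110.0, 420.0, 380.0, 720.0), 25.0)
#   (100.0, 400.0, 400.0, 725.0)
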

def _fill_np(data, invalid):
    """Basic nearest neighbour interpolation"""
    # see: https://stackoverflow.com/questions/5551286/filling-gaps-in-a-numpy-array
    ind = scipy.ndimage.distance_transform_edt(
        invalid, return_distances=False, return_indices=True
    )
    return data[tuple(ind)]


def fill(da, invalid=None, by=None):
    """
    Replace the value of invalid ``da`` cells (indicated by ``invalid``)
    using basic nearest neighbour interpolation.

    Parameters
    ----------
    da: xr.DataArray with gaps
        array containing missing values
    invalid: xr.DataArray, optional
        a binary array of the same shape as ``da``.
        Data values are replaced where ``invalid`` is True.
        If None (default), uses: ``invalid = np.isnan(da)``
    by: str, optional
        dimension over which the array will be filled, one by one.
        See the examples.

    Returns
    -------
    xarray.DataArray
        with the same coordinates as the input.

    Examples
    --------

    A common use case is filling holes in a DataArray, filling it with the
    value of its nearest (valid) neighbor:

    >>> filled = imod.prepare.fill(da)

    In case of a tie (e.g. neighbors in x and y are both one cell removed), the
    neighbor in the last dimension is chosen (for rasters, that's generally x).

    A typical use case is filling a 3D array (layer, y, x), but only in the
    horizontal dimensions. The ``by`` keyword can be used to do this:

    >>> filled = imod.prepare.fill(da, by="layer")

    In this case, the array is filled one layer at a time.
    """
    out = xr.full_like(da, np.nan)
    if invalid is None:
        invalid = np.isnan(da)
    if by:
        for coordvalue in da[by]:
            d = {by: coordvalue}
            out.sel(d)[...] = _fill_np(da.sel(d).values, invalid.sel(d).values)
    else:
        out.values = _fill_np(da.values, invalid.values)

    return out

def laplace_interpolate(
    source, ibound=None, close=0.01, mxiter=5, iter1=50, relax=0.98
):
    """
    Fills gaps in ``source`` by interpolating from existing values using
    Laplace interpolation.

    Parameters
    ----------
    source : xr.DataArray of floats with dims (y, x)
        Data values to interpolate.
    ibound : xr.DataArray of bool with dims (y, x)
        Precomputed array which marks where to interpolate.
    close : float
        Closure criterion of the iterative solver. Should be one to two orders
        of magnitude smaller than the desired accuracy.
    mxiter : int
        Outer iterations of the iterative solver.
    iter1 : int
        Inner iterations of the iterative solver. Should not exceed 50.
    relax : float
        Iterative solver relaxation parameter. Should be between 0 and 1.

    Returns
    -------
    interpolated : xr.DataArray with dims (y, x)
        source, with interpolated values where ibound equals 1
    """
    solver = pcg.PreconditionedConjugateGradientSolver(
        close, close * 1.0e6, mxiter, iter1, relax
    )

    if not source.dims == ("y", "x"):
        raise ValueError('source dims must be ("y", "x")')

    # Expand dims to make the arrays 3D, as the solver expects.
    source3d = source.expand_dims("layer")
    if ibound is None:
        iboundv = xr.full_like(source3d, 1, dtype=np.int32).values
    else:
        if not ibound.dims == ("y", "x"):
            raise ValueError('ibound dims must be ("y", "x")')
        if not ibound.shape == source.shape:
            raise ValueError("ibound and source must have the same shape")
        iboundv = ibound.expand_dims("layer").astype(np.int32).values

    # Mark cells with data as fixed (-1); set the starting interpolated
    # estimate to 0.0.
    has_data = source3d.notnull().values
    iboundv[has_data] = -1
    hnew = source3d.fillna(0.0).values.astype(np.float64)

    shape = iboundv.shape
    nlay, nrow, ncol = shape
    nodes = nlay * nrow * ncol
    # Allocate work arrays.
    # The conductances are not really used now, but might come in handy to
    # implement weights.
    cc = np.ones(shape)
    cr = np.ones(shape)
    cv = np.ones(shape)
    rhs = np.zeros(shape)
    hcof = np.zeros(shape)
    # Solver work arrays
    res = np.zeros(nodes)
    cd = np.zeros(nodes)
    v = np.zeros(nodes)
    ss = np.zeros(nodes)
    p = np.zeros(nodes)

    # Picard iteration
    converged = False
    outer_iteration = 0
    while not converged and outer_iteration < mxiter:
        # Mutates hnew
        converged = solver.solve(
            hnew=hnew,
            cc=cc,
            cr=cr,
            cv=cv,
            ibound=iboundv,
            rhs=rhs,
            hcof=hcof,
            res=res,
            cd=cd,
            v=v,
            ss=ss,
            p=p,
        )
        outer_iteration += 1

    if not converged:
        raise RuntimeError("Failed to converge")

    hnew[iboundv == 0] = np.nan
    return source.copy(data=hnew[0])
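
# A hedged usage sketch for laplace_interpolate (reusing the hypothetical
# "surface_level.tif" from the zonal_aggregate_raster examples below): fill
# NaN gaps in a 2D raster by solving Laplace's equation, with the existing
# values acting as fixed boundary conditions.
#
#   >>> surface_level = imod.rasterio.open("surface_level.tif")
#   >>> filled = imod.prepare.laplace_interpolate(surface_level, mxiter=10)
#
# Unlike imod.prepare.fill, which copies the nearest valid value, this yields
# smooth transitions between the known values.
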

def rasterize(geodataframe, like, column=None, fill=np.nan, **kwargs):
    """
    Rasterize a geopandas GeoDataFrame onto the given
    xarray coordinates.

    Parameters
    ----------
    geodataframe : geopandas.GeoDataFrame
    like : xarray.DataArray
        Example DataArray. The rasterized result will match the shape and
        coordinates of this DataArray.
    column : str, int, float, optional
        column name of geodataframe to burn into raster
    fill : float, int
        Fill value for nodata areas. Optional, default value is np.nan.
    kwargs : additional keyword arguments for rasterio.features.rasterize.
        See: https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.rasterize

    Returns
    -------
    rasterized : xarray.DataArray
        Vector data rasterized. Matches shape and coordinates of ``like``.
    """
    if column is not None:
        shapes = [
            (geom, value)
            for geom, value in zip(geodataframe.geometry, geodataframe[column])
        ]
    else:
        shapes = [geom for geom in geodataframe.geometry]

    # shapes must be an iterable
    try:
        iter(shapes)
    except TypeError:
        shapes = (shapes,)

    raster = rasterio.features.rasterize(
        shapes,
        out_shape=like.shape,
        fill=fill,
        transform=imod.util.spatial.transform(like),
        **kwargs,
    )

    return xr.DataArray(raster, like.coords, like.dims)
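
# A minimal sketch of rasterize (reusing the hypothetical "waterways.shp" and
# "ID-code" from the celltable example below): burn a column of a shapefile
# onto the grid of an existing DataArray.
#
#   >>> import geopandas as gpd
#   >>> gdf = gpd.read_file("waterways.shp")
#   >>> ids = imod.prepare.rasterize(gdf, like=like, column="ID-code", fill=-1)
#
# Cells not covered by any geometry receive the ``fill`` value.
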

def polygonize(da: xr.DataArray) -> "gpd.GeoDataFrame":
    """
    Polygonize a 2D-DataArray into a GeoDataFrame of polygons.

    Parameters
    ----------
    da : xr.DataArray

    Returns
    -------
    polygonized : geopandas.GeoDataFrame
    """
    return _polygonize(da)
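
# A one-line sketch (assuming ``ids`` is the 2D DataArray rasterized above):
#
#   >>> polygons = imod.prepare.polygonize(ids)
#
# This is roughly the inverse of rasterize: contiguous cells with equal
# values are merged into polygons.
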

def _handle_dtype(dtype, nodata):
    # Largely taken from rasterio.dtypes
    # https://github.com/mapbox/rasterio/blob/master/rasterio/dtypes.py
    # Not supported:
    # GDT_CInt16 = 8, GDT_CInt32 = 9, GDT_CFloat32 = 10, GDT_CFloat64 = 11
    dtype_mapping = {
        "uint8": 1,  # GDT_Byte
        "uint16": 2,  # GDT_UInt16
        "int16": 3,  # GDT_Int16
        "uint32": 4,  # GDT_UInt32
        "int32": 5,  # GDT_Int32
        "float32": 6,  # GDT_Float32
        "float64": 7,  # GDT_Float64
    }
    dtype_ranges = {
        "uint8": (0, 255),
        "uint16": (0, 65535),
        "int16": (-32768, 32767),
        "uint32": (0, 4294967295),
        "int32": (-2147483648, 2147483647),
        "float32": (-3.4028235e38, 3.4028235e38),
        "float64": (-1.7976931348623157e308, 1.7976931348623157e308),
    }

    def format_invalid(str_dtype):
        str_dtypes = dtype_mapping.keys()
        return "Invalid dtype: {0}, must be one of: {1}".format(
            str_dtype, ", ".join(str_dtypes)
        )

    # GDAL has no 64-bit integer type; fall back to int32.
    if dtype is np.dtype(np.int64):
        dtype = np.int32

    str_dtype = str(np.dtype(dtype))
    if str_dtype not in dtype_mapping.keys():
        raise ValueError(format_invalid(str_dtype))
    gdal_dtype = dtype_mapping[str_dtype]

    if nodata is None:
        if np.issubdtype(dtype, np.integer):
            # Default to lowest value in case of integers
            nodata = dtype_ranges[str_dtype][0]
        elif np.issubdtype(dtype, np.floating):
            # Default to NaN in case of floats
            nodata = np.nan
    else:
        lower, upper = dtype_ranges[str_dtype]
        if nodata < lower or nodata > upper:
            raise ValueError(f"Nodata value {nodata} is out of bounds for {str_dtype}")

    return gdal_dtype, nodata
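
# An illustration of the nodata defaulting logic above; the expected values
# follow directly from dtype_mapping and dtype_ranges.
#
#   >>> _handle_dtype(np.float64, None)
#   (7, nan)
#   >>> _handle_dtype(np.int32, None)
#   (5, -2147483648)
#   >>> _handle_dtype(np.dtype("int64"), None)  # silently narrowed to int32
#   (5, -2147483648)
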

def gdal_rasterize(
    path,
    column,
    like=None,
    nodata=None,
    dtype=None,
    spatial_reference=None,
    all_touched=False,
):
    """
    Use GDAL to rasterize a vector file into an xarray.DataArray.

    Can be significantly more efficient than rasterize. This doesn't load the
    vector data into a GeoDataFrame and loop over the individual shapely
    geometries like rasterio.rasterize does, but loops over the features
    within GDAL instead.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    like : xr.DataArray, optional
        example of raster
    nodata : int, float; optional
    dtype : numpy.dtype, optional
    spatial_reference : dict, optional
        Optional dict to avoid allocating the ``like`` DataArray. Used if
        ``like`` is None. The dict has keys "bounds" and "cellsizes", with:

        * bounds = (xmin, xmax, ymin, ymax)
        * cellsizes = (dx, dy)
    all_touched : bool
        If True: all pixels touched by lines or polygons will be updated, not
        just those on the line render path, or whose center point is within the
        polygon. Default value is False.

    Returns
    -------
    rasterized : xr.DataArray
    """
    from osgeo import gdal, ogr

    if isinstance(path, pathlib.Path):
        p = path
        path = str(p)
    else:
        p = pathlib.Path(path)
    if not p.exists():
        raise FileNotFoundError(f"No such file: {path}")

    if dtype is None:
        if like is None:
            raise ValueError("If ``like`` is not provided, ``dtype`` has to be given")
        else:
            dtype = like.dtype
    gdal_dtype, nodata = _handle_dtype(dtype, nodata)

    # Exceptions will get raised on anything >= gdal.CE_Failure
    gdal.UseExceptions()

    # An attempt at decent errors
    class GdalErrorHandler:
        def __init__(self):
            self.err_level = gdal.CE_None
            self.err_no = 0
            self.err_msg = ""

        def handler(self, err_level, err_no, err_msg):
            self.err_level = err_level
            self.err_no = err_no
            self.err_msg = err_msg

    error = GdalErrorHandler()
    handler = error.handler
    gdal.PushErrorHandler(handler)

    # Get spatial data from template
    if like is not None:
        dx, xmin, _, dy, _, ymax = imod.util.spatial.spatial_reference(like)
        nrow, ncol = like.shape
    else:
        cellsizes = spatial_reference["cellsizes"]
        bounds = spatial_reference["bounds"]
        dx, dy = cellsizes
        if not isinstance(dx, (int, float)) or not isinstance(dy, (int, float)):
            raise ValueError("Cannot rasterize to a non-equidistant grid")
        coords = imod.util.spatial._xycoords(bounds, cellsizes)
        xmin, _, _, ymax = bounds
        nrow = coords["y"].size
        ncol = coords["x"].size
        dims = ("y", "x")

    # File will be closed when vector is dereferenced, after return
    vector = ogr.Open(path)
    vector_layer = vector.GetLayer()

    memory_driver = gdal.GetDriverByName("MEM")
    memory_raster = memory_driver.Create("", ncol, nrow, 1, gdal_dtype)
    memory_raster.SetGeoTransform([xmin, dx, 0, ymax, 0, dy])
    memory_band = memory_raster.GetRasterBand(1)
    memory_band.SetNoDataValue(nodata)
    memory_band.Fill(nodata)

    options = [f"ATTRIBUTE={column}", f"ALL_TOUCHED={str(all_touched).upper()}"]
    gdal.RasterizeLayer(memory_raster, [1], vector_layer, None, None, [1], options)
    if error.err_level >= gdal.CE_Warning:
        message = error.err_msg
        if message.startswith(
            "Failed to fetch spatial reference on layer"
        ) and message.endswith("assuming matching coordinate systems."):
            # Benign: both inputs are assumed to share a coordinate system.
            pass
        else:
            raise RuntimeError("GDAL error: " + error.err_msg)

    if like is not None:
        rasterized = like.copy(data=memory_raster.ReadAsArray())
    else:
        rasterized = xr.DataArray(memory_raster.ReadAsArray(), coords, dims)

    return rasterized
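
# A hedged sketch of rasterizing without a ``like`` DataArray, using the
# spatial_reference dict described in the docstring (file and column names
# are hypothetical; this mirrors the call made in _celltable below):
#
#   >>> rasterized = imod.prepare.gdal_rasterize(
#   ...     path="waterways.shp",
#   ...     column="ID-code",
#   ...     nodata=-1,
#   ...     dtype=np.int32,
#   ...     spatial_reference={
#   ...         "bounds": (0.0, 100.0, 0.0, 100.0),
#   ...         "cellsizes": (1.0, -1.0),
#   ...     },
#   ... )
#
# Note that dy is negative, following the raster convention of decreasing y.
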

@numba.njit(cache=True)
def _cell_count(src, values, frequencies, nodata, *inds_weights):
    """
    numba compiled function to count the number of src cells occurring in the
    dst cells.

    Parameters
    ----------
    src : np.array
    values : np.array
        work array to store the unique values
    frequencies : np.array
        work array to store the tallied counts
    nodata : int, float
    inds_weights : tuple of np.arrays
        Contains indices of dst, indices of src, and weights.

    Returns
    -------
    tuple of np.arrays

        * row_indices
        * col_indices
        * values
        * frequencies
    """
    jj, blocks_iy, blocks_weights_y, kk, blocks_ix, blocks_weights_x = inds_weights

    # Use lists for dynamic allocation, since we don't know the number of
    # rows in advance.
    row_indices = []
    col_indices = []
    value_list = []
    count_list = []

    # j, k are indices of the dst array
    # block_i contains indices of the src array
    # block_w contains weights of the src array
    for countj, j in enumerate(jj):
        block_iy = blocks_iy[countj]
        block_wy = blocks_weights_y[countj]
        for countk, k in enumerate(kk):
            block_ix = blocks_ix[countk]
            block_wx = blocks_weights_x[countk]

            # TODO: use weights in frequency count, and area sum?
            # Since src is equidistant, normed weights are easy to calculate.

            # Add the values and weights per cell in the multi-dim block
            value_count = 0
            for iy, wy in zip(block_iy, block_wy):
                if iy < 0:
                    break
                for ix, wx in zip(block_ix, block_wx):
                    if ix < 0:
                        break

                    v = src[iy, ix]
                    if v == nodata:  # Skip nodata cells
                        continue
                    # Work on a single destination cell
                    # Count the number of polygon IDs occurring in the cell,
                    # a new row per ID
                    found = False
                    for i in range(value_count):
                        if v == values[i]:
                            frequencies[i] += 1
                            found = True
                            break
                    if not found:
                        values[value_count] = v
                        frequencies[value_count] = 1
                        value_count += 1
                        # Add a new entry
                        row_indices.append(j)
                        col_indices.append(k)

            # Store for output
            value_list.extend(values[:value_count])
            count_list.extend(frequencies[:value_count])

            # reset storage
            values[:value_count] = 0
            frequencies[:value_count] = 0

    # Cast to numpy arrays
    row_i_arr = np.array(row_indices)
    col_i_arr = np.array(col_indices)
    value_arr = np.array(value_list)
    count_arr = np.array(count_list)

    return row_i_arr, col_i_arr, value_arr, count_arr

def _celltable(path, column, resolution, like, rowstart=0, colstart=0):
    """
    Returns a table of cell indices (row, column) with feature ID, and feature
    area within cell. Essentially returns a COO sparse matrix, but with
    duplicate values per cell, since more than one geometry may be present.

    The feature area within the cell is approximated by first rasterizing the
    feature, and then counting the number of occurring cells. This means the
    accuracy of the area depends on the resolution of the rasterization step.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    resolution : float
        cellsize at which the rasterization, and determination of the area
        within the cell, occurs. Very small values are recommended (e.g.
        <= 0.5 m).
    like : xarray.DataArray
        Example DataArray of where the cells will be located. Used only for
        the coordinates.

    Returns
    -------
    cell_table : pandas.DataFrame
    """
    # Avoid side-effects
    like = like.copy(deep=False)
    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized = gdal_rasterize(
        path, column, nodata=nodata, dtype=np.int32, spatial_reference=spatial_reference
    )

    # Make sure the coordinates are increasing.
    dims = ("y", "x")
    rasterized, _ = common._increasing_dims(rasterized, dims)
    like, flip_dst = common._increasing_dims(like, dims)

    dst_coords = [imod.prepare.common._coord(like, dim) for dim in ("y", "x")]
    src_coords = [imod.prepare.common._coord(rasterized, dim) for dim in ("y", "x")]
    # Determine weights for every regrid dimension, and alloc_len,
    # the maximum number of src cells that may end up in a single dst cell
    inds_weights = []
    alloc_len = 1
    for src_x, dst_x in zip(src_coords, dst_coords):
        size, i_w = imod.prepare.common._weights_1d(src_x, dst_x)
        for elem in i_w:
            inds_weights.append(elem)
        alloc_len *= size

    # Pre-allocate work arrays
    values = np.full(alloc_len, 0)
    frequencies = np.full(alloc_len, 0)
    rows, cols, values, counts = _cell_count(
        rasterized.values, values, frequencies, nodata, *inds_weights
    )

    if "y" in flip_dst:
        rows = (like["y"].size - 1) - rows
    if "x" in flip_dst:
        cols = (like["x"].size - 1) - cols

    df = pd.DataFrame()
    df["row_index"] = rows + rowstart
    df["col_index"] = cols + colstart
    df[column] = values
    df["area"] = counts * (dx * dx)

    return df

def _create_chunks(like, resolution, chunksize):
    """
    Cuts the data into chunks of (at most) chunksize by chunksize cells.

    Parameters
    ----------
    like : xarray.DataArray
    resolution : float
    chunksize : int

    Returns
    -------
    chunks : list of xr.DataArray
    rowstarts : list of int
    colstarts : list of int
    """
    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    # Compute how many rows and columns are necessary at the fine resolution
    nrow = int((ymax - ymin) / resolution)
    ncol = int((xmax - xmin) / resolution)
    # Find out where to cut
    x_starts = np.arange(0, ncol, chunksize) * resolution + xmin
    y_starts = np.arange(0, nrow, chunksize) * resolution + ymin
    # Searchsorted assumes the arrays are pre-sorted.
    x = np.sort(like.coords["x"].values)
    y = np.sort(like.coords["y"].values)
    # Get the matching indices of like.
    ix_starts = list(np.searchsorted(x, x_starts))
    iy_starts = list(np.searchsorted(y, y_starts))
    # Append None. In Python's slice object, None denotes "slice including
    # the first/last element".
    ix_ends = ix_starts[1:] + [None]
    iy_ends = iy_starts[1:] + [None]
    # Use xarray to grab the chunks. The chunks have x and y coordinates.
    # These will inform GDAL on which part to rasterize.
    # GDAL will only rasterize within the boundaries of the chunks, so there's
    # no need to clip the shapefile beforehand.
    chunks = []
    rowstarts = []
    colstarts = []
    for j0, j1 in zip(iy_starts, iy_ends):
        for i0, i1 in zip(ix_starts, ix_ends):
            chunks.append(like.isel(y=slice(j0, j1), x=slice(i0, i1)))
            rowstarts.append(j0)
            colstarts.append(i0)
    return chunks, rowstarts, colstarts
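
# The slicing trick above, illustrated with plain numpy (hypothetical sizes:
# 10 columns, chunksize 4): appending None makes the final slice run through
# the last element.
#
#   >>> ix_starts = [0, 4, 8]
#   >>> ix_ends = ix_starts[1:] + [None]
#   >>> list(zip(ix_starts, ix_ends))
#   [(0, 4), (4, 8), (8, None)]
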

def celltable(path, column, resolution, like, chunksize=1e4):
    r"""
    Process area of features by rasterizing in a chunkwise manner to limit
    memory usage.

    Returns a table of cell indices (row, column) with, for example, feature
    ID and feature area within the cell. Essentially returns a COO sparse
    matrix, but with duplicate values per cell, since more than one geometry
    may be present.

    The feature area within the cell is approximated by first rasterizing the
    feature, and then counting the number of occurring cells. This means the
    accuracy of the area depends on the cellsize of the rasterization step.

    A celltable is returned, as a ``pandas.DataFrame``. It has the following
    columns:

    1. ``"row_index"``
    2. ``"col_index"``
    3. the value of the ``column`` argument
    4. ``"area"``

    ``"row_index"`` and ``"col_index"`` are the indices of the like array in
    which the polygon is located. The ``column`` value holds the rasterized
    value of the specified column. ``"area"`` contains the area of the
    polygon within the cell.

    The most convenient way of using this celltable is by specifying a feature
    ID as ``column``. After creating a celltable, ``pandas.DataFrame.merge()``
    can be used to join additional data on this ID. Refer to the examples.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    resolution : float
        cellsize at which the rasterization, and determination of the area
        within the cell, occurs. Very small values are recommended (e.g.
        <= 0.5 m).
    like : xarray.DataArray
        Example DataArray of where the cells will be located. Used only for
        the coordinates.
    chunksize : int, optional
        The size of the chunks, in number of cells. Used for both the x and y
        dimension.

    Returns
    -------
    celltable : pandas.DataFrame

    Examples
    --------
    Assume we have a shapefile called ``waterways.shp`` and information on the
    model discretization is described by a ``like`` DataArray. The feature ID is
    provided by a column in the shapefile called "ID-code". Additionally, this
    shapefile also specifies bed hydraulic resistance (c0). For this specific
    discretization, we wish to calculate a conductance (area divided by
    hydraulic resistance). To do so, we:

    1. create a ``celltable``
    2. join the additional attributes (such as c0)
    3. compute the conductance per feature
    4. sum conductances per cell

    Import the required packages.

    >>> import imod
    >>> import geopandas as gpd

    Generate the celltable.

    >>> celltable = imod.prepare.celltable(
    ...     path="waterways.shp",
    ...     column="ID-code",
    ...     resolution=0.5,
    ...     like=like,
    ... )

    Load the shapefile with geopandas into a ``GeoDataFrame``.

    >>> gdf = gpd.read_file("waterways.shp")

    Select the relevant columns into a ``pandas.DataFrame`` and merge with the
    celltable.

    >>> df = gdf[["ID-code", "c0"]]
    >>> joined = celltable.merge(df, on="ID-code")

    We compute the conductance, and sum it per cell using ``pandas`` methods:

    >>> joined["conductance"] = joined["area"] / joined["c0"]
    >>> summed_conductance = joined.groupby(["row_index", "col_index"], as_index=False)[
    ...     "conductance"
    ... ].sum()

    Finally, turn the result into a DataArray so it can be used as model input:

    >>> conductance = imod.prepare.rasterize_celltable(
    ...     table=summed_conductance,
    ...     column="conductance",
    ...     like=like,
    ... )
    """
    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    like_chunks, rowstarts, colstarts = _create_chunks(like, resolution, chunksize)
    collection = [
        dask.delayed(_celltable)(path, column, resolution, chunk, rowstart, colstart)
        for chunk, rowstart, colstart in zip(like_chunks, rowstarts, colstarts)
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)

@numba.njit
def _burn_cells(raster, rows, cols, values):
    """
    Burn values of sparse COO-matrix into raster.
    rows, cols, and values form a sparse matrix in coordinate format (COO)
    (also known as "ijv" or "triplet" format).

    Parameters
    ----------
    raster : np.array
        raster to burn values into.
    rows : np.array of integers
        row indices (i)
    cols : np.array of integers
        column indices (j)
    values : np.array of floats
        values to burn (v)
    """
    for i, j, v in zip(rows, cols, values):
        raster[i, j] = v
    return raster
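
# A small sketch of the COO burn (hypothetical triplets): values are written
# at (row, col) positions; later duplicates simply overwrite earlier ones,
# which is why rasterize_celltable asks for values aggregated per cell.
#
#   >>> raster = np.zeros((2, 2))
#   >>> rows = np.array([0, 1, 1])
#   >>> cols = np.array([1, 0, 0])
#   >>> values = np.array([10.0, 20.0, 30.0])
#   >>> _burn_cells(raster, rows, cols, values)
#   array([[ 0., 10.],
#          [30.,  0.]])
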

def rasterize_celltable(table, column, like):
    """
    Rasterizes a table, such as produced by ``imod.prepare.spatial.celltable``.
    Before rasterization, multiple values should be grouped and aggregated per
    cell. Values will be overwritten otherwise.

    Parameters
    ----------
    table : pandas.DataFrame
        with columns: "row_index", "col_index"
    column : str, int, float
        column name of values to rasterize
    like : xr.DataArray

    Returns
    -------
    rasterized : xr.DataArray
    """
    rows = table["row_index"].values
    cols = table["col_index"].values
    values = table[column].values
    dst = like.copy()
    dst.values = _burn_cells(dst.values, rows, cols, values)
    return dst

def _zonal_aggregate_raster(
    path: Union[str, pathlib.Path],
    column: str,
    resolution: float,
    raster: xr.DataArray,
    method: Union[str, Callable],
) -> pd.DataFrame:
    """
    * Rasterize the polygons at the given ``resolution``
    * Sample the raster at the rasterized polygon cells
    * Store both in a dataframe, groupby and aggregate according to ``method``
    """
    data_dx, xmin, xmax, data_dy, ymin, ymax = imod.util.spatial.spatial_reference(
        raster
    )
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized = gdal_rasterize(
        path,
        column,
        nodata=nodata,
        dtype=np.int32,
        spatial_reference=spatial_reference,
        all_touched=True,
    )
    ravelled = rasterized.values.ravel()

    # Midpoints of the fine cells
    y = np.arange(ymax + 0.5 * dy, ymin, dy)
    x = np.arange(xmin + 0.5 * dx, xmax, dx)
    is_data = ravelled != nodata
    feature_id = ravelled[is_data]
    yy, xx = [a.ravel()[is_data] for a in np.meshgrid(y, x, indexing="ij")]

    dims = ("y", "x")
    rasterized, _ = common._increasing_dims(rasterized, dims)
    raster, _ = common._increasing_dims(raster, dims)
    y_ind = ((yy - ymin) / abs(data_dy)).astype(int)
    x_ind = ((xx - xmin) / abs(data_dx)).astype(int)
    sample = raster.values[y_ind, x_ind]

    df = pd.DataFrame({column: feature_id, "data": sample})
    # Remove entries where the raster has nodata.
    # This may result in areas significantly smaller than the polygon geometry,
    # but should come in handy for weighting later?
    df = df[df["data"].notnull()]
    result = df.groupby(column, as_index=False).agg(["count", method])
    # Reset the index to make ``column`` a regular column again, and make sure
    # the old index is dropped.
    result = result.reset_index(drop=True)
    # Compute the area from the counted number of cells
    result["data", "count"] *= resolution * resolution
    name = raster.name if raster.name else "aggregated"
    result.columns = [column, "area", name]
    return result
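
# The nearest-cell sampling step above, in isolation (hypothetical numbers):
# midpoints of the fine cells are mapped to integer row indices of the data
# raster by their distance from the lower edge.
#
#   >>> ymin, data_dy = 0.0, -10.0
#   >>> yy = np.array([5.0, 15.0, 95.0])  # midpoints of fine cells
#   >>> ((yy - ymin) / abs(data_dy)).astype(int)
#   array([0, 1, 9])
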

def _zonal_aggregate_polygons(
    path_a: Union[str, pathlib.Path],
    path_b: Union[str, pathlib.Path],
    column_a: str,
    column_b: str,
    resolution: float,
    like: xr.DataArray,
    method: Union[str, Callable],
) -> pd.DataFrame:
    """
    * Rasterize a, rasterize b for the same domain
    * Store both in a dataframe, groupby and aggregate according to ``method``
    """
    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized_a = gdal_rasterize(
        path_a,
        column_a,
        nodata=nodata,
        dtype=np.int32,
        spatial_reference=spatial_reference,
        all_touched=True,
    )
    rasterized_b = gdal_rasterize(
        path_b,
        column_b,
        nodata=np.nan,
        dtype=np.float64,
        spatial_reference=spatial_reference,
        all_touched=True,
    )
    # Entries where either raster has nodata are removed.
    # This may result in areas significantly smaller than the polygon geometry,
    # but should come in handy for weighting later?
    is_data = ((rasterized_a != nodata) & (rasterized_b.notnull())).values
    a = rasterized_a.values[is_data].ravel()
    b = rasterized_b.values[is_data].ravel()
    df = pd.DataFrame({column_a: a, column_b: b})
    result = df.groupby(column_a, as_index=False).agg(["count", method])
    # Reset the index to make ``column_a`` a regular column again, and make
    # sure the old index is dropped.
    result = result.reset_index(drop=True)
    # Compute the area from the counted number of cells
    result[column_b, "count"] *= resolution * resolution
    result.columns = [column_a, "area", column_b]
    return result

def zonal_aggregate_raster(
    path: Union[pathlib.Path, str],
    column: str,
    raster: xr.DataArray,
    resolution: float,
    method: Union[str, Callable],
    chunksize: int = 1e4,
) -> pd.DataFrame:
    """
    Compute a zonal aggregate of raster data for polygon geometries, e.g. a
    mean, mode, or percentile.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile). Defines zones
        of aggregation.
    column : str
        column name of path, integer IDs defining zones.
    raster : xarray.DataArray
        Raster data from which to sample and aggregate data
    resolution : float
        cellsize at which the rasterization of polygons and sampling occurs
    method : Union[str, Callable]
        Aggregation method.
        Anything that is acceptable by a pandas groupby aggregate:
        https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
    chunksize : int, optional
        The size of the chunks, in number of cells. Used for both the x and y
        dimension.

    Returns
    -------
    zonal_aggregates : pandas.DataFrame

    Examples
    --------

    To compute the mean surface level at polygons of water bodies:

    >>> import imod
    >>> surface_level = imod.rasterio.open("surface_level.tif")
    >>> df = imod.prepare.spatial.zonal_aggregate_raster(
    ...     path="water-bodies.shp",
    ...     column="id",
    ...     raster=surface_level,
    ...     resolution=1.0,
    ...     method="mean",
    ... )

    For some functions, like the mode, a function should be passed instead:

    >>> import pandas as pd
    >>> df = imod.prepare.spatial.zonal_aggregate_raster(
    ...     path="water-bodies.shp",
    ...     column="id",
    ...     raster=surface_level,
    ...     resolution=1.0,
    ...     method=pd.Series.mode,
    ... )
    """
    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(raster)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    without_chunks = (raster.chunks is None) or (
        all(length == 1 for length in map(len, raster.chunks))
    )
    if without_chunks:
        raster = raster.compute()

    raster_chunks, _, _ = _create_chunks(raster, resolution, chunksize)
    collection = [
        dask.delayed(_zonal_aggregate_raster)(path, column, resolution, chunk, method)
        for chunk in raster_chunks
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)

def zonal_aggregate_polygons(
    path_a: Union[pathlib.Path, str],
    path_b: Union[pathlib.Path, str],
    column_a: str,
    column_b: str,
    like: xr.DataArray,
    resolution: float,
    method: Union[str, Callable],
    chunksize: int = 1e4,
) -> pd.DataFrame:
    """
    Compute a zonal aggregate of polygon data for (other) polygon geometries,
    e.g. a mean, mode, or percentile.

    Parameters
    ----------
    path_a : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    path_b : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column_a : str
        column name of path_a. Defines zones of aggregation.
    column_b : str
        column name of path_b. Data to aggregate.
    like : xarray.DataArray with dims ("y", "x")
        Example DataArray of where the cells will be located. Used only for
        its x and y coordinates.
    resolution : float
        cellsize at which the rasterization, sampling, and area measurement
        occur.
    method : Union[str, Callable]
        Aggregation method.
        Anything that is acceptable by a pandas groupby aggregate:
        https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
    chunksize : int, optional
        The size of the chunks, in number of cells. Used for both the x and y
        dimension.

    Returns
    -------
    zonal_aggregates : pandas.DataFrame
    """
    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    like_chunks, _, _ = _create_chunks(like, resolution, chunksize)
    collection = [
        dask.delayed(_zonal_aggregate_polygons)(
            path_a, path_b, column_a, column_b, resolution, chunk, method
        )
        for chunk in like_chunks
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)
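
# A hedged usage sketch (file and column names are hypothetical), mirroring
# the zonal_aggregate_raster examples above: aggregate a numeric polygon
# attribute from "land-use.shp" over the zones of "water-bodies.shp".
#
#   >>> df = imod.prepare.spatial.zonal_aggregate_polygons(
#   ...     path_a="water-bodies.shp",
#   ...     path_b="land-use.shp",
#   ...     column_a="id",
#   ...     column_b="infiltration",
#   ...     like=like,
#   ...     resolution=1.0,
#   ...     method="mean",
#   ... )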