Coverage for C:\src\imod-python\imod\prepare\spatial.py: 92%

361 statements  

coverage.py v7.5.1, created at 2024-05-08 14:15 +0200

import pathlib
from typing import TYPE_CHECKING, Callable, Union

import dask
import numba
import numpy as np
import pandas as pd
import scipy.ndimage
import xarray as xr
from numpy.typing import DTypeLike

import imod
from imod.prepare import common, pcg
from imod.util.imports import MissingOptionalModule
from imod.util.spatial import _polygonize

# Since rasterio is a big dependency that is sometimes hard to install
# and not always required, we made this an optional dependency.
try:
    import rasterio
    import rasterio.features
    import rasterio.warp
except ImportError:
    rasterio = MissingOptionalModule("rasterio")

if TYPE_CHECKING:
    import geopandas as gpd

def round_extent(extent, cellsize):
    """Increases the extent until all sides lie on a coordinate
    divisible by cellsize."""
    xmin, ymin, xmax, ymax = extent
    xmin = np.floor(xmin / cellsize) * cellsize
    ymin = np.floor(ymin / cellsize) * cellsize
    xmax = np.ceil(xmax / cellsize) * cellsize
    ymax = np.ceil(ymax / cellsize) * cellsize
    return xmin, ymin, xmax, ymax


def round_z(z_extent, dz):
    """Increases the extent until all sides lie on a coordinate
    divisible by dz."""
    zmin, zmax = z_extent
    zmin = np.floor(zmin / dz) * dz
    zmax = np.ceil(zmax / dz) * dz
    return zmin, zmax
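
# Illustrative sketch (added for exposition, not part of the original module):
# minima are floored and maxima are ceiled, so the rounded extent always
# contains the input extent.
def _example_round_sketch():
    assert round_extent((13.0, 2.5, 87.0, 42.5), cellsize=25.0) == (0.0, 0.0, 100.0, 50.0)
    assert round_z((-3.2, 7.9), dz=2.0) == (-4.0, 8.0)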

def _fill_np(data, invalid):
    """Basic nearest neighbour interpolation"""
    # see: https://stackoverflow.com/questions/5551286/filling-gaps-in-a-numpy-array
    ind = scipy.ndimage.distance_transform_edt(
        invalid, return_distances=False, return_indices=True
    )
    return data[tuple(ind)]
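
# Illustrative sketch (added for exposition, not part of the original module):
# distance_transform_edt with return_indices=True yields, for every cell, the
# index of the nearest valid (False) cell; indexing with those indices
# performs the nearest-neighbour fill.
def _example_fill_np_sketch():
    data = np.array([np.nan, 2.0, 3.0])
    invalid = np.isnan(data)
    assert (_fill_np(data, invalid) == np.array([2.0, 2.0, 3.0])).all()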

def fill(da, invalid=None, by=None):
    """
    Replace the value of invalid ``da`` cells (indicated by ``invalid``)
    using basic nearest neighbour interpolation.

    Parameters
    ----------
    da: xr.DataArray with gaps
        array containing missing values
    invalid: xr.DataArray, optional
        a binary array of the same shape as ``da``.
        data values are replaced where invalid is True.
        If None (default), uses: ``invalid = np.isnan(da)``
    by: str, optional
        dimension over which the array will be filled, one by one.
        See the examples.

    Returns
    -------
    xarray.DataArray
        with the same coordinates as the input.

    Examples
    --------

    A common use case is filling holes in a DataArray, filling it with the
    value of its nearest (valid) neighbor:

    >>> filled = imod.prepare.fill(da)

    In case of a tie (e.g. neighbors in x and y are both one cell removed), the
    neighbor in the last dimension is chosen (for rasters, that's generally x).

    A typical use case is filling a 3D array (layer, y, x), but only in the
    horizontal dimensions. The ``by`` keyword can be used to do this:

    >>> filled = imod.prepare.fill(da, by="layer")

    In this case, the array is filled one layer at a time.
    """

    out = xr.full_like(da, np.nan)
    if invalid is None:
        invalid = np.isnan(da)
    if by:
        for coordvalue in da[by]:
            d = {by: coordvalue}
            out.sel(d)[...] = _fill_np(da.sel(d).values, invalid.sel(d).values)
    else:
        out.values = _fill_np(da.values, invalid.values)

    return out
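
# Illustrative sketch (added for exposition, not part of the original module):
# fill a gap in a small DataArray, then fill a (layer, y, x) cube per layer.
def _example_fill_sketch():
    da = xr.DataArray(
        [[1.0, np.nan], [3.0, 4.0]],
        coords={"y": [1.5, 0.5], "x": [0.5, 1.5]},
        dims=("y", "x"),
    )
    filled = fill(da)  # the NaN is replaced by its nearest valid neighbour
    assert bool(filled.notnull().all())

    cube = da.expand_dims(layer=[1, 2])
    per_layer = fill(cube, by="layer")  # fills horizontally, one layer at a time
    assert bool(per_layer.notnull().all())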

def laplace_interpolate(
    source, ibound=None, close=0.01, mxiter=5, iter1=50, relax=0.98
):
    """
    Fills gaps in `source` by interpolating from existing values using Laplace
    interpolation.

    Parameters
    ----------
    source : xr.DataArray of floats with dims (y, x)
        Data values to interpolate.
    ibound : xr.DataArray of bool with dims (y, x)
        Precomputed array which marks where to interpolate.
    close : float
        Closure criterion of the iterative solver. Should be one to two orders
        of magnitude smaller than the desired accuracy.
    mxiter : int
        Outer iterations of the iterative solver.
    iter1 : int
        Inner iterations of the iterative solver. Should not exceed 50.
    relax : float
        Iterative solver relaxation parameter. Should be between 0 and 1.

    Returns
    -------
    interpolated : xr.DataArray with dims (y, x)
        source, with interpolated values where ibound equals 1
    """
    solver = pcg.PreconditionedConjugateGradientSolver(
        close, close * 1.0e6, mxiter, iter1, relax
    )

    if not source.dims == ("y", "x"):
        raise ValueError('source dims must be ("y", "x")')

    # Expand dims to make 3D
    source3d = source.expand_dims("layer")
    if ibound is None:
        iboundv = xr.full_like(source3d, 1, dtype=np.int32).values
    else:
        if not ibound.dims == ("y", "x"):
            raise ValueError('ibound dims must be ("y", "x")')
        if not ibound.shape == source.shape:
            raise ValueError("ibound and source must have the same shape")
        iboundv = ibound.expand_dims("layer").astype(np.int32).values

    has_data = source3d.notnull().values
    iboundv[has_data] = -1
    # Set the starting interpolated estimate to 0.0
    hnew = source3d.fillna(0.0).values.astype(np.float64)

    shape = iboundv.shape
    nlay, nrow, ncol = shape
    nodes = nlay * nrow * ncol
    # Allocate work arrays.
    # Not really used now, but might come in handy to implement weights.
    cc = np.ones(shape)
    cr = np.ones(shape)
    cv = np.ones(shape)
    rhs = np.zeros(shape)
    hcof = np.zeros(shape)
    # Solver work arrays
    res = np.zeros(nodes)
    cd = np.zeros(nodes)
    v = np.zeros(nodes)
    ss = np.zeros(nodes)
    p = np.zeros(nodes)

    # Picard iteration
    converged = False
    outer_iteration = 0
    while not converged and outer_iteration < mxiter:
        # Mutates hnew
        converged = solver.solve(
            hnew=hnew,
            cc=cc,
            cr=cr,
            cv=cv,
            ibound=iboundv,
            rhs=rhs,
            hcof=hcof,
            res=res,
            cd=cd,
            v=v,
            ss=ss,
            p=p,
        )
        outer_iteration += 1

    if not converged:
        raise RuntimeError("Failed to converge")

    hnew[iboundv == 0] = np.nan
    return source.copy(data=hnew[0])
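
# Illustrative sketch (added for exposition, not part of the original module):
# interpolate the gap between two anchor cells. Laplace solutions obey a
# maximum principle, so the filled values lie between the anchors.
def _example_laplace_sketch():
    data = np.full((4, 4), np.nan)
    data[0, 0] = 1.0
    data[-1, -1] = 5.0
    da = xr.DataArray(
        data,
        coords={"y": [3.0, 2.0, 1.0, 0.0], "x": [0.0, 1.0, 2.0, 3.0]},
        dims=("y", "x"),
    )
    return laplace_interpolate(da, close=0.001)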

def rasterize(geodataframe, like, column=None, fill=np.nan, **kwargs):
    """
    Rasterize a geopandas GeoDataFrame onto the given
    xarray coordinates.

    Parameters
    ----------
    geodataframe : geopandas.GeoDataFrame
    like : xarray.DataArray
        Example DataArray. The rasterized result will match the shape and
        coordinates of this DataArray.
    column : str, int, float, optional
        column name of geodataframe to burn into raster
    fill : float, int
        Fill value for nodata areas. Optional, default value is np.nan.
    kwargs : additional keyword arguments for rasterio.features.rasterize.
        See: https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.rasterize

    Returns
    -------
    rasterized : xarray.DataArray
        Vector data rasterized. Matches shape and coordinates of ``like``.
    """

    if column is not None:
        shapes = list(zip(geodataframe.geometry, geodataframe[column]))
    else:
        shapes = list(geodataframe.geometry)

    # shapes must be an iterable
    try:
        iter(shapes)
    except TypeError:
        shapes = (shapes,)

    raster = rasterio.features.rasterize(
        shapes,
        out_shape=like.shape,
        fill=fill,
        transform=imod.util.spatial.transform(like),
        **kwargs,
    )

    return xr.DataArray(raster, like.coords, like.dims)
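
# Illustrative sketch (added for exposition, not part of the original module):
# burn a hypothetical integer "id" column of a GeoDataFrame onto the grid of
# an existing DataArray; extra kwargs are forwarded to
# rasterio.features.rasterize.
def _example_rasterize_sketch(gdf, like):
    return rasterize(gdf, like, column="id", fill=-1, dtype=np.int32)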

def polygonize(da: xr.DataArray) -> "gpd.GeoDataFrame":
    """
    Polygonize a 2D DataArray into a GeoDataFrame of polygons.

    Parameters
    ----------
    da : xr.DataArray

    Returns
    -------
    polygonized : geopandas.GeoDataFrame
    """
    return _polygonize(da)

def _handle_dtype(dtype, nodata):
    # Largely taken from rasterio.dtypes
    # https://github.com/mapbox/rasterio/blob/master/rasterio/dtypes.py
    # Not supported:
    # GDT_CInt16 = 8, GDT_CInt32 = 9, GDT_CFloat32 = 10, GDT_CFloat64 = 11
    dtype_mapping = {
        "uint8": 1,  # GDT_Byte
        "uint16": 2,  # GDT_UInt16
        "int16": 3,  # GDT_Int16
        "uint32": 4,  # GDT_UInt32
        "int32": 5,  # GDT_Int32
        "float32": 6,  # GDT_Float32
        "float64": 7,  # GDT_Float64
    }
    dtype_ranges = {
        "uint8": (0, 255),
        "uint16": (0, 65535),
        "int16": (-32768, 32767),
        "uint32": (0, 4294967295),
        "int32": (-2147483648, 2147483647),
        "float32": (-3.4028235e38, 3.4028235e38),
        "float64": (-1.7976931348623157e308, 1.7976931348623157e308),
    }

    def format_invalid(str_dtype):
        str_dtypes = dtype_mapping.keys()
        return "Invalid dtype: {0}, must be one of: {1}".format(
            str_dtype, ", ".join(str_dtypes)
        )

    if dtype is np.dtype(np.int64):
        dtype = np.int32

    str_dtype = str(np.dtype(dtype))
    if str_dtype not in dtype_mapping.keys():
        raise ValueError(format_invalid(str_dtype))
    gdal_dtype = dtype_mapping[str_dtype]

    if nodata is None:
        if np.issubdtype(dtype, np.integer):
            # Default to the lowest value in case of integers
            nodata = dtype_ranges[str_dtype][0]
        elif np.issubdtype(dtype, np.floating):
            # Default to NaN in case of floats
            nodata = np.nan
    else:
        lower, upper = dtype_ranges[str_dtype]
        if nodata < lower or nodata > upper:
            raise ValueError(f"Nodata value {nodata} is out of bounds for {str_dtype}")

    return gdal_dtype, nodata
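
# Illustrative sketch (added for exposition, not part of the original module):
# _handle_dtype maps a numpy dtype to a GDAL type code and chooses a nodata
# default (the dtype minimum for integers, NaN for floats).
def _example_handle_dtype_sketch():
    gdal_dtype, nodata = _handle_dtype(np.float64, None)
    assert gdal_dtype == 7 and np.isnan(nodata)  # GDT_Float64
    gdal_dtype, nodata = _handle_dtype(np.dtype("int32"), None)
    assert gdal_dtype == 5 and nodata == -2147483648  # GDT_Int32, lowest int32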

def gdal_rasterize(
    path,
    column,
    like=None,
    nodata=None,
    dtype=None,
    spatial_reference=None,
    all_touched=False,
):
    """
    Use GDAL to rasterize a vector file into an xarray.DataArray.

    Can be significantly more efficient than rasterize. This doesn't load the
    vector data into a GeoDataFrame and loop over the individual shapely
    geometries like rasterio.rasterize does, but loops over the features within
    GDAL instead.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    like : xr.DataArray, optional
        example raster
    nodata : int, float; optional
    dtype : numpy.dtype, optional
    spatial_reference : dict, optional
        Optional dict to avoid allocating the like DataArray. Used if ``like``
        is None. Dict has keys "bounds" and "cellsizes", with:

        * bounds = (xmin, xmax, ymin, ymax)
        * cellsizes = (dx, dy)
    all_touched : bool
        If True: all pixels touched by lines or polygons will be updated, not
        just those on the line render path, or whose center point is within the
        polygon. Default value is False.

    Returns
    -------
    rasterized : xr.DataArray
    """

    from osgeo import gdal, ogr

    if isinstance(path, pathlib.Path):
        p = path
        path = str(p)
    else:
        p = pathlib.Path(path)
    if not p.exists():
        raise FileNotFoundError(f"No such file: {path}")

    if dtype is None:
        if like is None:
            raise ValueError("If ``like`` is not provided, ``dtype`` has to be given")
        else:
            dtype = like.dtype
    gdal_dtype, nodata = _handle_dtype(dtype, nodata)

    # Exceptions will get raised on anything >= gdal.CE_Failure
    gdal.UseExceptions()

    # An attempt at decent errors
    class GdalErrorHandler:
        def __init__(self):
            self.err_level = gdal.CE_None
            self.err_no = 0
            self.err_msg = ""

        def handler(self, err_level, err_no, err_msg):
            self.err_level = err_level
            self.err_no = err_no
            self.err_msg = err_msg

    error = GdalErrorHandler()
    handler = error.handler
    gdal.PushErrorHandler(handler)

    # Get spatial data from template
    if like is not None:
        dx, xmin, _, dy, _, ymax = imod.util.spatial.spatial_reference(like)
        nrow, ncol = like.shape
    else:
        cellsizes = spatial_reference["cellsizes"]
        bounds = spatial_reference["bounds"]
        dx, dy = cellsizes
        if not isinstance(dx, (int, float)) or not isinstance(dy, (int, float)):
            raise ValueError("Cannot rasterize to a non-equidistant grid")
        coords = imod.util.spatial._xycoords(bounds, cellsizes)
        xmin, _, _, ymax = bounds
        nrow = coords["y"].size
        ncol = coords["x"].size
        dims = ("y", "x")

    # File will be closed when vector is dereferenced, after return
    vector = ogr.Open(path)
    vector_layer = vector.GetLayer()

    memory_driver = gdal.GetDriverByName("MEM")
    memory_raster = memory_driver.Create("", ncol, nrow, 1, gdal_dtype)
    memory_raster.SetGeoTransform([xmin, dx, 0, ymax, 0, dy])
    memory_band = memory_raster.GetRasterBand(1)
    memory_band.SetNoDataValue(nodata)
    memory_band.Fill(nodata)

    options = [f"ATTRIBUTE={column}", f"ALL_TOUCHED={str(all_touched).upper()}"]
    gdal.RasterizeLayer(memory_raster, [1], vector_layer, None, None, [1], options)
    if error.err_level >= gdal.CE_Warning:
        message = error.err_msg
        if message.startswith(
            "Failed to fetch spatial reference on layer"
        ) and message.endswith("assuming matching coordinate systems."):
            pass
        else:
            raise RuntimeError("GDAL error: " + error.err_msg)

    if like is not None:
        rasterized = like.copy(data=memory_raster.ReadAsArray())
    else:
        rasterized = xr.DataArray(memory_raster.ReadAsArray(), coords, dims)

    return rasterized
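
# Illustrative sketch (added for exposition, not part of the original module):
# rasterize a hypothetical shapefile attribute to a 100 x 100 grid without
# allocating a template DataArray, using the spatial_reference shortcut.
def _example_gdal_rasterize_sketch():
    spatial_reference = {
        "bounds": (0.0, 1000.0, 0.0, 1000.0),  # (xmin, xmax, ymin, ymax)
        "cellsizes": (10.0, -10.0),  # (dx, dy); negative dy: y decreases
    }
    return gdal_rasterize(
        "landuse.shp",
        column="class",
        nodata=-1,
        dtype=np.int32,
        spatial_reference=spatial_reference,
    )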

@numba.njit(cache=True)
def _cell_count(src, values, frequencies, nodata, *inds_weights):
    """
    numba compiled function to count the number of src cells occurring in the
    dst cells.

    Parameters
    ----------
    src : np.array
    values : np.array
        work array to store the unique values
    frequencies : np.array
        work array to store the tallied counts
    nodata : int, float
    inds_weights : tuple of np.arrays
        Contains indices of dst, indices of src, and weights.

    Returns
    -------
    tuple of np.arrays

        * row_indices
        * col_indices
        * values
        * frequencies
    """
    jj, blocks_iy, blocks_weights_y, kk, blocks_ix, blocks_weights_x = inds_weights

    # Use lists for dynamic allocation, since we don't know the number of rows
    # in advance.
    row_indices = []
    col_indices = []
    value_list = []
    count_list = []

    # j, k are indices of the dst array
    # block_i contains indices of the src array
    # block_w contains weights of the src array
    for countj, j in enumerate(jj):
        block_iy = blocks_iy[countj]
        block_wy = blocks_weights_y[countj]
        for countk, k in enumerate(kk):
            block_ix = blocks_ix[countk]
            block_wx = blocks_weights_x[countk]

            # TODO: use weights in frequency count, and area sum?
            # Since src is equidistant, normed weights are easy to calculate.

            # Add the values and weights per cell in the multi-dim block
            value_count = 0
            for iy, wy in zip(block_iy, block_wy):
                if iy < 0:
                    break
                for ix, wx in zip(block_ix, block_wx):
                    if ix < 0:
                        break

                    v = src[iy, ix]
                    if v == nodata:  # Skip nodata cells
                        continue
                    # Work on a single destination cell.
                    # Count the number of polygon ids occurring in the cell:
                    # a new row per id.
                    found = False
                    for i in range(value_count):
                        if v == values[i]:
                            frequencies[i] += 1
                            found = True
                            break
                    if not found:
                        values[value_count] = v
                        frequencies[value_count] = 1
                        value_count += 1
                        # Add a new entry
                        row_indices.append(j)
                        col_indices.append(k)

            # Store for output
            value_list.extend(values[:value_count])
            count_list.extend(frequencies[:value_count])

            # Reset storage
            values[:value_count] = 0
            frequencies[:value_count] = 0

    # Cast to numpy arrays
    row_i_arr = np.array(row_indices)
    col_i_arr = np.array(col_indices)
    value_arr = np.array(value_list)
    count_arr = np.array(count_list)

    return row_i_arr, col_i_arr, value_arr, count_arr

def _celltable(
    path: str | pathlib.Path,
    column: str,
    resolution: float,
    like: xr.DataArray,
    dtype: DTypeLike,
    rowstart: int = 0,
    colstart: int = 0,
) -> pd.DataFrame:
    """
    Returns a table of cell indices (row, column) with feature ID, and feature
    area within cell. Essentially returns a COO sparse matrix, but with
    duplicate values per cell, since more than one geometry may be present.

    The feature area within the cell is approximated by first rasterizing the
    feature, and then counting the number of occurring cells. This means the
    accuracy of the area depends on the resolution of the rasterization step.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    resolution : float
        cellsize at which the rasterization, and determination of area within
        cellsize occurs. Very small values are recommended (e.g. <= 0.5 m).
    like : xarray.DataArray
        Example DataArray of where the cells will be located. Used only for the
        coordinates.
    dtype : numpy.dtype
        datatype of data referred to with "column".

    Returns
    -------
    cell_table : pandas.DataFrame
    """
    # Avoid side-effects
    like = like.copy(deep=False)
    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized = gdal_rasterize(
        path, column, nodata=nodata, dtype=dtype, spatial_reference=spatial_reference
    )

    # Make sure the coordinates are increasing.
    dims = ("y", "x")
    rasterized, _ = common._increasing_dims(rasterized, dims)
    like, flip_dst = common._increasing_dims(like, dims)

    dst_coords = [imod.prepare.common._coord(like, dim) for dim in ("y", "x")]
    src_coords = [imod.prepare.common._coord(rasterized, dim) for dim in ("y", "x")]
    # Determine weights for every regrid dimension, and alloc_len,
    # the maximum number of src cells that may end up in a single dst cell
    inds_weights = []
    alloc_len = 1
    for src_x, dst_x in zip(src_coords, dst_coords):
        size, i_w = imod.prepare.common._weights_1d(src_x, dst_x)
        for elem in i_w:
            inds_weights.append(elem)
        alloc_len *= size

    # Pre-allocate work arrays
    values = np.full(alloc_len, 0, dtype=dtype)
    frequencies = np.full(alloc_len, 0, dtype=dtype)
    rows, cols, values, counts = _cell_count(
        rasterized.values, values, frequencies, nodata, *inds_weights
    )

    if "y" in flip_dst:
        rows = (like["y"].size - 1) - rows
    if "x" in flip_dst:
        cols = (like["x"].size - 1) - cols

    df = pd.DataFrame()
    df["row_index"] = rows + rowstart
    df["col_index"] = cols + colstart
    df[column] = values
    df["area"] = counts * (dx * dx)

    return df
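
# Illustrative sketch (added for exposition, not part of the original module):
# the returned DataFrame is a COO-style table. Two features overlapping the
# same cell yield two rows sharing (row_index, col_index), e.g. for a
# hypothetical "feature_id" column (values invented for illustration):
#
#    row_index  col_index  feature_id   area
# 0          4          7           1  12.25
# 1          4          7           2   3.50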

def _create_chunks(like, resolution, chunksize):
    """
    Cuts ``like`` into chunks of at most chunksize by chunksize cells.

    Parameters
    ----------
    like : xarray.DataArray
    resolution : float
    chunksize : int

    Returns
    -------
    chunks : list of xr.DataArray
    rowstarts : list of int
    colstarts : list of int
    """

    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    # Compute how many rows and columns are necessary for the fine resolution
    nrow = int((ymax - ymin) / resolution)
    ncol = int((xmax - xmin) / resolution)
    # Find out where to cut
    x_starts = np.arange(0, ncol, chunksize) * resolution + xmin
    y_starts = np.arange(0, nrow, chunksize) * resolution + ymin
    # searchsorted assumes the arrays are pre-sorted.
    x = np.sort(like.coords["x"].values)
    y = np.sort(like.coords["y"].values)
    # Get the matching indices of like.
    ix_starts = list(np.searchsorted(x, x_starts))
    iy_starts = list(np.searchsorted(y, y_starts))
    # Append None. In Python's slice object, None denotes "slice including
    # first/last element".
    ix_ends = ix_starts[1:] + [None]
    iy_ends = iy_starts[1:] + [None]
    # Use xarray to grab the chunks. The chunks have x and y coordinates.
    # These will inform GDAL on which part to rasterize.
    # GDAL will only rasterize within the boundaries of the chunks, so there's
    # no need to clip the shapefile beforehand.
    chunks = []
    rowstarts = []
    colstarts = []
    for j0, j1 in zip(iy_starts, iy_ends):
        for i0, i1 in zip(ix_starts, ix_ends):
            chunks.append(like.isel(y=slice(j0, j1), x=slice(i0, i1)))
            rowstarts.append(j0)
            colstarts.append(i0)
    return chunks, rowstarts, colstarts
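
# Illustrative sketch (added for exposition, not part of the original module):
# with xmin = 0.0, resolution = 0.5, and chunksize = 4, the fine-grid chunk
# edges start at x = 0.0, 2.0, 4.0, ...; np.searchsorted then maps those
# starts onto the (sorted) coarse "like" coordinates, so every chunk boundary
# coincides with a cell edge of the fine rasterization.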

def celltable(
    path: str | pathlib.Path,
    column: str,
    resolution: float,
    like: xr.DataArray,
    dtype: DTypeLike = np.int32,
    chunksize: int = 10_000,
) -> pd.DataFrame:
    r"""
    Process the area of features by rasterizing in a chunkwise manner to limit
    memory usage.

    Returns a table of cell indices (row, column) with, for example, feature
    ID and feature area within cell. Essentially returns a COO sparse matrix,
    but with duplicate values per cell, since more than one geometry may be
    present.

    The feature area within the cell is approximated by first rasterizing the
    feature, and then counting the number of occurring cells. This means the
    accuracy of the area depends on the cellsize of the rasterization step.

    A celltable is returned, as a ``pandas.DataFrame``. It has the following
    columns:

    1. ``"row_index"``
    2. ``"col_index"``
    3. the value of the ``column`` argument
    4. ``"area"``

    ``"row_index"`` and ``"col_index"`` are the indices of the like array in
    which the polygon is located. The ``column`` value holds the rasterized
    value of the specified column. ``"area"`` contains the area of the
    polygon within the cell.

    The most convenient way of using this celltable is by specifying a feature
    ID as ``column``. After creating a celltable, ``pandas.DataFrame.merge()``
    can be used to join additional data on this ID. Refer to the examples.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column : str
        column name of column to burn into raster
    resolution : float
        cellsize at which the rasterization, and determination of area within
        cellsize occurs. Very small values are recommended (e.g. <= 0.5 m).
    like : xarray.DataArray
        Example DataArray of where the cells will be located. Used only for the
        coordinates.
    dtype : DTypeLike, optional
        datatype of data referred to with "column", defaults to 32-bit integer.
    chunksize : int, optional
        Chunk size, used for both the x and y dimension.

    Returns
    -------
    celltable : pandas.DataFrame

    Examples
    --------
    Assume we have a shapefile called ``waterways.shp``, and that information
    on the model discretization is described by a ``like`` DataArray. The
    feature ID is provided by a column in the shapefile called "ID-code".
    Additionally, this shapefile also specifies bed hydraulic resistance (c0).
    For this specific discretization, we wish to calculate a conductance (area
    divided by hydraulic resistance). To do so, we:

    1. create a ``celltable``
    2. join the additional attributes (such as c0)
    3. compute the conductance per feature
    4. sum conductances per cell

    Import the required packages.

    >>> import imod
    >>> import geopandas as gpd

    Generate the celltable.

    >>> celltable = imod.prepare.celltable(
            path="waterways.shp",
            column="ID-code",
            resolution=0.5,
            like=like,
        )

    Load the shapefile with geopandas into a ``GeoDataFrame``.

    >>> gdf = gpd.read_file("waterways.shp")

    Select the relevant columns into a ``pandas.DataFrame`` and merge with the
    celltable.

    >>> df = gdf[["ID-code", "c0"]]
    >>> joined = celltable.merge(df, on="ID-code")

    We compute the conductance, and sum it per cell using ``pandas`` methods:

    >>> joined["conductance"] = joined["area"] / joined["c0"]
    >>> summed_conductance = joined.groupby(["row_index", "col_index"], as_index=False)[
            "conductance"
        ].sum()

    Finally, turn the result into a DataArray so it can be used as model input:

    >>> conductance = imod.prepare.rasterize_celltable(
            table=summed_conductance,
            column="conductance",
            like=like,
        )

    """

    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    like_chunks, rowstarts, colstarts = _create_chunks(like, resolution, chunksize)
    collection = [
        dask.delayed(_celltable)(
            path, column, resolution, chunk, dtype, rowstart, colstart
        )
        for chunk, rowstart, colstart in zip(like_chunks, rowstarts, colstarts)
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)

@numba.njit
def _burn_cells(raster, rows, cols, values):
    """
    Burn the values of a sparse COO matrix into a raster.
    rows, cols, and values form a sparse matrix in coordinate format (COO)
    (also known as "ijv" or "triplet" format).

    Parameters
    ----------
    raster : np.array
        raster to burn values into.
    rows : np.array of integers
        row indices (i)
    cols : np.array of integers
        column indices (j)
    values : np.array of floats
        values to burn (v)
    """
    for i, j, v in zip(rows, cols, values):
        raster[i, j] = v
    return raster

def rasterize_celltable(table, column, like):
    """
    Rasterizes a table, such as produced by ``imod.prepare.spatial.celltable``.
    Before rasterization, multiple values should be grouped and aggregated per
    cell. Values will be overwritten otherwise.

    Parameters
    ----------
    table : pandas.DataFrame
        with columns: "row_index", "col_index"
    column : str, int, float
        column name of values to rasterize
    like : xr.DataArray

    Returns
    -------
    rasterized : xr.DataArray
    """
    rows = table["row_index"].values
    cols = table["col_index"].values
    values = table[column].values
    dst = like.copy()
    dst.values = _burn_cells(dst.values, rows, cols, values)
    return dst
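
# Illustrative sketch (added for exposition, not part of the original module):
# aggregate duplicate cell entries before burning, since later rows would
# otherwise overwrite earlier ones in the same cell.
def _example_rasterize_celltable_sketch(table, like):
    summed = table.groupby(["row_index", "col_index"], as_index=False)["area"].sum()
    return rasterize_celltable(summed, "area", like)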

def _zonal_aggregate_raster(
    path: Union[str, pathlib.Path],
    column: str,
    resolution: float,
    raster: xr.DataArray,
    method: Union[str, Callable],
) -> pd.DataFrame:
    """
    * Rasterize the polygons at the given ``resolution``
    * Sample the raster at the rasterized polygon cells
    * Store both in a dataframe, groupby and aggregate according to ``method``
    """
    data_dx, xmin, xmax, data_dy, ymin, ymax = imod.util.spatial.spatial_reference(
        raster
    )
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized = gdal_rasterize(
        path,
        column,
        nodata=nodata,
        dtype=np.int32,
        spatial_reference=spatial_reference,
        all_touched=True,
    )
    ravelled = rasterized.values.ravel()

    y = np.arange(ymax + 0.5 * dy, ymin, dy)
    x = np.arange(xmin + 0.5 * dx, xmax, dx)
    is_data = ravelled != nodata
    feature_id = ravelled[is_data]
    yy, xx = [a.ravel()[is_data] for a in np.meshgrid(y, x, indexing="ij")]

    dims = ("y", "x")
    rasterized, _ = common._increasing_dims(rasterized, dims)
    raster, _ = common._increasing_dims(raster, dims)
    y_ind = ((yy - ymin) / abs(data_dy)).astype(int)
    x_ind = ((xx - xmin) / abs(data_dx)).astype(int)
    sample = raster.values[y_ind, x_ind]

    df = pd.DataFrame({column: feature_id, "data": sample})
    # Remove entries where the raster has nodata.
    # This may result in areas significantly smaller than the polygon geometry,
    # but should come in handy for weighting later?
    df = df[df["data"].notnull()]
    result = df.groupby(column, as_index=False).agg(["count", method])
    # Reset the index to make "column" a column again; make sure the index is
    # dropped.
    result = result.reset_index(drop=True)
    # Compute the area from the counted number of cells
    result["data", "count"] *= resolution * resolution
    name = raster.name if raster.name else "aggregated"
    result.columns = [column, "area", name]
    return result

def _zonal_aggregate_polygons(
    path_a: Union[str, pathlib.Path],
    path_b: Union[str, pathlib.Path],
    column_a: str,
    column_b: str,
    resolution: float,
    like: xr.DataArray,
    method: Union[str, Callable],
) -> pd.DataFrame:
    """
    * Rasterize a, rasterize b for the same domain
    * Store both in a dataframe, groupby and aggregate according to ``method``
    """
    _, xmin, xmax, _, ymin, ymax = imod.util.spatial.spatial_reference(like)
    dx = resolution
    dy = -dx
    nodata = -1
    spatial_reference = {"bounds": (xmin, xmax, ymin, ymax), "cellsizes": (dx, dy)}

    rasterized_a = gdal_rasterize(
        path_a,
        column_a,
        nodata=nodata,
        dtype=np.int32,
        spatial_reference=spatial_reference,
        all_touched=True,
    )
    rasterized_b = gdal_rasterize(
        path_b,
        column_b,
        nodata=np.nan,
        dtype=np.float64,
        spatial_reference=spatial_reference,
        all_touched=True,
    )
    is_data = ((rasterized_a != nodata) & (rasterized_b.notnull())).values
    a = rasterized_a.values[is_data].ravel()
    b = rasterized_b.values[is_data].ravel()
    df = pd.DataFrame({column_a: a, column_b: b})
    # Remove entries where the raster has nodata.
    # This may result in areas significantly smaller than the polygon geometry,
    # but should come in handy for weighting later?
    result = df.groupby(column_a, as_index=False).agg(["count", method])
    # Reset the index to make "column_a" a column again; make sure the index
    # is dropped.
    result = result.reset_index(drop=True)
    # Compute the area from the counted number of cells
    result[column_b, "count"] *= resolution * resolution
    result.columns = [column_a, "area", column_b]
    return result

def zonal_aggregate_raster(
    path: Union[pathlib.Path, str],
    column: str,
    raster: xr.DataArray,
    resolution: float,
    method: Union[str, Callable],
    chunksize: int = 10_000,
) -> pd.DataFrame:
    """
    Compute a zonal aggregate of raster data for polygon geometries, e.g. a
    mean, mode, or percentile.

    Parameters
    ----------
    path : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile). Defines zones
        of aggregation.
    column : str
        column name of path, integer IDs defining zones.
    raster : xarray.DataArray
        Raster data from which to sample and aggregate data
    resolution : float
        cellsize at which the rasterization of polygons and sampling occurs
    method : Union[str, Callable]
        Aggregation method.
        Anything that is acceptable by a pandas groupby aggregate:
        https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
    chunksize : int, optional
        Chunk size, used for both the x and y dimension.

    Returns
    -------
    zonal_aggregates : pandas.DataFrame

    Examples
    --------

    To compute the mean surface level at polygons of water bodies:

    >>> import imod
    >>> surface_level = imod.rasterio.open("surface_level.tif")
    >>> df = imod.prepare.spatial.zonal_aggregate_raster(
    >>>    path="water-bodies.shp",
    >>>    column="id",
    >>>    raster=surface_level,
    >>>    resolution=1.0,
    >>>    method="mean",
    >>> )

    For some functions, like the mode, a function should be passed instead:

    >>> import pandas as pd
    >>> df = imod.prepare.spatial.zonal_aggregate_raster(
    >>>    path="water-bodies.shp",
    >>>    column="id",
    >>>    raster=surface_level,
    >>>    resolution=1.0,
    >>>    method=pd.Series.mode,
    >>> )
    """

    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(raster)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    without_chunks = (raster.chunks is None) or (
        all(length == 1 for length in map(len, raster.chunks))
    )
    if without_chunks:
        raster = raster.compute()

    raster_chunks, _, _ = _create_chunks(raster, resolution, chunksize)
    collection = [
        dask.delayed(_zonal_aggregate_raster)(path, column, resolution, chunk, method)
        for chunk in raster_chunks
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)

def zonal_aggregate_polygons(
    path_a: Union[pathlib.Path, str],
    path_b: Union[pathlib.Path, str],
    column_a: str,
    column_b: str,
    like: xr.DataArray,
    resolution: float,
    method: Union[str, Callable],
    chunksize: int = 10_000,
) -> pd.DataFrame:
    """
    Compute a zonal aggregate of polygon data for (other) polygon geometries,
    e.g. a mean, mode, or percentile.

    Parameters
    ----------
    path_a : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    path_b : str or pathlib.Path
        path to OGR supported vector file (e.g. a shapefile)
    column_a : str
        column name of path_a. Defines zones of aggregation.
    column_b : str
        column name of path_b. Data to aggregate.
    like : xarray.DataArray with dims ("y", "x")
        Example DataArray of where the cells will be located. Used only for its
        x and y coordinates.
    resolution : float
        cellsize at which the rasterization, sampling, and area measurement occur.
    method : Union[str, Callable]
        Aggregation method.
        Anything that is acceptable by a pandas groupby aggregate:
        https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate.html
    chunksize : int, optional
        Chunk size, used for both the x and y dimension.

    Returns
    -------
    zonal_aggregates : pandas.DataFrame
    """
    dx, _, _, dy, _, _ = imod.util.spatial.spatial_reference(like)
    if not imod.util.spatial.is_divisor(dx, resolution):
        raise ValueError("resolution is not an (integer) divisor of dx")
    if not imod.util.spatial.is_divisor(dy, resolution):
        raise ValueError("resolution is not an (integer) divisor of dy")

    like_chunks, _, _ = _create_chunks(like, resolution, chunksize)
    collection = [
        dask.delayed(_zonal_aggregate_polygons)(
            path_a, path_b, column_a, column_b, resolution, chunk, method
        )
        for chunk in like_chunks
    ]
    result = dask.compute(collection)[0]
    return pd.concat(result)
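
# Illustrative sketch (added for exposition, not part of the original module):
# aggregate a polygon attribute over zones defined by another polygon file.
# Paths and column names are hypothetical.
def _example_zonal_aggregate_polygons_sketch(like):
    return zonal_aggregate_polygons(
        path_a="water-bodies.shp",
        path_b="bed-resistance.shp",
        column_a="id",
        column_b="c0",
        like=like,
        resolution=1.0,
        method="mean",
    )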