Coverage for src/dqx_network_checks/checks.py: 50%

72 statements  

coverage.py v7.9.1, created at 2025-07-01 10:47 +0200

1"""Data quality checks for IPv4 network validation. 

2 

3This module provides a comprehensive set of data quality checks for IPv4 address 

4and network validation using PySpark DataFrames. It includes checks for various 

5IPv4 address types (loopback, multicast, private, global) and network operations. 

6 

7The module integrates with the Databricks Data Quality Framework (DQX) to provide 

8row-level validation rules that can be applied to DataFrame columns. 

9 

10Example: 

11 ```python 

12 from dqx_network_checks import is_ipv4_address, is_ipv4_private_address 

13 

14 # Apply checks to a DataFrame 

15 df = spark.read.csv("network_data.csv") 

16 result = df.filter(is_ipv4_address("ip_column")) 

17 private_ips = df.filter(is_ipv4_private_address("ip_column")) 

18 ``` 

19 

20All validation functions return PySpark Column objects that can be used in 

21DataFrame operations for filtering, aggregating, or creating derived columns. 

22""" 

import ipaddress

import pyspark.sql.functions as f
import pyspark.sql.types as t
from databricks.labs.dqx.check_funcs import make_condition
from databricks.labs.dqx.rule import register_rule
from pyspark.sql import Column

from dqx_network_checks.validators import (
    validate_global_ipv4_address,
    validate_ipv4_address,
    validate_ipv4_network,
    validate_loopback_ipv4_address,
    validate_multicast_ipv4_address,
    validate_network_contains_ipv4_address,
    validate_private_ipv4_address,
)
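# Usage sketch (illustrative only; column and alias names below are not part of the
# module): each check returns the column built by make_condition. Assuming DQX's
# make_condition emits the error message for failing rows and null otherwise,
# failures can be surfaced like this:
#
#     checked = df.withColumn("ipv4_error", is_ipv4_address("ip_column"))
#     invalid_rows = checked.filter(f.col("ipv4_error").isNotNull())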

@register_rule("row")
def is_ipv4_address(column: str | Column) -> Column:
    """Validates that a column contains valid IPv4 addresses.

    This function creates a data quality check that validates whether each value
    in the specified column is a valid IPv4 address format (e.g., "192.168.1.1").

    Args:
        column: The column to validate, passed either as a column name string or
            as a PySpark Column.

    Returns:
        A PySpark Column object representing the validation condition. When used
        in a filter operation, it will return rows where the column contains
        valid IPv4 addresses.

    Example:
        ```python
        # Filter DataFrame to only include rows with valid IPv4 addresses
        valid_ips = df.filter(is_ipv4_address("ip_column"))

        # Use in a select statement to create a boolean column
        df_with_validation = df.select(
            "*",
            is_ipv4_address("ip_column").alias("is_valid_ip")
        )
        ```

    Note:
        This check validates the format only. It does not verify if the address
        is reachable or currently in use.
    """
    col = _as_column(column)
    error_message = f"Column `{column}` is not a valid IPv4 address"
    return make_condition(~is_ipv4_address_udf(col), error_message, "is_ipv4_address")
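# Note on the pattern above (repeated in the checks below): each *_udf returns True
# for valid values, while the condition passed to make_condition should be True when
# the check fails, hence the `~` negation. This reading of make_condition's contract
# is an assumption based on how the error messages are phrased, not a stated DQX
# guarantee.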

@register_rule("row")
def is_ipv4_loopback_address(column: str | Column) -> Column:
    """Validates that a column contains IPv4 loopback addresses.

    This function creates a data quality check that validates whether each value
    in the specified column is a valid IPv4 loopback address (127.0.0.0/8 range).

    Args:
        column: The column to validate, passed either as a column name string or
            as a PySpark Column.

    Returns:
        A PySpark Column object representing the validation condition. When used
        in a filter operation, it will return rows where the column contains
        valid IPv4 loopback addresses.

    Example:
        ```python
        # Filter DataFrame to only include loopback addresses
        loopback_ips = df.filter(is_ipv4_loopback_address("ip_column"))
        ```

    Note:
        Loopback addresses are in the range 127.0.0.0/8 and are used for
        internal communication within a host.
    """
    col = _as_column(column)
    error_message = f"Column `{column}` is not a valid IPv4 loopback address"
    return make_condition(
        ~is_ipv4_loopback_address_udf(col),
        error_message,
        "is_ipv4_loopback_address",
    )

@register_rule("row")
def is_ipv4_multicast_address(column: str | Column) -> Column:
    """Validates that a column contains IPv4 multicast addresses.

    This function creates a data quality check that validates whether each value
    in the specified column is a valid IPv4 multicast address (224.0.0.0/4 range).

    Args:
        column: The column to validate, passed either as a column name string or
            as a PySpark Column.

    Returns:
        A PySpark Column object representing the validation condition. When used
        in a filter operation, it will return rows where the column contains
        valid IPv4 multicast addresses.

    Example:
        ```python
        # Filter DataFrame to only include multicast addresses
        multicast_ips = df.filter(is_ipv4_multicast_address("ip_column"))
        ```

    Note:
        Multicast addresses are in the range 224.0.0.0/4 and are used for
        one-to-many communication.
    """
    col = _as_column(column)
    error_message = f"Column `{column}` is not a valid IPv4 multicast address"
    return make_condition(
        ~is_ipv4_multicast_address_udf(col),
        error_message,
        "is_ipv4_multicast_address",
    )

@register_rule("row")
def is_ipv4_private_address(column: str | Column) -> Column:
    """Validates that a column contains IPv4 private addresses.

    This function creates a data quality check that validates whether each value
    in the specified column is a valid IPv4 private address. Private address
    ranges include:
    - 10.0.0.0/8
    - 172.16.0.0/12
    - 192.168.0.0/16

    Args:
        column: The column to validate, passed either as a column name string or
            as a PySpark Column.

    Returns:
        A PySpark Column object representing the validation condition. When used
        in a filter operation, it will return rows where the column contains
        valid IPv4 private addresses.

    Example:
        ```python
        # Filter DataFrame to only include private addresses
        private_ips = df.filter(is_ipv4_private_address("ip_column"))
        ```

    Note:
        Private addresses are reserved for use within private networks and are
        not routable on the public internet.
    """
    col = _as_column(column)
    error_message = f"Column `{column}` is not a valid IPv4 private address"
    return make_condition(
        ~is_ipv4_private_address_udf(col),
        error_message,
        "is_ipv4_private_address",
    )

@register_rule("row")
def is_ipv4_global_address(column: str | Column) -> Column:
    """Validates that a column contains IPv4 global (public) addresses.

    This function creates a data quality check that validates whether each value
    in the specified column is a valid IPv4 global address. Global addresses
    are public IP addresses that are routable on the internet.

    Args:
        column: The column to validate, passed either as a column name string or
            as a PySpark Column.

    Returns:
        A PySpark Column object representing the validation condition. When used
        in a filter operation, it will return rows where the column contains
        valid IPv4 global addresses.

    Example:
        ```python
        # Filter DataFrame to only include global addresses
        global_ips = df.filter(is_ipv4_global_address("ip_column"))
        ```

    Note:
        Global addresses exclude private, loopback, multicast, and other
        reserved address ranges.
    """
    col = _as_column(column)
    error_message = f"Column `{column}` is not a valid IPv4 global address"
    return make_condition(
        ~is_ipv4_global_address_udf(col),
        error_message,
        "is_ipv4_global_address",
    )

@register_rule("row")
def is_ipv4_network(column: str | Column) -> Column:
    """Validates that a column contains valid IPv4 network addresses.

    This function creates a data quality check that validates whether each value
    in the specified column is a valid IPv4 network address in CIDR notation
    (e.g., "192.168.1.0/24").

    Args:
        column: The column to validate, passed either as a column name string or
            as a PySpark Column.

    Returns:
        A PySpark Column object representing the validation condition. When used
        in a filter operation, it will return rows where the column contains
        valid IPv4 network addresses.

    Example:
        ```python
        # Filter DataFrame to only include valid network addresses
        valid_networks = df.filter(is_ipv4_network("network_column"))
        ```

    Note:
        Network addresses must be in CIDR notation with a valid subnet mask
        (e.g., /8, /16, /24, /32).
    """
    col = _as_column(column)
    error_message = f"Column `{column}` is not a valid IPv4 network"
    return make_condition(
        ~is_ipv4_network_udf(col),
        error_message,
        "is_ipv4_network",
    )

@register_rule("row")
def is_ipv4_network_contains_address(
    column: str | Column, network: str | ipaddress.IPv4Network
) -> Column:
    """Validates that addresses in a column are contained within a specified network.

    This function creates a data quality check that validates whether each value
    in the specified column is an IPv4 address that falls within the given
    network range.

    Args:
        column: The column containing IPv4 addresses to validate, passed either
            as a column name string or as a PySpark Column.
        network: The IPv4 network that should contain the addresses in the
            specified column, given either in CIDR notation (e.g., "192.168.1.0/24")
            or as an ipaddress.IPv4Network instance.

    Returns:
        A PySpark Column object representing the validation condition. When used
        in a filter operation, it will return rows where the column contains
        IPv4 addresses that are within the specified network range.

    Example:
        ```python
        # Filter DataFrame to only include addresses in the 192.168.1.0/24 network
        network_ips = df.filter(
            is_ipv4_network_contains_address("ip_column", "192.168.1.0/24")
        )
        ```

    Note:
        The network parameter must describe a valid IPv4 network; string values
        must use CIDR notation. Addresses in the column must be valid IPv4
        addresses.
    """
    col = _as_column(column)
    if isinstance(network, str):
        network_parsed = str(ipaddress.IPv4Network(network))
    else:
        network_parsed = str(network)
    error_message = f"Network `{network_parsed}` does not contain address `{column}`"
    # Pass the parsed network as a literal column so the UDF receives its value
    # rather than a lookup of a column named after the network string.
    return make_condition(
        ~is_ipv4_network_contains_address_udf(f.lit(network_parsed), col),
        error_message,
        "is_ipv4_network_contains_address",
    )
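# The `network` argument may also be given as an ipaddress.IPv4Network instance
# (illustrative sketch, mirroring the string form shown in the docstring above):
#
#     subnet = ipaddress.IPv4Network("10.0.0.0/8")
#     check = is_ipv4_network_contains_address("ip_column", subnet)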

@f.udf(t.BooleanType())
def is_ipv4_address_udf(address: str) -> bool:
    """User-defined function to validate IPv4 address format.

    Args:
        address: A string representing an IPv4 address to validate.

    Returns:
        True if the address is a valid IPv4 address, False otherwise.

    Note:
        This is an internal UDF used by the main validation functions.
        It should not be called directly in most cases.
    """
    return validate_ipv4_address(address)


@f.udf(t.BooleanType())
def is_ipv4_loopback_address_udf(address: str) -> bool:
    """User-defined function to validate IPv4 loopback address.

    Args:
        address: A string representing an IPv4 address to validate.

    Returns:
        True if the address is a valid IPv4 loopback address, False otherwise.

    Note:
        This is an internal UDF used by the main validation functions.
        It should not be called directly in most cases.
    """
    return validate_loopback_ipv4_address(address)


@f.udf(t.BooleanType())
def is_ipv4_multicast_address_udf(address: str) -> bool:
    """User-defined function to validate IPv4 multicast address.

    Args:
        address: A string representing an IPv4 address to validate.

    Returns:
        True if the address is a valid IPv4 multicast address, False otherwise.

    Note:
        This is an internal UDF used by the main validation functions.
        It should not be called directly in most cases.
    """
    return validate_multicast_ipv4_address(address)


@f.udf(t.BooleanType())
def is_ipv4_private_address_udf(address: str) -> bool:
    """User-defined function to validate IPv4 private address.

    Args:
        address: A string representing an IPv4 address to validate.

    Returns:
        True if the address is a valid IPv4 private address, False otherwise.

    Note:
        This is an internal UDF used by the main validation functions.
        It should not be called directly in most cases.
    """
    return validate_private_ipv4_address(address)


@f.udf(t.BooleanType())
def is_ipv4_global_address_udf(address: str) -> bool:
    """User-defined function to validate IPv4 global address.

    Args:
        address: A string representing an IPv4 address to validate.

    Returns:
        True if the address is a valid IPv4 global address, False otherwise.

    Note:
        This is an internal UDF used by the main validation functions.
        It should not be called directly in most cases.
    """
    return validate_global_ipv4_address(address)


@f.udf(t.BooleanType())
def is_ipv4_network_udf(network: str) -> bool:
    """User-defined function to validate IPv4 network format.

    Args:
        network: A string representing an IPv4 network in CIDR notation to validate.

    Returns:
        True if the network is a valid IPv4 network, False otherwise.

    Note:
        This is an internal UDF used by the main validation functions.
        It should not be called directly in most cases.
    """
    return validate_ipv4_network(network)


@f.udf(t.BooleanType())
def is_ipv4_network_contains_address_udf(network: str, address: str) -> bool:
    """User-defined function to check if an address is contained within a network.

    Args:
        network: A string representing an IPv4 network in CIDR notation.
        address: A string representing an IPv4 address to check.

    Returns:
        True if the address is contained within the network, False otherwise.

    Note:
        This is an internal UDF used by the main validation functions.
        It should not be called directly in most cases.
    """
    return validate_network_contains_ipv4_address(network, address)

def _as_column(column: str | Column) -> Column:
    """Return `column` as a PySpark Column, resolving strings as column names."""
    if isinstance(column, str):
        return f.col(column)
    return column