Coverage for src/dqx_network_checks/checks.py: 50%
72 statements
« prev ^ index » next coverage.py v7.9.1, created at 2025-07-01 10:47 +0200
« prev ^ index » next coverage.py v7.9.1, created at 2025-07-01 10:47 +0200
1"""Data quality checks for IPv4 network validation.
3This module provides a comprehensive set of data quality checks for IPv4 address
4and network validation using PySpark DataFrames. It includes checks for various
5IPv4 address types (loopback, multicast, private, global) and network operations.
7The module integrates with the Databricks Data Quality Framework (DQX) to provide
8row-level validation rules that can be applied to DataFrame columns.
10Example:
11 ```python
12 from dqx_network_checks import is_ipv4_address, is_ipv4_private_address
14 # Apply checks to a DataFrame
15 df = spark.read.csv("network_data.csv")
16 result = df.filter(is_ipv4_address("ip_column"))
17 private_ips = df.filter(is_ipv4_private_address("ip_column"))
18 ```
20All validation functions return PySpark Column objects that can be used in
21DataFrame operations for filtering, aggregating, or creating derived columns.
22"""
24import ipaddress
26import pyspark.sql.functions as f
27import pyspark.sql.types as t
28from databricks.labs.dqx.check_funcs import make_condition
29from databricks.labs.dqx.rule import register_rule
30from pyspark.sql import Column
32from dqx_network_checks.validators import (
33 validate_global_ipv4_address,
34 validate_ipv4_address,
35 validate_ipv4_network,
36 validate_loopback_ipv4_address,
37 validate_multicast_ipv4_address,
38 validate_network_contains_ipv4_address,
39 validate_private_ipv4_address,
40)
@register_rule("row")
def is_ipv4_address(column: str | Column) -> Column:
    """Build a row-level DQX check flagging values that are not IPv4 addresses.

    Args:
        column: The column to check, given either as a column name or as a
            PySpark ``Column`` expression.

    Returns:
        The ``Column`` produced by DQX's ``make_condition`` under the alias
        ``is_ipv4_address``. The condition handed to ``make_condition`` is
        the negated UDF result, so it is true for rows that FAIL validation;
        those rows are expected to carry the error message (per the DQX
        ``make_condition`` contract -- confirm against the DQX version in use).

    Note:
        The validation itself is delegated to ``is_ipv4_address_udf``. The
        error message interpolates ``column`` exactly as passed in, so a
        ``Column`` argument shows up by its repr.
    """
    target = _as_column(column)
    # Negate the UDF: the condition marks the failing rows, not the valid ones.
    failed = ~is_ipv4_address_udf(target)
    return make_condition(
        failed,
        f"Column `{column}` is not a valid IPv4 address",
        "is_ipv4_address",
    )
@register_rule("row")
def is_ipv4_loopback_address(column: str | Column) -> Column:
    """Build a row-level DQX check flagging values that are not IPv4 loopback addresses.

    Args:
        column: The column to check, given either as a column name or as a
            PySpark ``Column`` expression.

    Returns:
        The ``Column`` produced by DQX's ``make_condition`` under the alias
        ``is_ipv4_loopback_address``. The condition handed to
        ``make_condition`` is the negated UDF result, so it is true for rows
        that FAIL validation (message semantics per the DQX
        ``make_condition`` contract -- confirm against the DQX version in use).

    Note:
        What counts as a loopback address is decided by
        ``is_ipv4_loopback_address_udf`` (conventionally the 127.0.0.0/8
        range).
    """
    target = _as_column(column)
    # Negate the UDF: the condition marks the failing rows, not the valid ones.
    failed = ~is_ipv4_loopback_address_udf(target)
    message = f"Column `{column}` is not a valid IPv4 loopback address"
    return make_condition(failed, message, "is_ipv4_loopback_address")
@register_rule("row")
def is_ipv4_multicast_address(column: str | Column) -> Column:
    """Build a row-level DQX check flagging values that are not IPv4 multicast addresses.

    Args:
        column: The column to check, given either as a column name or as a
            PySpark ``Column`` expression.

    Returns:
        The ``Column`` produced by DQX's ``make_condition`` under the alias
        ``is_ipv4_multicast_address``. The condition handed to
        ``make_condition`` is the negated UDF result, so it is true for rows
        that FAIL validation (message semantics per the DQX
        ``make_condition`` contract -- confirm against the DQX version in use).

    Note:
        What counts as a multicast address is decided by
        ``is_ipv4_multicast_address_udf`` (conventionally the 224.0.0.0/4
        range).
    """
    target = _as_column(column)
    # Negate the UDF: the condition marks the failing rows, not the valid ones.
    failed = ~is_ipv4_multicast_address_udf(target)
    message = f"Column `{column}` is not a valid IPv4 multicast address"
    return make_condition(failed, message, "is_ipv4_multicast_address")
@register_rule("row")
def is_ipv4_private_address(column: str | Column) -> Column:
    """Build a row-level DQX check flagging values that are not private IPv4 addresses.

    Args:
        column: The column to check, given either as a column name or as a
            PySpark ``Column`` expression.

    Returns:
        The ``Column`` produced by DQX's ``make_condition`` under the alias
        ``is_ipv4_private_address``. The condition handed to
        ``make_condition`` is the negated UDF result, so it is true for rows
        that FAIL validation (message semantics per the DQX
        ``make_condition`` contract -- confirm against the DQX version in use).

    Note:
        Which ranges count as private is decided by
        ``is_ipv4_private_address_udf``; the well-known private ranges are
        10.0.0.0/8, 172.16.0.0/12 and 192.168.0.0/16.
    """
    # Use the shared helper instead of re-implementing the str -> Column
    # conversion, matching every other check in this module.
    col = _as_column(column)
    error_message = f"Column `{column}` is not a valid IPv4 private address"
    return make_condition(
        # Negate the UDF: the condition marks the failing rows.
        ~is_ipv4_private_address_udf(col),
        error_message,
        "is_ipv4_private_address",
    )
@register_rule("row")
def is_ipv4_global_address(column: str | Column) -> Column:
    """Build a row-level DQX check flagging values that are not global IPv4 addresses.

    Args:
        column: The column to check, given either as a column name or as a
            PySpark ``Column`` expression.

    Returns:
        The ``Column`` produced by DQX's ``make_condition`` under the alias
        ``is_ipv4_global_address``. The condition handed to
        ``make_condition`` is the negated UDF result, so it is true for rows
        that FAIL validation (message semantics per the DQX
        ``make_condition`` contract -- confirm against the DQX version in use).

    Note:
        What counts as a global (publicly routable) address is decided by
        ``is_ipv4_global_address_udf``.
    """
    target = _as_column(column)
    # Negate the UDF: the condition marks the failing rows, not the valid ones.
    failed = ~is_ipv4_global_address_udf(target)
    message = f"Column `{column}` is not a valid IPv4 global address"
    return make_condition(failed, message, "is_ipv4_global_address")
@register_rule("row")
def is_ipv4_network(column: str | Column) -> Column:
    """Build a row-level DQX check flagging values that are not IPv4 networks.

    Args:
        column: The column to check, given either as a column name or as a
            PySpark ``Column`` expression. Values are expected to be
            networks in CIDR notation (e.g. "192.168.1.0/24").

    Returns:
        The ``Column`` produced by DQX's ``make_condition`` under the alias
        ``is_ipv4_network``. The condition handed to ``make_condition`` is
        the negated UDF result, so it is true for rows that FAIL validation
        (message semantics per the DQX ``make_condition`` contract --
        confirm against the DQX version in use).

    Note:
        The exact acceptance rules are those of ``is_ipv4_network_udf``.
    """
    target = _as_column(column)
    # Negate the UDF: the condition marks the failing rows, not the valid ones.
    failed = ~is_ipv4_network_udf(target)
    message = f"Column `{column}` is not a valid IPv4 network"
    return make_condition(failed, message, "is_ipv4_network")
@register_rule("row")
def is_ipv4_network_contains_address(
    column: str | Column, network: str | ipaddress.IPv4Network
) -> Column:
    """Build a row-level DQX check flagging addresses outside a given network.

    Args:
        column: The column holding IPv4 addresses, given either as a column
            name or as a PySpark ``Column`` expression.
        network: The network that should contain the addresses, either as a
            CIDR string (e.g. "192.168.1.0/24") or as an
            ``ipaddress.IPv4Network`` instance.

    Returns:
        The ``Column`` produced by DQX's ``make_condition`` under the alias
        ``is_ipv4_network_contains_address``. The condition handed to
        ``make_condition`` is the negated UDF result, so it is true for rows
        that FAIL the containment check (message semantics per the DQX
        ``make_condition`` contract -- confirm against the DQX version in use).

    Raises:
        ValueError: If ``network`` is a string that ``ipaddress.IPv4Network``
            rejects (malformed CIDR, or host bits set, since parsing is
            strict by default). This is raised eagerly when the check is
            built, not when the DataFrame is evaluated.
    """
    col = _as_column(column)
    if isinstance(network, str):
        # Parse eagerly so an invalid network fails fast, and normalise the
        # textual form that is shipped to the UDF.
        network_parsed = str(ipaddress.IPv4Network(network))
    else:
        network_parsed = str(network)

    # Interpolate the caller-supplied ``column`` (as the sibling checks do)
    # rather than the converted Column object, whose repr is ``Column<...>``.
    error_message = f"Network `{network_parsed}` does not contain address `{column}`"
    return make_condition(
        # PySpark UDFs interpret plain string arguments as column NAMES, so
        # the network must be wrapped in a literal to be passed by value.
        ~is_ipv4_network_contains_address_udf(f.lit(network_parsed), col),
        error_message,
        "is_ipv4_network_contains_address",
    )
@f.udf(t.BooleanType())
def is_ipv4_address_udf(address: str) -> bool:
    """Spark UDF wrapper around ``validate_ipv4_address``.

    Args:
        address: The candidate IPv4 address taken from a DataFrame row.

    Returns:
        Whatever ``validate_ipv4_address`` reports for ``address``; the UDF
        is registered with a boolean return type.

    Note:
        Internal building block of ``is_ipv4_address``; normally not called
        directly.
    """
    is_valid = validate_ipv4_address(address)
    return is_valid
@f.udf(t.BooleanType())
def is_ipv4_loopback_address_udf(address: str) -> bool:
    """Spark UDF wrapper around ``validate_loopback_ipv4_address``.

    Args:
        address: The candidate IPv4 address taken from a DataFrame row.

    Returns:
        Whatever ``validate_loopback_ipv4_address`` reports for ``address``;
        the UDF is registered with a boolean return type.

    Note:
        Internal building block of ``is_ipv4_loopback_address``; normally not
        called directly.
    """
    is_loopback = validate_loopback_ipv4_address(address)
    return is_loopback
@f.udf(t.BooleanType())
def is_ipv4_multicast_address_udf(address: str) -> bool:
    """Spark UDF wrapper around ``validate_multicast_ipv4_address``.

    Args:
        address: The candidate IPv4 address taken from a DataFrame row.

    Returns:
        Whatever ``validate_multicast_ipv4_address`` reports for ``address``;
        the UDF is registered with a boolean return type.

    Note:
        Internal building block of ``is_ipv4_multicast_address``; normally
        not called directly.
    """
    is_multicast = validate_multicast_ipv4_address(address)
    return is_multicast
@f.udf(t.BooleanType())
def is_ipv4_private_address_udf(address: str) -> bool:
    """Spark UDF wrapper around ``validate_private_ipv4_address``.

    Args:
        address: The candidate IPv4 address taken from a DataFrame row.

    Returns:
        Whatever ``validate_private_ipv4_address`` reports for ``address``;
        the UDF is registered with a boolean return type.

    Note:
        Internal building block of ``is_ipv4_private_address``; normally not
        called directly.
    """
    is_private = validate_private_ipv4_address(address)
    return is_private
@f.udf(t.BooleanType())
def is_ipv4_global_address_udf(address: str) -> bool:
    """Spark UDF wrapper around ``validate_global_ipv4_address``.

    Args:
        address: The candidate IPv4 address taken from a DataFrame row.

    Returns:
        Whatever ``validate_global_ipv4_address`` reports for ``address``;
        the UDF is registered with a boolean return type.

    Note:
        Internal building block of ``is_ipv4_global_address``; normally not
        called directly.
    """
    is_global = validate_global_ipv4_address(address)
    return is_global
@f.udf(t.BooleanType())
def is_ipv4_network_udf(network: str) -> bool:
    """Spark UDF wrapper around ``validate_ipv4_network``.

    Args:
        network: The candidate IPv4 network (CIDR notation) taken from a
            DataFrame row.

    Returns:
        Whatever ``validate_ipv4_network`` reports for ``network``; the UDF
        is registered with a boolean return type.

    Note:
        Internal building block of ``is_ipv4_network``; normally not called
        directly.
    """
    is_network = validate_ipv4_network(network)
    return is_network
@f.udf(t.BooleanType())
def is_ipv4_network_contains_address_udf(network: str, address: str) -> bool:
    """Spark UDF wrapper around ``validate_network_contains_ipv4_address``.

    Args:
        network: The IPv4 network in CIDR notation.
        address: The candidate IPv4 address taken from a DataFrame row.

    Returns:
        Whatever ``validate_network_contains_ipv4_address`` reports for the
        ``(network, address)`` pair; the UDF is registered with a boolean
        return type.

    Note:
        Internal building block of ``is_ipv4_network_contains_address``;
        normally not called directly.
    """
    is_contained = validate_network_contains_ipv4_address(network, address)
    return is_contained
def _as_column(column: str | Column) -> Column:
    """Return ``column`` as a PySpark ``Column``.

    A string is treated as a column name and resolved with ``f.col``; any
    other value is returned unchanged.
    """
    return f.col(column) if isinstance(column, str) else column