Coverage for /var/devmt/py/utils4_1.7.0/utils4/dfdiff.py: 100%

65 statements  

« prev     ^ index     » next       coverage.py v7.6.9, created at 2024-12-21 17:18 +0000

1#!/usr/bin/env python3 

2# -*- coding: utf-8 -*- 

3""" 

4:Purpose: This module provides DataFrame differencing logic. 

5 

6 The caller creates an instance which accepts the two DataFrames 

7 to be compared as the arguments. When the 

8 :meth:`~DataFrameDiff.diff` method is called, a list of columns 

9 containing value mismatches is compiled. Then, the list of column 

10 mismatches is iterated with each value in the column being 

11 compared. All value mismatches are reported to the terminal. 

12 

13:Platform: Linux/Windows | Python 3.7+ 

14:Developer: J Berendt 

15:Email: support@s3dev.uk 

16 

17:Note: It's worth noting that current functionality **does not 

18 check data types**, unlike the pandas ``pd.DataFrame.equals()`` 

19 method. This functionality may be added in a future release. 

20 

21:Example: 

22 

23 Short example for differencing two DataFrames:: 

24 

25 >>> from utils4 import dfdiff 

26 

27 >>> d = dfdiff.DataFrameDiff(df_source, df_test) 

28 >>> d.diff() 

29 

30""" 

31# pylint: disable=wrong-import-order 

32 

33import pandas as pd 

34from itertools import zip_longest 

35try: 

36 from .user_interface import ui 

37except ImportError: # pragma: nocover 

38 from user_interface import ui 

39 

40 

class _Messages:
    """This private class handles the messaging for DataFrame differencing.

    Every method writes directly to the terminal. Headline messages are
    passed to ``ui.print_`` with a foreground colour argument; the list
    and table rows are written with the built-in ``print``.

    """

    # Row template for the mismatch table: two 10-wide row-index fields
    # followed by two 25-wide value fields, tab separated. The ``!s``
    # conversion turns each field into a string *before* the alignment
    # spec is applied. Without it, index labels whose ``__format__`` does
    # not understand ``<10`` break the report: a tuple label raises a
    # ``TypeError`` and a datetime-like label treats the spec as a
    # strftime pattern.
    _FMT = '{!s:<10}\t{!s:<10}\t{!s:<25}\t{!s:<25}'

    @staticmethod
    def column_mismatches(columns: list):
        """List columns with mismatches.

        Args:
            columns (list): A list of columns containing mismatches.

        """
        ui.print_('\nColumn mismatches:', fore='cyan', style='normal')
        # One bullet per column; an empty list still emits a blank line.
        print('\n'.join(f'- {col}' for col in columns))

    @staticmethod
    def column_mismatches_none():
        """Print message for no column mismatches."""
        ui.print_('\nNo mismatches for this set.', fore='green')

    def data_mismatches(self, column: str, mismatches: list):
        """Print the data mismatches as a four-column table.

        Args:
            column (str): Name of the column being analysed.
            mismatches (list): A list of tuples containing data mismatches,
                each as (source row, test row, source value, test value),
                for example::

                    [(0, 0, 1, 2), (1, 1, 3, 4)]

        """
        ui.print_(f'Data mismatches for column: {column}', fore='yellow')
        print(self._FMT.format('SrcRow', 'TstRow', 'SrcValue', 'TstValue'))
        print('-' * 92)
        # One formatted row per mismatch tuple.
        print('\n'.join(self._FMT.format(*row) for row in mismatches))
        print()

    @staticmethod
    def data_mismatches_none(column: str):
        """Print message for no data mismatches.

        Args:
            column (str): Name of the column being analysed.

        """
        ui.print_(f'\nNo data mismatches for {column}', fore='green')

89 

90 

class DataFrameDiff:
    """Test and report differences in two pandas DataFrames.

    Args:
        df_source (pd.DataFrame): DataFrame containing **source** data.
            This dataset holds the **expected** results.
        df_test (pd.DataFrame): DataFrame containing the **test** data.
            This dataset is compared against the 'expected' dataset.

    """

    def __init__(self, df_source: pd.DataFrame, df_test: pd.DataFrame):
        """DataFrame difference class initialiser."""
        self._df_s = df_source
        self._df_t = df_test
        self._col_mismatches = []
        self._msg = _Messages()

    def diff(self):
        """Compare DataFrames and report the differences."""
        self._get_mismatches()
        self._report()

    def _get_mismatches(self):
        """Build, store and report the list of columns with mismatches.

        Only the columns of the source DataFrame are inspected; each is
        compared to the same-named test column with ``Series.equals``.

        """
        mis = [col for col in self._df_s.columns
               if not self._df_t[col].equals(self._df_s[col])]
        # Store the fresh result first and report *that* list. Reporting
        # the attribute before it was updated printed the previous run's
        # columns (an empty list on the first call).
        self._col_mismatches = mis
        if mis:
            self._msg.column_mismatches(columns=mis)
        else:
            self._msg.column_mismatches_none()

    @staticmethod
    def _comparable(src, tst):
        """Return the two cell values in a form suitable for ``!=``.

        Args:
            src: Cell value from the source DataFrame.
            tst: Cell value from the test DataFrame.

        Returns:
            tuple: The (source, test) values, normalised for comparison.

        """
        # Convert datetimes to string for compare.
        if isinstance(tst, pd.Timestamp):
            return str(src), str(tst)
        # Enable compare of nan types.
        if pd.isna(src) or pd.isna(tst):
            # A missing source value against a test NaT counts as equal.
            if pd.isna(src) and tst is pd.NaT:
                return 'NaT', 'NaT'
            return str(src), str(tst)
        # Round floats so representation noise is not reported.
        if isinstance(src, float) or isinstance(tst, float):
            return round(float(src), 5), round(float(tst), 5)
        # Everything else is compared by its string form.
        return str(src), str(tst)

    def _report(self) -> None:
        """Compare values in mismatched columns and report.

        Rows of the two DataFrames are paired by position. When one
        DataFrame has more rows than the other, the unpaired rows are
        reported against a 'no value' placeholder.

        """
        for col in self._col_mismatches:
            mismatches = []
            # Zip source and test datasets, padding the shorter one.
            pairs = zip_longest(self._df_s.iterrows(),
                                self._df_t.iterrows(),
                                fillvalue=(None, None))
            for (idx1, row1), (idx2, row2) in pairs:
                if row1 is None or row2 is None:
                    # A row exists in one dataset and not the other;
                    # borrow the index label from the side that has one.
                    idx1 = idx1 if idx1 is not None else idx2
                    idx2 = idx2 if idx2 is not None else idx1
                    val1 = str(row1[col]) if row1 is not None else 'no value (source)'
                    val2 = str(row2[col]) if row2 is not None else 'no value (test)'
                else:
                    val1, val2 = self._comparable(row1[col], row2[col])
                # Do the compare; collect any mismatch for reporting.
                if val1 != val2:
                    mismatches.append((idx1, idx2, val1, val2))
            if mismatches:
                self._msg.data_mismatches(column=col, mismatches=mismatches)
            else:
                self._msg.data_mismatches_none(column=col)