Coverage for src/es_fieldusage/main.py: 83%

125 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-03-26 18:26 -0600

1"""Main app definition""" 

2 

3# pylint: disable=R0902 

4import typing as t 

5import logging 

6from es_client.helpers.config import get_client 

7from es_fieldusage.helpers import utils as u 

8from es_fieldusage.exceptions import ResultNotExpected, ValueMismatch 

9 

10 

class FieldUsage:
    """Collect field usage statistics for indices and build reports from them.

    On construction the ``field_usage_stats`` API is queried for
    ``search_pattern`` and the per-shard counters are summed per field into
    ``self.usage_stats``. Results and reports are computed lazily on first
    access and cached on the instance.
    """

    def __init__(self, configdict: t.Dict[str, t.Any], search_pattern: str) -> None:
        """
        Build the client from ``configdict`` and immediately collect usage
        stats for every index matching ``search_pattern``.

        :param configdict: Configuration passed as-is to ``get_client``
        :param search_pattern: Index pattern passed to the field usage stats API
        :raises ResultNotExpected: if the field usage stats call fails
        """
        self.logger = logging.getLogger(__name__)
        self.client = get_client(configdict=configdict)
        # index name -> {field name -> summed usage count}
        self.usage_stats = {}
        # Lazy caches, filled by the properties of the same stem
        self.indices_data = []
        self.per_index_data = {}
        self.results_data = {}
        self.report_data = {}
        self.per_index_report_data = {}
        self.logger.info(
            f"Initializing FieldUsage with search pattern: {search_pattern}"
        )
        self.get(search_pattern)

    def get(self, search_pattern: str) -> None:
        """
        Get ``raw_data`` from the field_usage_stats API for all indices in
        ``search_pattern``. Iterate over ``raw_data`` to build
        ``self.usage_stats``

        :param search_pattern: Index pattern passed to the field usage stats API
        :raises ResultNotExpected: if the API call raises any exception
        """
        try:
            field_usage = self.client.indices.field_usage_stats(index=search_pattern)
        except Exception as exc:
            self.logger.error(f"Unable to get field usage: {exc}")
            raise ResultNotExpected(f'Unable to get field usage: {exc}') from exc
        for index in list(field_usage.keys()):
            if index == '_shards':
                # Ignore this key as it is "global"
                continue
            self.usage_stats[index] = self.sum_index_stats(field_usage, index)

    def get_field_mappings(self, idx: str) -> t.Dict[str, t.Any]:
        """
        Return only the field mappings for index ``idx`` (not the entire index
        mapping)

        :param idx: Name of the index whose mapping is fetched
        :returns: A copy of the ``properties`` section of the index mapping
        """
        return dict(
            self.client.indices.get_mapping(index=idx)[idx]['mappings']['properties']
        )

    def populate_values(
        self, idx: str, data: t.Dict[t.Any, t.Any]
    ) -> t.Dict[t.Any, t.Any]:
        """
        Add the field usage values for ``idx`` to ``data`` and return the result

        ``data`` is modified in place. A dotted field name is stored under a
        tuple of its dot-separated parts; any other name is stored as-is.

        :param idx: Index name; must be a key of ``self.usage_stats``
        :param data: Dictionary to receive the usage values
        :returns: The same ``data`` object, updated
        """
        for field, count in self.usage_stats[idx].items():
            path = tuple(field.split('.')) if '.' in field else field
            data[path] = count
        return data

    def get_resultset(self, idx: str) -> t.Dict[t.Any, t.Any]:
        """
        Populate a result set with the fields in the index mapping

        :param idx: Index name
        :returns: The converted mapping of ``idx`` overlaid with its usage
            values, or an empty dictionary when ``idx`` has no usage stats
        """
        if idx not in self.usage_stats:
            return {}
        allfields = u.convert_mapping(self.get_field_mappings(idx))
        return self.populate_values(idx, allfields)

    def merge_results(self, idx: str) -> t.Dict[str, t.Any]:
        """
        Merge field usage data with index mapping

        :param idx: Index name
        :returns: A flat dictionary keyed by dot-joined path, one entry per
            path yielded by ``u.iterate_paths`` over the result set
        """
        merged = {}
        data = self.get_resultset(idx)
        for path in u.iterate_paths(data):
            value = u.get_value_from_path(data, path)
            merged['.'.join(u.detuple(path))] = value
        return merged

    def verify_single_index(self, index: t.Optional[str] = None) -> str:
        """
        Ensure the index count is 1 for certain methods
        If no index provided, and only one is in self.indices, use that one

        :param index: Explicit index name; returned unchanged when given
        :returns: The single index name to operate on
        :raises ValueMismatch: if no index is given and ``self.indices`` holds
            zero or more than one index
        """
        if index is not None:
            return index
        found = self.indices
        if not isinstance(found, list):
            # ``indices`` returns a bare string when exactly one index exists
            return found
        if len(found) > 1:
            self.logger.warning(f"Too many indices found: {found}")
            msg = (
                f'Too many indices found. Indicate single index for result, '
                f'or use results for all indices. Found: {found}'
            )
            raise ValueMismatch(msg)
        if len(found) < 1:
            raise ValueMismatch('No indices found.')
        # A one-element list is not produced by ``indices`` as written here,
        # but honour the ``-> str`` contract rather than returning None.
        return found[0]

    @property
    def per_index_report(self) -> t.Dict[str, t.Any]:
        """
        Generate report data split out per index

        :returns: ``{index: {'accessed': {...}, 'unaccessed': {...}}}`` where a
            field lands in ``unaccessed`` when its value equals 0
        """
        if not self.per_index_report_data:
            # Do not write to ``self.report_data`` here: ``report`` treats a
            # non-empty ``report_data`` as already built, so seeding it from
            # this property made ``report`` return without its
            # 'accessed'/'unaccessed' sections.
            for idx, fields in self.results_by_index.items():
                buckets = {'accessed': {}, 'unaccessed': {}}
                for key, value in fields.items():
                    bucket = 'unaccessed' if value == 0 else 'accessed'
                    buckets[bucket][key] = value
                self.per_index_report_data[idx] = buckets
        return self.per_index_report_data

    @property
    def report(self) -> t.Dict[str, t.Any]:
        """
        Generate summary report data across all indices

        :returns: A dictionary with keys ``indices``, ``field_count``,
            ``accessed`` and ``unaccessed``; a field lands in ``unaccessed``
            when its value equals 0
        """
        if not self.report_data:
            summary = self.results
            self.report_data['indices'] = self.indices
            self.report_data['field_count'] = len(summary.keys())
            self.report_data['accessed'] = {}
            self.report_data['unaccessed'] = {}
            for key, value in summary.items():
                bucket = 'unaccessed' if value == 0 else 'accessed'
                self.report_data[bucket][key] = value
        return self.report_data

    def result(self, idx: t.Optional[str] = None) -> t.Dict[str, t.Any]:
        """
        Return a single index result as a dictionary

        :param idx: Index name; may be omitted when exactly one index was found
        :raises ValueMismatch: if ``idx`` is omitted and there is not exactly
            one index
        """
        idx = self.verify_single_index(index=idx)
        return u.sort_by_value(self.merge_results(idx))

    @property
    def results_by_index(self) -> t.Dict[str, t.Dict[str, t.Any]]:
        """
        Return all results as a dictionary, with the index name as the root key,
        and all stats for that index as the value, which is a dictionary generated
        by ``self.result()``.
        """
        if not self.per_index_data:
            found = self.indices
            # ``indices`` yields a bare string for a single index
            idx_list = found if isinstance(found, list) else [found]
            for idx in idx_list:
                self.per_index_data[idx] = self.result(idx=idx)
        return self.per_index_data

    @property
    def results(self) -> t.Dict[str, t.Any]:
        """Return results for all indices found with values summed per mapping leaf"""
        # The summing re-orders things so it needs to be re-sorted
        if not self.results_data:
            summed = u.sort_by_value(u.sum_dict_values(self.results_by_index))
            self.results_data = dict(summed)
        return self.results_data

    @property
    def indices(self) -> t.Union[str, t.List[str]]:
        """
        Return all indices found

        :returns: The index name itself when exactly one index was found,
            otherwise the list of index names (possibly empty)
        """
        if not self.indices_data:
            self.indices_data = list(self.usage_stats.keys())
        if len(self.indices_data) == 1:
            return self.indices_data[0]
        return self.indices_data

    def sum_index_stats(
        self, field_usage: t.Dict[str, t.Any], idx: str
    ) -> t.Dict[str, int]:
        """
        Per field, sum all of the usage stats for all shards in ``idx``

        :param field_usage: Field usage stats response keyed by index name
        :param idx: Index whose shards are summed
        :returns: ``{field name: sum of the 'any' counter over all shards}``
        """
        totals = {}
        for shard in field_usage[idx]['shards']:
            for field, stats in shard['stats']['fields'].items():
                if field in ('_id', '_source'):
                    # We don't care about these because these can be used by
                    # runtime queries
                    continue
                totals[field] = totals.get(field, 0) + stats['any']
        return totals