Coverage for jutil/xml.py: 92%

100 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2022-10-07 16:40 -0500

1import re 

2from typing import Optional, Iterable, Dict, Any, List 

3from xml import etree 

4from xml.etree.ElementTree import Element, SubElement 

5 

6 

7def _xml_element_value(el: Element, is_int: bool = False) -> Any: 

8 """ 

9 Gets XML Element value. 

10 :param el: Element 

11 :param is_int: If True return value is converted to int (if possible) 

12 :return: value of the element (int/str) 

13 """ 

14 # None 

15 if el.text is None: 

16 return None 

17 # int 

18 try: 

19 if is_int: 

20 return int(el.text) 

21 except Exception: # nosec 

22 pass 

23 # default to str if not empty 

24 s = str(el.text).strip() 

25 return s if s else None 

26 

27 

28def _xml_tag_filter(s: str, strip_namespaces: bool) -> str: 

29 """ 

30 Returns tag name and optionally strips namespaces. 

31 :param s: Tag name 

32 :param strip_namespaces: Strip namespace prefix 

33 :return: str 

34 """ 

35 if strip_namespaces: 35 ↛ 43line 35 didn't jump to line 43, because the condition on line 35 was never false

36 ns_end = s.find("}") 

37 if ns_end != -1: 

38 s = s[ns_end + 1 :] 

39 else: 

40 ns_end = s.find(":") 

41 if ns_end != -1: 41 ↛ 42line 41 didn't jump to line 42, because the condition on line 41 was never true

42 s = s[ns_end + 1 :] 

43 return s 

44 

45 

def _xml_set_element_data_r(  # pylint: disable=too-many-arguments,too-many-locals
    data: dict,
    el: Element,
    array_tags: Iterable[str],
    int_tags: Iterable[str],
    strip_namespaces: bool,
    parse_attributes: bool,
    value_key: str,
    attribute_prefix: str,
):
    """
    Converts element (recursively) and stores the result to data under the element tag name.
    :param data: dict receiving the converted element
    :param el: Element to convert
    :param array_tags: Tags that are always stored as lists
    :param int_tags: Tags whose (non-array) values are converted to int if possible
    :param strip_namespaces: Strip namespaces from tag and attribute names
    :param parse_attributes: If False element attributes are ignored
    :param value_key: Key used for text value of a complex element
    :param attribute_prefix: Key prefix used for attribute values
    """
    tag = _xml_tag_filter(el.tag, strip_namespaces)
    attrib = el.attrib if parse_attributes else {}

    # A tag already present in data is a repeated sibling, so it becomes an array
    is_array = tag in data or tag in array_tags
    is_int = not is_array and tag in int_tags
    value = _xml_element_value(el, is_int=is_int)

    if attrib or len(el) > 0:
        # complex type: text value (if any), attributes and children share one dict
        node: Any = {} if value is None else {value_key: value}
        for attr_name, attr_value in attrib.items():
            node[attribute_prefix + _xml_tag_filter(attr_name, strip_namespaces)] = attr_value
        for child in el:
            _xml_set_element_data_r(
                node,
                child,
                array_tags=array_tags,
                int_tags=int_tags,
                strip_namespaces=strip_namespaces,
                parse_attributes=parse_attributes,
                value_key=value_key,
                attribute_prefix=attribute_prefix,
            )
    else:
        # simple type: the plain value is stored as is
        node = value

    # store result
    if not is_array:
        if tag in data:
            raise Exception("XML parsing failed, tag {} collision".format(tag))
        data[tag] = node
        return

    items = data.setdefault(tag, [])
    if not isinstance(items, list):
        # first occurrence was stored as a single value, promote it to a list
        items = [items]
        data[tag] = items
    items.append(node)

101 

102 

def xml_to_dict(  # pylint: disable=too-many-arguments,too-many-locals
    xml_bytes: bytes,
    tags: Optional[Iterable[str]] = None,
    array_tags: Optional[Iterable[str]] = None,
    int_tags: Optional[Iterable[str]] = None,
    strip_namespaces: bool = True,
    parse_attributes: bool = True,
    value_key: str = "@",
    attribute_prefix: str = "@",
    document_tag: bool = False,
) -> Dict[str, Any]:
    """
    Parses XML to dict. Simple elements (no children, no attributes) are stored as plain values.
    Complex elements are stored as dicts: text value in key '@', attributes in keys '@xxx' and
    children as nested values. Namespaces are stripped from tag and attribute names by default.

    For example:
        <Doc version="1.2">
            <A class="x">
                <B class="x2">hello</B>
            </A>
            <A class="y">
                <B class="y2">world</B>
            </A>
            <C>value node</C>
        </Doc>
    is returned as follows:
        {'@version': '1.2',
         'A': [{'@class': 'x', 'B': {'@': 'hello', '@class': 'x2'}},
               {'@class': 'y', 'B': {'@': 'world', '@class': 'y2'}}],
         'C': 'value node'}

    Args:
        xml_bytes: XML file contents in bytes
        tags: list of tags to parse (pass empty to return all children of top-level tag)
        array_tags: list of tags that should be treated as arrays by default
        int_tags: list of tags that should be treated as ints
        strip_namespaces: if true namespaces will be stripped
        parse_attributes: Elements with attributes are stored as complex types with '@' identifying text value and @xxx identifying each attribute
        value_key: Key to store (complex) element value. Default is '@'
        attribute_prefix: Key prefix to store element attribute values. Default is '@'
        document_tag: Set True if Document root tag should be included as well

    Returns: dict

    Raises:
        Exception: if document_tag is True while a non-empty tags list is given
    """
    tags = [] if tags is None else tags
    array_tags = [] if array_tags is None else array_tags
    int_tags = [] if int_tags is None else int_tags

    root = etree.ElementTree.fromstring(xml_bytes)

    root_elements: List[Element]
    if not tags:
        # no selection: convert the direct children of the document root
        root_elements = list(root)
    elif document_tag:
        raise Exception(
            "xml_to_dict: document_tag=True does not make sense when using selective tag list "
            "since selective tag list finds tags from the whole document, not only directly under root document tag"
        )
    else:
        # selected tags are searched from the whole document
        root_elements = [found for wanted in tags for found in root.iter(wanted)]

    data: Dict[str, Any] = {}
    for el in root_elements:
        _xml_set_element_data_r(
            data,
            el,
            array_tags=array_tags,
            int_tags=int_tags,
            strip_namespaces=strip_namespaces,
            parse_attributes=parse_attributes,
            value_key=value_key,
            attribute_prefix=attribute_prefix,
        )

    # set root attributes
    if parse_attributes:
        for attr_name, attr_value in root.attrib.items():
            data[attribute_prefix + _xml_tag_filter(attr_name, strip_namespaces)] = attr_value

    if document_tag:
        return {root.tag: data}
    return data

187 

188 

189def _xml_filter_tag_name(tag: str) -> str: 

190 return re.sub(r"\[\d+\]", "", tag) 

191 

192 

193def _xml_element_set_data_r(el: Element, data: dict, value_key: str, attribute_prefix: str): 

194 # print('_xml_element_set_data_r({}): {}'.format(el.tag, data)) 

195 if not hasattr(data, "items"): 

196 data = {"@": data} 

197 for k, v in data.items(): 

198 if k == value_key: 

199 el.text = str(v) 

200 elif k.startswith(attribute_prefix): 

201 el.set(k[1:], str(v)) 

202 elif isinstance(v, (list, tuple)): 

203 for v2 in v: 

204 el2 = SubElement(el, _xml_filter_tag_name(k)) 

205 assert isinstance(el2, Element) 

206 _xml_element_set_data_r(el2, v2, value_key, attribute_prefix) 

207 elif isinstance(v, dict): 

208 el2 = SubElement(el, _xml_filter_tag_name(k)) 

209 assert isinstance(el2, Element) 

210 _xml_element_set_data_r(el2, v, value_key, attribute_prefix) 

211 else: 

212 el2 = SubElement(el, _xml_filter_tag_name(k)) 

213 assert isinstance(el2, Element) 

214 el2.text = str(v) 

215 

216 

def dict_to_element(doc: dict, value_key: str = "@", attribute_prefix: str = "@") -> Element:
    """
    Generates XML Element from dict.
    Generates complex elements by assuming element attributes are prefixed with '@', and value is stored to plain '@'
    in case of complex element. Children are sub-dicts. Lists generate one child element per item, and
    a numeric index suffix in a key (e.g. 'D[1]') is removed from the tag name, so distinct keys
    can produce repeated elements.

    For example:
        {
            'Doc': {
                '@version': '1.2',
                'A': [{'@class': 'x', 'B': {'@': 'hello', '@class': 'x2'}},
                      {'@class': 'y', 'B': {'@': 'world', '@class': 'y2'}}],
                'C': 'value node',
                'D[1]': 'value node line 1',
                'D[2]': 'value node line 2',
            }
        }
    corresponds to the following XML:
        <Doc version="1.2">
            <A class="x">
                <B class="x2">hello</B>
            </A>
            <A class="y">
                <B class="y2">world</B>
            </A>
            <C>value node</C>
            <D>value node line 1</D>
            <D>value node line 2</D>
        </Doc>

    Args:
        doc: dict. Must have a single root key.
        value_key: Key to store (complex) element value. Default is '@'
        attribute_prefix: Key prefix to store element attribute values. Default is '@'

    Returns: xml.etree.ElementTree.Element

    Raises:
        Exception: if doc does not contain exactly one root key
    """
    if len(doc) != 1:
        raise Exception("Invalid data dict for XML generation, document root must have single element")

    # Exactly one item is guaranteed by the check above
    tag, data = next(iter(doc.items()))
    el = Element(_xml_filter_tag_name(tag))
    _xml_element_set_data_r(el, data, value_key, attribute_prefix)
    return el