Coverage for src/html2json/__init__.py: 67%

55 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-30 00:24 -0700

1from __future__ import annotations 

2 

3import json 

4import re 

5from re import Match, Pattern 

6from typing import Any, cast 

7 

8from pyquery import PyQuery 

9 

# A template maps output keys to extraction rules; the collected data is a
# plain dictionary built by walking that template.
Template = dict[str, Any]
Data = dict[str, Any]

# Grammar of a cleaner expression. Any single non-word character may act as
# the separator, e.g. "/search/", "s/search/sub/" or "s/search/sub/g".
# The pieces below concatenate into one pattern.
__CLEANER_REGEX: Pattern = re.compile(
    r"(?P<mode>s)?"  # optional leading "s" selects substitution
    r"(?P<sep>\W)"  # the separator character
    r"(?P<search>(?:(?!(?P=sep)).)*)"  # search regex: anything but the separator
    r"(?P=sep)"
    # Optional tail: replacement text, closing separator, optional "g" flag.
    r"(?:(?P<sub>(?:(?!(?P=sep)).)*)(?P=sep)(?P<flag>g)?)?"
)

14 

15 

def __extract(
    root: PyQuery,
    selector: str | None = None,
    prop: str | None = None,
    cleaners: list[str] | None = None,
) -> str | list[str] | None:
    """Extract cleaned values from the tags selected below ``root``.

    Args:
        root: Node(s) to search from.
        selector: Selector passed to ``root.find``. When empty or ``None``,
            ``root`` itself is used.
        prop: Attribute to read from each tag. When empty or ``None``, the
            tag's text is used instead.
        cleaners: Cleaner expressions applied, in order, to every value.
            An expression whose mode is ``s`` substitutes (first match only,
            or every match with the trailing ``g`` flag); any other
            expression replaces the value with the first match of its
            search pattern.

    Returns:
        ``None`` when the selector is invalid or matches nothing, a single
        string when exactly one value was extracted, otherwise a list of
        strings.

    Raises:
        AttributeError: If a cleaner does not fit the cleaner grammar, or a
            search-mode cleaner finds no match in a value (both surface as
            ``None`` having no ``group``).
    """
    try:
        tags: PyQuery = root.find(selector) if selector else root
        # Non-matching selector
        if len(tags) == 0:
            return None
    except Exception:  # noqa: BLE001
        # Invalid selector. Deliberately broad (the selector machinery may
        # raise assorted error types), but unlike a bare `except` it lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

    # Parse every cleaner once up front instead of once per tag.
    parsed: list[Match] = [
        cast("Match", __CLEANER_REGEX.match(c)) for c in cleaners or []
    ]

    results: list[str] = []

    # Must use `.items()` which returns `PyQuery` objects
    for tag in tags.items():
        # NOTE(review): if `attr` yields None for a missing attribute, the
        # value becomes the literal string "None" -- confirm this is intended.
        v: str = str(tag.attr(prop) if prop else tag.text()).strip()

        for m in parsed:
            if m.group("mode") == "s":
                v = re.sub(
                    m.group("search"),
                    m.group("sub"),
                    v,
                    # `count=0` means "replace all" for `re.sub`.
                    count=(0 if m.group("flag") == "g" else 1),
                )
            else:
                v = cast("Match", re.search(m.group("search"), v)).group(0)

        results.append(v)

    # A lone value is unwrapped; several values are returned as a list.
    return results if len(results) > 1 else results[0]

56 

57 

def __collect_keys(root: PyQuery, key_template: str) -> list[str]:
    """Resolve one template key into the concrete key(s) it stands for.

    A key wrapped in square brackets is parsed as a JSON array whose items
    are passed to ``__extract`` as ``selector``, ``prop`` and ``cleaners``;
    every extracted value becomes a key. Any other key, including the empty
    string, is used literally.

    Args:
        root: Node(s) the dynamic key is extracted from.
        key_template: Literal key or JSON-array extraction spec.

    Returns:
        The list of concrete keys; empty when a dynamic key extracts nothing.
    """
    # `startswith`/`endswith` are safe on an empty key, where indexing
    # `key_template[0]` would raise IndexError.
    if key_template.startswith("[") and key_template.endswith("]"):
        keys: str | list[str] = __extract(root, *json.loads(key_template)) or []
        # `__extract` unwraps a single value; re-wrap it for a uniform result.
        return keys if isinstance(keys, list) else [keys]

    return [key_template]

64 

65 

def __expand_template(root: PyQuery, template: Template) -> Template:
    """Return a copy of ``template`` with every key resolved to concrete keys.

    Each key is expanded through ``__collect_keys``; all keys produced from
    one template key share that key's value. If the same concrete key is
    produced more than once, the value seen last is kept.
    """
    expanded: Template = {}

    for key_template, value in template.items():
        for key in __collect_keys(root, key_template):
            expanded[key] = value

    return expanded

72 

73 

def collect(html: str, template: Template) -> Data:
    """Collect structured data from ``html`` as described by ``template``.

    Template keys are first expanded with ``__expand_template``. Each value
    (rule) is then interpreted as follows, where ``{key}`` inside selectors
    and plain strings is replaced by the concrete key:

    * ``dict``: a nested template evaluated against the same root.
    * ``[[selector, sub_template]]``: a list with one nested result per node
      matched by ``selector``.
    * ``[selector, sub_template]`` (``sub_template`` being a ``dict``): one
      nested result evaluated against the node(s) matched by ``selector``.
    * any other list: arguments for ``__extract`` (``selector``, then the
      remaining items); an empty list extracts from the root itself.
    * ``str``: the formatted string; anything else is copied verbatim.

    Args:
        html: HTML source to parse.
        template: Template describing the data to collect.

    Returns:
        The collected data, keyed by the expanded template keys.
    """

    def select(root: PyQuery, selector: str | None, key: str) -> PyQuery:
        # An empty or missing selector denotes `root` itself, consistent
        # with `__extract`, rather than being passed to `find` as `None`.
        if not selector:
            return root
        return root.find(selector.format(key=key))

    def collect_rec(root: PyQuery, template: Template, data: Data) -> None:
        for key, rule in __expand_template(root, template).items():
            if isinstance(rule, dict):
                data[key] = {}
                collect_rec(root, rule, data[key])
            elif isinstance(rule, list):
                if len(rule) == 1 and isinstance(rule[0], list):
                    sub_selector, sub_template = rule[0]
                    sub_roots = select(root, sub_selector, key)

                    data[key] = []
                    # Must use `.items()` which returns `PyQuery` objects
                    for sub_root in sub_roots.items():
                        data[key].append({})
                        collect_rec(sub_root, sub_template, data[key][-1])
                elif len(rule) == 2 and isinstance(rule[1], dict):
                    sub_root = select(root, rule[0], key)

                    data[key] = {}
                    collect_rec(sub_root, rule[1], data[key])
                elif rule:
                    selector = rule[0].format(key=key) if rule[0] else None
                    data[key] = __extract(root, selector, *rule[1:])
                else:
                    # Empty rule: extract from the root with all defaults.
                    data[key] = __extract(root)
            else:
                data[key] = rule.format(key=key) if isinstance(rule, str) else rule

    data: Data = {}
    collect_rec(PyQuery(html), template, data)

    return data