Coverage for src/html2json/__init__.py: 100%

44 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-25 23:15 -0700

1from __future__ import annotations 

2 

3import re 

4from re import Match, Pattern 

5from typing import Any, cast 

6 

7from pyquery import PyQuery 

8 

# Shape of the extraction template handed to `collect`: string keys mapped to
# nested dicts or lists describing what to extract.
Template = dict[str, Any]

# Shape of the collected result: string keys mapped to the extracted values.
Data = dict[str, Any]

# Grammar of a cleaner expression. The pieces below are adjacent raw strings,
# so they concatenate into one single pattern.
__CLEANER_REGEX: Pattern = re.compile(
    r"(?P<mode>s)?"                      # optional "s" prefix: substitution mode
    r"(?P<sep>\W)"                       # any non-word character is the separator
    r"(?P<search>(?:(?!(?P=sep)).)*)"    # search pattern: anything up to the separator
    r"(?P=sep)"                          # separator closing the search pattern
    r"(?:"                               # optional replacement part
    r"(?P<sub>(?:(?!(?P=sep)).)*)"       # replacement: anything up to the separator
    r"(?P=sep)"                          # separator closing the replacement
    r"(?P<flag>g)?"                      # optional "g" flag
    r")?"
)

13 

14 

def __extract(
    root: PyQuery,
    selector: str | None = None,
    prop: str | None = None,
    cleaners: list[str] | None = None,
) -> str | list[str] | None:
    """Extract cleaned text or attribute values from the tags under ``root``.

    Args:
        root: Element(s) to start from.
        selector: Selector passed to ``root.find``. When empty or ``None``,
            ``root`` itself is used.
        prop: Name of the attribute to read from each tag. When empty or
            ``None``, the tag's text is used instead.
        cleaners: Cleaner expressions applied, in order, to every value.
            An expression starting with ``s`` (``s<sep>search<sep>sub<sep>``)
            substitutes the first match of ``search`` with ``sub``, or every
            match when followed by ``g``. Any other expression
            (``<sep>search<sep>``) replaces the value with the first match of
            ``search``.

    Returns:
        ``None`` if the selector is invalid or matches nothing; a single
        string if exactly one tag is processed; otherwise a list of strings,
        one per tag.

    Raises:
        AttributeError: If a cleaner does not match the cleaner syntax, or an
            extraction cleaner finds no match in a value.
    """
    try:
        tags: PyQuery = root.find(selector) if selector else root
        # Non-matching selector
        if len(tags) == 0:
            return None
    except Exception:  # noqa: BLE001
        # Invalid selector. Only `Exception` is caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        return None

    # Parse each cleaner once per call rather than once per tag.
    parsed: list[Match] = [
        cast("Match", __CLEANER_REGEX.match(c)) for c in cleaners or []
    ]

    results: list[str] = []

    # Must use `.items()` which returns `PyQuery` objects
    for tag in tags.items():
        raw = tag.attr(prop) if prop else tag.text()
        # NOTE(review): if `raw` can be None (presumably for a missing
        # attribute), `str()` turns it into the literal "None" - confirm
        # this is intended.
        v: str = str(raw).strip()

        for m in parsed:
            if m.group("mode") == "s":
                v = re.sub(
                    m.group("search"),
                    m.group("sub"),
                    v,
                    # `count=0` means "replace all" for `re.sub`.
                    count=(0 if m.group("flag") == "g" else 1),
                )
            else:
                v = cast("Match", re.search(m.group("search"), v)).group(0)

        results.append(v)

    # A single result is unwrapped; several are returned as a list.
    return results if len(results) > 1 else results[0]

55 

56 

def collect(html: str, template: Template) -> Data:
    """Build a data dictionary from ``html`` following the shape of ``template``.

    Each template value decides how its key is filled:

    * a dict: a nested dict collected from the same root;
    * ``[[selector, sub_template]]``: a list holding one dict per element
      found by ``selector``;
    * ``[selector, sub_template]`` where ``sub_template`` is a dict: a single
      dict collected from the elements found by ``selector``;
    * any other list: its items are passed on to ``__extract`` as positional
      arguments after the root;
    * anything else: the key is left out of the result.
    """

    def build(node: PyQuery, spec: Template) -> Data:
        out: Data = {}

        for key, rule in spec.items():
            if isinstance(rule, dict):
                out[key] = build(node, rule)
                continue

            if not isinstance(rule, list):
                continue

            if len(rule) == 1 and isinstance(rule[0], list):
                sub_selector, sub_template = rule[0]
                # Must use `.items()` which returns `PyQuery` objects
                out[key] = [
                    build(child, sub_template)
                    for child in node.find(sub_selector).items()
                ]
            elif len(rule) == 2 and isinstance(rule[1], dict):
                out[key] = build(node.find(rule[0]), rule[1])
            else:
                out[key] = __extract(node, *rule)

        return out

    return build(PyQuery(html), template)