Coverage for src/html2json/html2json.py: 0%

46 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-25 18:24 -0700

1import re 

2from re import Match, Pattern 

3from typing import Any, cast 

4 

5from pyquery import PyQuery 

6 

# A template maps each output key to an extraction spec. As consumed by
# ``collect``, a spec is one of:
#   * a dict                              -> nested template, same root
#   * ``[[selector, template]]``          -> list of objects, one per match
#   * ``[selector, template_dict]``       -> nested object under ``selector``
#   * ``[selector, prop, cleaners]``      -> single value via ``__extract``
Template = dict[str, Any]
# Extracted result; mirrors the key structure of the template it came from.
Data = dict[str, Any]

# Parses one cleaner expression. Named groups:
#   mode   - optional leading "s" (substitution; otherwise extraction)
#   sep    - a single non-word character used as the delimiter
#   search - the regex to apply (any characters up to the next ``sep``)
#   sub    - optional replacement text (up to the following ``sep``)
#   flag   - optional trailing "g" (replace every occurrence)
# e.g. ``s/\s+/ /g`` or ``/\d+/``.
CLEANER_REGEX: Pattern = re.compile(r"(?P<mode>s)?(?P<sep>\W)(?P<search>(?:(?!(?P=sep)).)*)(?P=sep)(?:(?P<sub>(?:(?!(?P=sep)).)*)(?P=sep)(?P<flag>g)?)?") # noqa: E501

11 

12 

def __extract(
    root: PyQuery,
    selector: str | None,
    prop: str | None,
    cleaners: list[str],
) -> str | None:
    """Extract one cleaned string value from ``root``.

    Args:
        root: Element set to search in.
        selector: Selector passed to ``root.find``; when falsy, ``root``
            itself is used.
        prop: Name of the attribute to read; when falsy, the element's text
            is used instead.
        cleaners: Cleaner expressions (see ``CLEANER_REGEX``), applied in
            order. ``s<sep>search<sep>sub<sep>`` replaces the first match of
            ``search`` (every match with a trailing ``g``);
            ``<sep>search<sep>`` keeps only the first match of ``search``.

    Returns:
        The cleaned value, or ``None`` when the selector is invalid or
        matches nothing, the requested attribute is absent, or an extraction
        cleaner finds no match.

    Raises:
        ValueError: If a cleaner expression cannot be parsed, or a
            substitution cleaner has no replacement part.
    """
    try:
        tag = root.find(selector) if selector else root
    except Exception:  # noqa: BLE001
        # Invalid selector. ``Exception`` rather than a bare ``except`` so
        # that KeyboardInterrupt / SystemExit still propagate.
        return None

    # Non-matching selector
    if not tag:
        return None

    value: str
    if prop:
        attr = tag.attr(prop)
        if attr is None:
            # Missing attribute: report "no value" rather than the literal
            # string "None" that ``str(None)`` would produce.
            return None
        value = str(attr)
    else:
        # Prefer the element's own direct text nodes; fall back to the full
        # text when there are none.
        value = ''.join(c for c in tag.contents() if isinstance(c, str))
        if not value:
            value = str(tag.text())
    # NOTE(review): the strip is applied to both attribute and text values;
    # confirm this matches the intended nesting of the original statement.
    value = value.strip()

    for cleaner in cleaners:
        parsed = CLEANER_REGEX.match(cleaner)
        if parsed is None:
            raise ValueError(f"Invalid cleaner expression: {cleaner!r}")

        pattern = parsed.group("search")
        if parsed.group("mode") == "s":
            replacement = parsed.group("sub")
            if replacement is None:
                raise ValueError(
                    f"Substitution cleaner has no replacement: {cleaner!r}"
                )
            # count=0 means "replace all" for re.sub.
            count = 0 if parsed.group("flag") == "g" else 1
            value = re.sub(pattern, replacement, value, count=count)
        else:
            found = re.search(pattern, value)
            if found is None:
                # Nothing to keep: treat as "no value".
                return None
            value = found.group(0)

    return value

47 

48 

def collect(html: str, template: Template) -> Data:
    """Extract structured data from ``html`` as described by ``template``.

    Each template value is handled according to its shape:

    * ``dict`` - a nested template evaluated against the same root.
    * ``[[selector, sub_template]]`` - a list with one object per element
      matched by ``selector``.
    * ``[selector, sub_template_dict]`` - a nested object evaluated against
      the elements matched by ``selector``.
    * ``[selector, prop, cleaners]`` - a single value produced by
      ``__extract``.

    Values of any other shape are skipped and produce no key in the output.

    Args:
        html: HTML document to parse.
        template: Extraction template.

    Returns:
        A dict with the same key structure as ``template``.
    """

    def collect_rec(root: PyQuery, template: Template, data: Data) -> None:
        for key, spec in template.items():
            if isinstance(spec, dict):
                data[key] = {}
                collect_rec(root, spec, data[key])
            elif isinstance(spec, list):
                if len(spec) == 1 and isinstance(spec[0], list):
                    sub_selector, sub_template = spec[0]

                    data[key] = []
                    # ``.items()`` yields PyQuery wrappers. Iterating the
                    # result directly yields raw lxml elements, which do not
                    # support CSS ``find`` / ``attr`` / ``contents`` / ``text``
                    # as used by the recursion and by ``__extract``.
                    for sub_root in root.find(sub_selector).items():
                        data[key].append({})
                        collect_rec(sub_root, sub_template, data[key][-1])
                elif len(spec) == 2 and isinstance(spec[1], dict):
                    sub_selector, sub_template = spec[0], spec[1]

                    data[key] = {}
                    collect_rec(root.find(sub_selector), sub_template, data[key])
                elif len(spec) == 3:
                    data[key] = __extract(root, *spec)

    data: Data = {}
    collect_rec(PyQuery(html), template, data)

    return data