Coverage for src/html2json/__init__.py: 67%
55 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-30 00:24 -0700
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-30 00:24 -0700
1from __future__ import annotations
3import json
4import re
5from re import Match, Pattern
6from typing import Any, cast
8from pyquery import PyQuery
# A template is a JSON-like mapping from output keys to extraction rules.
Template = dict[str, Any]
# Collected output; it is filled in key by key while a template is walked.
Data = dict[str, Any]

# Grammar of a single cleaner expression:
#   [s] SEP search SEP [sub SEP [g]]
# * ``mode``   - optional leading ``s`` selecting substitution.
# * ``sep``    - any single non-word character; it delimits the parts and
#                therefore cannot occur inside ``search`` or ``sub``.
# * ``search`` - regular expression to look for.
# * ``sub``    - optional replacement text.
# * ``flag``   - optional trailing ``g`` (replace every occurrence).
# The pattern is applied with ``match``, so text after the parsed part is ignored.
__CLEANER_REGEX: Pattern[str] = re.compile(r"(?P<mode>s)?(?P<sep>\W)(?P<search>(?:(?!(?P=sep)).)*)(?P=sep)(?:(?P<sub>(?:(?!(?P=sep)).)*)(?P=sep)(?P<flag>g)?)?") # noqa: E501
def __extract(
    root: PyQuery,
    selector: str | None = None,
    prop: str | None = None,
    cleaners: list[str] | None = None,
) -> str | list[str] | None:
    """Extract cleaned text or attribute values from the tags selected under *root*.

    Args:
        root: Selection to search in.
        selector: Selector handed to ``root.find``; when falsy, *root* itself
            is used as the set of tags.
        prop: Attribute name read with ``tag.attr``; when falsy, ``tag.text()``
            is used instead.
        cleaners: Cleaner expressions (see ``__CLEANER_REGEX``) applied in
            order to every value. ``s/search/sub/`` substitutes the first
            occurrence (all occurrences with a trailing ``g``); ``/search/``
            keeps only the first matched part.

    Returns:
        ``None`` when ``root.find`` fails or nothing is selected, a single
        string when exactly one tag is selected, otherwise a list of strings.

    Raises:
        ValueError: If a cleaner expression is malformed, or an extraction
            cleaner does not match the value it is applied to.
    """
    try:
        tags: PyQuery = root.find(selector) if selector else root
    except Exception:  # noqa: BLE001
        # Invalid selector. `Exception` (not a bare `except`) so that
        # KeyboardInterrupt / SystemExit still propagate.
        return None

    # Non-matching selector
    if len(tags) == 0:
        return None

    # Parse every cleaner once, up front, instead of once per tag.
    # Each step is (search pattern, replacement or None, count for re.sub).
    steps: list[tuple[str, str | None, int]] = []
    for cleaner in cleaners or []:
        parsed = __CLEANER_REGEX.match(cleaner)
        if parsed is None:
            raise ValueError(f"Invalid cleaner expression: {cleaner!r}")

        if parsed.group("mode") == "s":
            replacement = parsed.group("sub")
            if replacement is None:
                raise ValueError(
                    f"Substitution cleaner has no replacement part: {cleaner!r}",
                )
            # `count=0` means "replace all" for `re.sub`.
            count = 0 if parsed.group("flag") == "g" else 1
            steps.append((parsed.group("search"), replacement, count))
        else:
            steps.append((parsed.group("search"), None, 0))

    results: list[str] = []

    # Must use `.items()` which returns `PyQuery` objects
    for tag in tags.items():
        # NOTE(review): if `attr` returns None for a missing attribute, this
        # yields the literal string "None" - confirm that is intended.
        value: str = str(tag.attr(prop) if prop else tag.text()).strip()

        for pattern, replacement, count in steps:
            if replacement is not None:
                value = re.sub(pattern, replacement, value, count=count)
                continue

            found = re.search(pattern, value)
            if found is None:
                raise ValueError(
                    f"Cleaner pattern {pattern!r} does not match value {value!r}",
                )
            value = found.group(0)

        results.append(value)

    # A single hit is unwrapped; `results` is never empty at this point.
    return results if len(results) > 1 else results[0]
def __collect_keys(root: PyQuery, key_template: str) -> list[str]:
    """Resolve one template key into the concrete output keys it stands for.

    A key written as a JSON array (``[...]``) is decoded and its elements are
    passed as the positional arguments of ``__extract`` after *root*; every
    extracted value becomes a key. Any other key, including the empty string,
    is returned unchanged as a one-element list.

    Args:
        root: Selection the key selector is evaluated against.
        key_template: Literal key or JSON-encoded extraction arguments.

    Returns:
        The list of concrete keys; empty when the extraction yields nothing.
    """
    # `startswith` / `endswith` instead of indexing, so an empty key does not
    # raise IndexError.
    if key_template.startswith("[") and key_template.endswith("]"):
        keys: str | list[str] = __extract(root, *json.loads(key_template)) or []
        return keys if isinstance(keys, list) else [keys]

    return [key_template]
def __expand_template(root: PyQuery, template: Template) -> Template:
    """Return a copy of *template* whose keys have been resolved against *root*.

    Every key of *template* is expanded with ``__collect_keys``; each resulting
    concrete key is mapped to the original key's value. When two expansions
    produce the same key, the later one wins.
    """
    expanded: Template = {}
    for raw_key, rule in template.items():
        for concrete_key in __collect_keys(root, raw_key):
            expanded[concrete_key] = rule
    return expanded
def collect(html: str, template: Template) -> Data:
    """Collect structured data from *html* as described by *template*.

    Template keys are first expanded with ``__expand_template``. Each value is
    then interpreted by its shape:

    * ``dict`` - nested object, collected against the same root.
    * ``[[selector, sub_template]]`` - list of objects, one per element
      selected by *selector*.
    * ``[selector, sub_template]`` with a ``dict`` sub-template - nested
      object collected against the selection.
    * any other list - passed to ``__extract`` as its arguments after the
      root; an empty list extracts from the root itself.
    * ``str`` - formatted with ``key=<current key>``.
    * anything else - copied unchanged.

    Selectors are formatted with ``str.format(key=<current key>)`` before use.
    A falsy selector means "the current root itself".

    Args:
        html: Markup handed to ``PyQuery``.
        template: Extraction template.

    Returns:
        The collected data, mirroring the structure of *template*.
    """
    def select(root: PyQuery, selector: str | None, key: str) -> PyQuery:
        # A falsy selector keeps the current root, as `__extract` does,
        # instead of calling `root.find(None)`.
        return root.find(selector.format(key=key)) if selector else root

    def collect_rec(root: PyQuery, template: Template, data: Data) -> None:
        for key, rule in __expand_template(root, template).items():
            if isinstance(rule, dict):
                data[key] = {}
                collect_rec(root, rule, data[key])
            elif isinstance(rule, list):
                if len(rule) == 1 and isinstance(rule[0], list):
                    sub_selector, sub_template = rule[0]

                    data[key] = []
                    # Must use `.items()` which returns `PyQuery` objects
                    for sub_root in select(root, sub_selector, key).items():
                        item: Data = {}
                        data[key].append(item)
                        collect_rec(sub_root, sub_template, item)
                elif len(rule) == 2 and isinstance(rule[1], dict):
                    sub_selector, sub_template = rule[0], rule[1]

                    data[key] = {}
                    collect_rec(select(root, sub_selector, key), sub_template, data[key])
                else:
                    data[key] = (
                        __extract(root, rule[0].format(key=key) if rule[0] else None, *rule[1:]) if rule
                        else __extract(root)
                    )
            else:
                data[key] = rule.format(key=key) if isinstance(rule, str) else rule

    data: Data = {}
    collect_rec(PyQuery(html), template, data)

    return data