Coverage for src/html2json/__init__.py: 100%
44 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-25 23:15 -0700
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-25 23:15 -0700
1from __future__ import annotations
3import re
4from re import Match, Pattern
5from typing import Any, cast
7from pyquery import PyQuery
# A template maps output keys to extraction specs (see `collect`); the
# collected data mirrors the template's shape.
Template = dict[str, Any]
Data = dict[str, Any]

# Grammar of a cleaner spec, e.g. `s/search/sub/g` or `/search/`:
#   mode   - optional leading `s`, selecting substitution instead of search
#   sep    - any single non-word character; it delimits the parts below
#   search - regex source, any run of characters that are not `sep`
#   sub    - optional replacement text, terminated by another `sep`
#   flag   - optional trailing `g` (replace every match, not just the first)
__CLEANER_REGEX: Pattern[str] = re.compile(r"(?P<mode>s)?(?P<sep>\W)(?P<search>(?:(?!(?P=sep)).)*)(?P=sep)(?:(?P<sub>(?:(?!(?P=sep)).)*)(?P=sep)(?P<flag>g)?)?")  # noqa: E501
def __extract(
    root: PyQuery,
    selector: str | None = None,
    prop: str | None = None,
    cleaners: list[str] | None = None,
) -> str | list[str] | None:
    """Extract cleaned text or attribute values from the elements under *root*.

    Args:
        root: Element set to search in.
        selector: Selector passed to ``root.find``; when empty or ``None``,
            ``root`` itself is used.
        prop: Attribute name to read from each element; when empty or
            ``None``, the element's text is used instead.
        cleaners: Cleaner specs applied in order to every extracted value.
            ``s<sep>search<sep>sub<sep>`` replaces the first match of
            ``search`` with ``sub`` (every match when followed by ``g``);
            ``<sep>search<sep>`` keeps only the first match of ``search``,
            or an empty string when nothing matches.

    Returns:
        ``None`` when the selector is invalid or matches nothing, a single
        string when exactly one element is found, otherwise a list of
        strings in iteration order.

    Raises:
        ValueError: If a cleaner spec is malformed, or a substitution
            cleaner has no replacement part.
        re.error: If a cleaner's search part is not a valid regex.
    """
    try:
        tags: PyQuery = root.find(selector) if selector else root
    except Exception:  # noqa: BLE001
        # Invalid selector. `Exception` rather than a bare `except`, so
        # KeyboardInterrupt / SystemExit are not swallowed.
        return None

    # Non-matching selector
    if len(tags) == 0:
        return None

    # Parse and compile the cleaners once instead of once per tag. This is
    # done after the no-match check so that a non-matching selector still
    # returns `None` without looking at the cleaners.
    # Each entry is (pattern, replacement, count); a `None` replacement
    # marks a search-mode cleaner.
    compiled: list[tuple[Pattern[str], str | None, int]] = []
    for c in cleaners or []:
        m = __CLEANER_REGEX.match(c)
        if m is None:
            msg = f"Invalid cleaner: {c!r}"
            raise ValueError(msg)

        pattern = re.compile(m.group("search"))
        if m.group("mode") == "s":
            if m.group("sub") is None:
                msg = f"Substitution cleaner has no replacement: {c!r}"
                raise ValueError(msg)
            # `count=0` means "replace all" for `re` substitution.
            count = 0 if m.group("flag") == "g" else 1
            compiled.append((pattern, m.group("sub"), count))
        else:
            compiled.append((pattern, None, 0))

    results: list[str] = []

    # Must use `.items()` which returns `PyQuery` objects
    for tag in tags.items():
        # NOTE(review): if `attr` returns None for a missing attribute, the
        # `str(...)` below yields the literal text "None" - confirm intended.
        v: str = str(tag.attr(prop) if prop else tag.text()).strip()

        for pattern, repl, count in compiled:
            if repl is not None:
                v = pattern.sub(repl, v, count=count)
            else:
                found = pattern.search(v)
                # No match leaves nothing to keep; previously this crashed
                # with AttributeError on `None.group`.
                v = found.group(0) if found else ""

        results.append(v)

    # `results` is never empty here: the no-match case returned above.
    return results if len(results) > 1 else results[0]
def collect(html: str, template: Template) -> Data:
    """Collect data from *html* into a structure shaped like *template*.

    Each template value decides how its key is filled:

    * ``dict`` - a nested template, collected against the same root.
    * ``[[selector, sub_template]]`` - a list with one entry per element
      found by ``selector``, each collected with ``sub_template``.
    * ``[selector, sub_template_dict]`` - a single nested object collected
      against the elements found by ``selector``.
    * any other ``list`` - unpacked as the positional arguments after
      ``root`` of ``__extract`` (selector, prop, cleaners).

    Values of any other type are ignored and their key is left out of the
    result.

    Args:
        html: Markup to parse.
        template: Mapping of output keys to extraction specs.

    Returns:
        The collected data, with keys in template order.
    """
    def collect_rec(root: PyQuery, template: Template) -> Data:
        # Build and return the result instead of filling an out-parameter.
        data: Data = {}

        for key, spec in template.items():
            if isinstance(spec, dict):
                data[key] = collect_rec(root, spec)
            elif isinstance(spec, list):
                if len(spec) == 1 and isinstance(spec[0], list):
                    sub_selector, sub_template = spec[0]

                    # Must use `.items()` which returns `PyQuery` objects
                    data[key] = [
                        collect_rec(sub_root, sub_template)
                        for sub_root in root.find(sub_selector).items()
                    ]
                elif len(spec) == 2 and isinstance(spec[1], dict):
                    sub_selector, sub_template = spec

                    data[key] = collect_rec(
                        root.find(sub_selector),
                        sub_template,
                    )
                else:
                    data[key] = __extract(root, *spec)

        return data

    return collect_rec(PyQuery(html), template)