Coverage for src/extratools_html/__init__.py: 53% (98 statements)
coverage.py v7.8.1, created at 2025-05-29 23:22 -0700
from __future__ import annotations

import asyncio
import ssl
from collections.abc import Iterable
from contextlib import suppress
from datetime import timedelta
from enum import StrEnum
from http import HTTPStatus
from typing import Any, cast
from urllib.parse import urlparse

import backoff
import httpx
import minify_html
import truststore
from blob_dict.blob import StrBlob
from blob_dict.dict.path import LocalPath, PathBlobDict
from extratools_core.path import cleanup_dir_by_ttl
from extratools_core.typing import PathLike
from html2text import HTML2Text

with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright, expect
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from .cleanup import cleanup_page

MAX_TRIES: int = 3
MAX_TIMEOUT: int = 60
REQUEST_TIMEOUT: int = 10
# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000

ctx = truststore.SSLContext(ssl.PROTOCOL_TLS_CLIENT)

# TODO: Make cache path/TTL configurable via a configuration file (in TOML).
# That would also allow support for non-local paths (like `CloudPath`).
CACHE_PATH: PathLike = LocalPath("~/.http-cache").expanduser()
CACHE_TTL: timedelta = timedelta(days=1)
# Trigger cleanup here, as we have no cron job or daemon process.
# If available, a native solution is preferred (like S3's object lifecycle management).
# Use a longer TTL here than the cache TTL, in case we still need the raw data.
list(cleanup_dir_by_ttl(CACHE_PATH, timedelta(days=30)))

cache = PathBlobDict(CACHE_PATH, blob_class=StrBlob, ttl=CACHE_TTL)
cache.create()
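
# Note: `cache` is used below like a mutable mapping from cache key (see
# `get_cache_key`) to `StrBlob`, persisted under `CACHE_PATH`; for example
# (illustrative key and value only):
#   cache["example.com/?"] = StrBlob("<html>...</html>")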


class PageElementAction(StrEnum):
    CLICK = "click"
    TO_BE_VISIBLE = "to_be_visible"
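
# Example `pre_actions` value for `download_page_async` below; the CSS selectors
# are hypothetical and for illustration only:
#   pre_actions=[
#       ("#accept-cookies", PageElementAction.CLICK),
#       ("article", PageElementAction.TO_BE_VISIBLE),
#   ]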


async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    # https://www.python-httpx.org/advanced/ssl/
    async with httpx.AsyncClient(verify=ctx) as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning `None` also triggers backoff if necessary
            return None

        response.raise_for_status()

        return response.text


async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # Create the page from this context so that `user_agent` actually applies
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        await page.route(
            "**/*",
            lambda route: (
                route.abort()
                # https://playwright.dev/python/docs/api/class-request#request-resource-type
                if route.request.resource_type in {
                    "font",
                    "image",
                    "media",
                }
                else route.continue_()
            ),
        )
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning `None` also triggers backoff if necessary
            return None

        for selector, action in pre_actions or []:
            with suppress(AssertionError, PlaywrightTimeoutError):
                match action:
                    case PageElementAction.CLICK:
                        await page.locator(selector).click(
                            timeout=PRE_ACTION_TIMEOUT,
                            # Allow clicking even if the current element is covered
                            # by other elements. Otherwise, extra pre-actions would be
                            # needed before this one to dismiss the covering elements.
                            # However, dismissing those covering elements may be
                            # necessary for the page to function properly.
                            force=True,
                        )
                    case PageElementAction.TO_BE_VISIBLE:
                        await expect(page.locator(selector)).to_be_visible(
                            timeout=PRE_ACTION_TIMEOUT,
                        )

        html: str = await page.content()

        await browser.close()

        return html


def get_cache_key(page_url: str) -> str:
    parse_result = urlparse(page_url)

    # Need to handle reserved characters in the filename
    # https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words
    dir_name: str = parse_result.netloc.replace(":", "_")

    path: str = parse_result.path or "/"
    # Add default filename
    if path.endswith("/"):
        path += "?"

    if parse_result.query:
        if not path.endswith("/?"):
            path += "?"
        path += parse_result.query

    return dir_name + path
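
# For illustration, `get_cache_key` maps example URLs as follows:
#   "https://example.com"                    -> "example.com/?"
#   "https://example.com:8080/news/?page=2"  -> "example.com_8080/news/?page=2"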


@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    minify: bool = True,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
    use_cache: bool = True,
) -> str | None:
    page_html: str | None
    cache_key: str = get_cache_key(page_url)

    if use_cache and (cache_blob := cache.get(cache_key)):
        page_html = cast("StrBlob", cache_blob).as_str()
    elif use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if page_html is None:
        return None

    cache[cache_key] = StrBlob(page_html)

    if minify:
        page_html = minify_html.minify(page_html)

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html


def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
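

# Example usage: a minimal sketch, not part of the original module. It assumes
# network access and, for `use_browser=True`, that Playwright and its Chromium
# browser are installed; the URL and selector below are illustrative only.
if __name__ == "__main__":
    print(download_page(
        "https://example.com",
        text_only=True,
        use_browser=True,
        pre_actions=[("#accept-cookies", PageElementAction.CLICK)],
    ))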