Coverage for src/extratools_html/__init__.py: 42%
69 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-12 00:31 -0700
1from __future__ import annotations
3import asyncio
4import ssl
5from collections.abc import Iterable
6from contextlib import suppress
7from enum import StrEnum
8from http import HTTPStatus
9from typing import Any
11import backoff
12import httpx
13import truststore
14from html2text import HTML2Text
16with suppress(ImportError):
17 from playwright.async_api import Browser, async_playwright, expect
18 from playwright.async_api import TimeoutError as PlaywrightTimeoutError
20from .cleanup import cleanup_page
# Retry policy for download_page_async (consumed by backoff.on_predicate below).
MAX_TRIES: int = 3
# Upper bound, in seconds, on total time spent retrying.
MAX_TIMEOUT: int = 60
# Per-request timeout, in seconds, for the plain-HTTP download path.
REQUEST_TIMEOUT: int = 10

# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000

# TLS context backed by the OS trust store (instead of certifi's bundle),
# passed to httpx via `verify=`.
ctx = truststore.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
class PageElementAction(StrEnum):
    """Action to apply to a page element (by selector) before capturing HTML."""

    # Click the element (e.g. dismiss an overlay or trigger lazy content).
    CLICK = "click"
    # Wait until the element is visible.
    TO_BE_VISIBLE = "to_be_visible"
async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Download a page over plain HTTP with httpx.

    Returns the response body as text, or None on HTTP 429 so that the
    caller's backoff predicate retries. Raises for other error statuses.
    """
    headers: dict[str, str] = {}
    if user_agent:
        headers["User-Agent"] = user_agent

    # https://www.python-httpx.org/advanced/ssl/
    async with httpx.AsyncClient(verify=ctx) as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=headers,
        )

    if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        # Returning None triggers backoff retry in download_page_async.
        return None

    response.raise_for_status()

    return response.text
64async def __download_via_browser(
65 page_url: str,
66 *,
67 user_agent: str | None = None,
68 pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
69) -> str | None:
70 async with async_playwright() as playwright:
71 browser: Browser = await playwright.chromium.launch()
72 await browser.new_context(
73 user_agent=user_agent,
74 )
76 page = await browser.new_page()
77 await page.route(
78 "**/*",
79 lambda route: (
80 route.abort()
81 # https://playwright.dev/python/docs/api/class-request#request-resource-type
82 if route.request.resource_type in {
83 "font",
84 "image",
85 "media",
86 }
87 else route.continue_()
88 ),
89 )
90 response = await page.goto(page_url)
91 if not response:
92 return None
93 if response.status == HTTPStatus.TOO_MANY_REQUESTS:
94 # It also triggers backoff if necessary
95 return None
97 for selector, action in pre_actions or []:
98 with suppress(AssertionError, PlaywrightTimeoutError):
99 match action:
100 case PageElementAction.CLICK:
101 await page.locator(selector).click(
102 timeout=PRE_ACTION_TIMEOUT,
103 # Allow click even current element is covered by other elements.
104 # Otherwise, other pre-actions are needed before this pre-action
105 # to dismiss those covering elements.
106 # However, it is possible that dismissing those covering elements
107 # is necessary logic for page to function properly.
108 force=True,
109 )
110 case PageElementAction.TO_BE_VISIBLE:
111 await expect(page.locator(selector)).to_be_visible(
112 timeout=PRE_ACTION_TIMEOUT,
113 )
115 html: str = await page.content()
117 await browser.close()
119 return html
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    """Download a page, retrying with exponential backoff on a None result.

    Uses a headless browser when `use_browser` is set, otherwise a plain
    HTTP request. Optionally cleans up the HTML and/or converts it to
    plain text. Returns None when the download yielded nothing (which is
    what triggers the backoff retries).
    """
    html: str | None = (
        await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
        if use_browser
        else await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    )
    if html is None:
        return None

    if cleanup:
        html = await cleanup_page(html)

    if not text_only:
        return html

    # Strip markup down to readable text, dropping links and images.
    converter = HTML2Text()
    converter.ignore_images = True
    converter.ignore_links = True
    return converter.handle(html)
def download_page(
    image_url: str,
    **kwargs: Any,
) -> str | None:
    """Synchronous wrapper around `download_page_async`.

    NOTE(review): the parameter is named `image_url` but it is a page URL
    (it is forwarded as `page_url`); the name is kept for backward
    compatibility with keyword callers — confirm before renaming.
    """
    coro = download_page_async(
        image_url,
        **kwargs,
    )
    return asyncio.run(coro)