Coverage for src/extratools_html/__init__.py: 41%
61 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-01 06:58 -0700
1from __future__ import annotations
3import asyncio
4from collections.abc import Iterable
5from contextlib import suppress
6from enum import StrEnum
7from http import HTTPStatus
8from typing import Any
10import backoff
11import httpx
12import truststore
13from html2text import HTML2Text
15with suppress(ImportError):
16 from playwright.async_api import Browser, async_playwright
18from .cleanup import cleanup_page
# Patch the stdlib ssl module so HTTP clients validate certificates against
# the operating system's trust store instead of a bundled CA list.
truststore.inject_into_ssl()

# Retry policy for `download_page_async` (consumed by the backoff decorator):
# maximum number of attempts before giving up.
MAX_TRIES: int = 3
# Upper bound, in seconds, on total time spent across all backoff retries.
MAX_TIMEOUT: int = 60
# Per-request timeout, in seconds, for the plain-HTTP download path.
REQUEST_TIMEOUT: int = 10
class PageElementAction(StrEnum):
    """Action applied to a page element (located by CSS selector) before capture.

    Used in the ``pre_actions`` pairs accepted by the browser download path.
    """

    CLICK = "click"
async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Fetch ``page_url`` with a plain HTTP GET and return the response body.

    Redirects are followed. On HTTP 429 (Too Many Requests) the function
    returns ``None``, which the caller's backoff predicate treats as a
    retryable failure. Any other error status raises ``httpx.HTTPStatusError``.
    """
    headers: dict[str, str] = {}
    if user_agent:
        headers["User-Agent"] = user_agent

    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=headers,
        )

    if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        # Falsy return value triggers backoff retry in the caller.
        return None

    response.raise_for_status()
    return response.text
58async def __download_via_browser(
59 page_url: str,
60 *,
61 user_agent: str | None = None,
62 pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
63) -> str | None:
64 async with async_playwright() as playwright:
65 browser: Browser = await playwright.chromium.launch()
66 await browser.new_context(
67 user_agent=user_agent,
68 )
70 page = await browser.new_page()
71 response = await page.goto(page_url)
72 if not response:
73 return None
74 if response.status == HTTPStatus.TOO_MANY_REQUESTS:
75 # It also triggers backoff if necessary
76 return None
78 for selector, action in pre_actions or []:
79 match action:
80 case "click":
81 await page.locator(selector).click()
83 html: str = await page.content()
85 await browser.close()
87 return html
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    """Download ``page_url`` and return its HTML, or plain text if ``text_only``.

    With ``use_browser`` the page is rendered in headless Chromium (and
    ``pre_actions`` applied); otherwise a plain HTTP GET is used. A ``None``
    result (no response / HTTP 429) is falsy, so the ``backoff`` decorator
    retries with exponential backoff up to MAX_TRIES / MAX_TIMEOUT.
    """
    if use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )

    if page_html is None:
        return None

    if cleanup:
        page_html = await cleanup_page(page_html)

    if not text_only:
        return page_html

    converter = HTML2Text()
    converter.ignore_images = True
    converter.ignore_links = True
    return converter.handle(page_html)
def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    """Synchronous wrapper around `download_page_async`.

    Accepts the same keyword arguments (``cleanup``, ``text_only``,
    ``user_agent``, ``use_browser``, ``pre_actions``) and runs the coroutine
    to completion on a fresh event loop.

    NOTE: the parameter was previously misnamed ``image_url`` (apparent
    copy-paste from an image downloader); renamed to ``page_url`` for
    consistency with `download_page_async`. Positional callers are unaffected.
    """
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))