Coverage for src/extratools_html/__init__.py: 40%
67 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-05-01 18:20 -0700
1from __future__ import annotations
3import asyncio
4from collections.abc import Iterable
5from contextlib import suppress
6from datetime import timedelta
7from enum import StrEnum
8from http import HTTPStatus
9from typing import Any
11import backoff
12import httpx
13import truststore
14from html2text import HTML2Text
16with suppress(ImportError):
17 from playwright.async_api import Browser, async_playwright, expect
19from .cleanup import cleanup_page
# Route SSL verification through the operating-system trust store.
truststore.inject_into_ssl()

# Retry budget for the backoff decorator on `download_page_async`.
MAX_TRIES: int = 3
# Cap (seconds) on total time spent across all backoff retries.
MAX_TIMEOUT: int = 60
# Per-request timeout (seconds) for plain HTTP fetches.
REQUEST_TIMEOUT: int = 10
28class PageElementAction(StrEnum):
29 CLICK = "click"
30 TO_BE_VISIBLE = "to_be_visible"
async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    """Fetch *page_url* with a plain HTTP client and return the body text.

    Returns ``None`` on HTTP 429 so the ``backoff`` predicate on the caller
    can retry; raises for any other error status via ``raise_for_status``.
    """
    headers: dict[str, str] = {}
    if user_agent:
        headers["User-Agent"] = user_agent

    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=headers,
        )

    # 429 signals rate limiting: return None so the caller's backoff
    # predicate retries instead of raising.
    if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
        return None

    response.raise_for_status()

    return response.text
60async def __download_via_browser(
61 page_url: str,
62 *,
63 user_agent: str | None = None,
64 pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
65 wait_for_loading: timedelta | None = None,
66) -> str | None:
67 async with async_playwright() as playwright:
68 browser: Browser = await playwright.chromium.launch()
69 await browser.new_context(
70 user_agent=user_agent,
71 )
73 page = await browser.new_page()
74 response = await page.goto(page_url)
75 if not response:
76 return None
77 if response.status == HTTPStatus.TOO_MANY_REQUESTS:
78 # It also triggers backoff if necessary
79 return None
81 for selector, action in pre_actions or []:
82 match action:
83 case PageElementAction.CLICK:
84 await page.locator(selector).click()
85 case PageElementAction.TO_BE_VISIBLE:
86 await expect(page.locator(selector)).to_be_visible()
88 if wait_for_loading:
89 await asyncio.sleep(wait_for_loading.seconds)
91 html: str = await page.content()
93 await browser.close()
95 return html
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
    wait_for_loading: timedelta | None = None,
) -> str | None:
    """Download *page_url* and return its HTML (or plain text).

    Retries with exponential backoff whenever the download helpers return
    ``None`` (e.g. on HTTP 429).

    Args:
        page_url: URL to download.
        cleanup: Run ``cleanup_page`` over the HTML before returning.
        text_only: Convert the HTML to plain text via html2text.
        user_agent: Optional User-Agent header / browser identity.
        use_browser: Render with headless Chromium instead of plain HTTP.
        pre_actions: Browser-only element actions, run before capture.
        wait_for_loading: Browser-only extra delay before capture.
    """
    # Pick the transport first, then await it: browser rendering honors
    # pre_actions/wait_for_loading, the plain request path does not.
    if use_browser:
        fetch = __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
            wait_for_loading=wait_for_loading,
        )
    else:
        fetch = __download_via_request(
            page_url,
            user_agent=user_agent,
        )

    html: str | None = await fetch
    if html is None:
        return None

    if cleanup:
        html = await cleanup_page(html)

    if not text_only:
        return html

    converter = HTML2Text()
    converter.ignore_images = True
    converter.ignore_links = True
    return converter.handle(html)
def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    """Synchronous wrapper around ``download_page_async``.

    Accepts the same keyword arguments; see that function for details.
    """
    # NOTE: parameter renamed from `image_url` (an apparent copy/paste
    # slip) to `page_url`, matching download_page_async. Positional
    # callers are unaffected.
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))