Coverage for src/extratools_html/__init__.py: 53%

98 statements  

coverage.py v7.8.1, created at 2025-05-29 23:22 -0700

from __future__ import annotations

import asyncio
import ssl
from collections.abc import Iterable
from contextlib import suppress
from datetime import timedelta
from enum import StrEnum
from http import HTTPStatus
from typing import Any, cast
from urllib.parse import urlparse

import backoff
import httpx
import minify_html
import truststore
from blob_dict.blob import StrBlob
from blob_dict.dict.path import LocalPath, PathBlobDict
from extratools_core.path import cleanup_dir_by_ttl
from extratools_core.typing import PathLike
from html2text import HTML2Text

with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright, expect
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from .cleanup import cleanup_page

MAX_TRIES: int = 3
MAX_TIMEOUT: int = 60
REQUEST_TIMEOUT: int = 10
# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000

ctx = truststore.SSLContext(ssl.PROTOCOL_TLS_CLIENT)

# TODO: Make cache path/TTL configurable via configuration file (in TOML).
# This would also allow support for other non-local paths (like `CloudPath`).
CACHE_PATH: PathLike = LocalPath("~/.http-cache").expanduser()
CACHE_TTL: timedelta = timedelta(days=1)
# Trigger cleanup here as we do not have a cron job or daemon process.
# If available, a native solution is preferred (like S3's object lifecycle management).
# Use a longer TTL here than the cache TTL in case we still need the raw data.
list(cleanup_dir_by_ttl(CACHE_PATH, timedelta(days=30)))

cache = PathBlobDict(CACHE_PATH, blob_class=StrBlob, ttl=CACHE_TTL)
cache.create()


class PageElementAction(StrEnum):
    CLICK = "click"
    TO_BE_VISIBLE = "to_be_visible"


async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    # https://www.python-httpx.org/advanced/ssl/
    async with httpx.AsyncClient(verify=ctx) as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning `None` also triggers a backoff retry if necessary
            return None

        response.raise_for_status()

        return response.text


async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # Create the page from this context so that the custom user agent actually applies
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        await page.route(
            "**/*",
            lambda route: (
                route.abort()
                # https://playwright.dev/python/docs/api/class-request#request-resource-type
                if route.request.resource_type in {
                    "font",
                    "image",
                    "media",
                }
                else route.continue_()
            ),
        )
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning `None` also triggers a backoff retry if necessary
            return None

        for selector, action in pre_actions or []:
            with suppress(AssertionError, PlaywrightTimeoutError):
                match action:
                    case PageElementAction.CLICK:
                        await page.locator(selector).click(
                            timeout=PRE_ACTION_TIMEOUT,
                            # Allow the click even if the current element is covered
                            # by other elements. Otherwise, other pre-actions are needed
                            # before this pre-action to dismiss those covering elements.
                            # However, it is possible that dismissing those covering
                            # elements is necessary for the page to function properly.
                            force=True,
                        )
                    case PageElementAction.TO_BE_VISIBLE:
                        await expect(page.locator(selector)).to_be_visible(
                            timeout=PRE_ACTION_TIMEOUT,
                        )

        html: str = await page.content()

        await browser.close()

        return html
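
# Illustration (not part of the module): `pre_actions` pairs a selector with a
# `PageElementAction`; the pairs are applied in order before the HTML is captured.
# The selectors below are hypothetical and would need to match the target page:
#
#     pre_actions = [
#         # Dismiss a cookie banner first so it no longer covers other elements
#         ("#cookie-banner button.accept", PageElementAction.CLICK),
#         # Then wait until the main content has rendered
#         ("main article", PageElementAction.TO_BE_VISIBLE),
#     ]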


def get_cache_key(page_url: str) -> str:
    parse_result = urlparse(page_url)

    # Need to handle reserved characters for filename
    # https://en.wikipedia.org/wiki/Filename#Reserved_characters_and_words
    dir: str = parse_result.netloc.replace(":", "_")

    path: str = parse_result.path or "/"
    # Add default filename
    if path.endswith("/"):
        path += "?"

    if parse_result.query:
        if not path.endswith("/?"):
            path += "?"
        path += parse_result.query

    return dir + path
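
# Illustration (not part of the module): example cache keys produced by
# `get_cache_key`, following the logic above (URLs are hypothetical):
#
#     get_cache_key("https://example.com:8080/a/b?x=1")  # -> "example.com_8080/a/b?x=1"
#     get_cache_key("https://example.com/docs/?page=2")  # -> "example.com/docs/?page=2"
#     get_cache_key("https://example.com")                # -> "example.com/?"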


@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    minify: bool = True,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
    use_cache: bool = True,
) -> str | None:
    page_html: str | None
    cache_key: str = get_cache_key(page_url)

    if use_cache and (cache_blob := cache.get(cache_key)):
        page_html = cast("StrBlob", cache_blob).as_str()
    elif use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if page_html is None:
        return None

    cache[cache_key] = StrBlob(page_html)

    if minify:
        page_html = minify_html.minify(page_html)

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html


def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))