Coverage for src/extratools_html/__init__.py: 42% (69 statements)

from __future__ import annotations

import asyncio
import ssl
from collections.abc import Iterable
from contextlib import suppress
from enum import StrEnum
from http import HTTPStatus
from typing import Any

import backoff
import httpx
import truststore
from html2text import HTML2Text

with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright, expect
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

from .cleanup import cleanup_page

MAX_TRIES: int = 3
# In seconds
MAX_TIMEOUT: int = 60
# In seconds
REQUEST_TIMEOUT: int = 10
# In milliseconds
PRE_ACTION_TIMEOUT: int = 10 * 1_000

# Use the operating system's trust store for TLS certificate verification
ctx = truststore.SSLContext(ssl.PROTOCOL_TLS_CLIENT)


class PageElementAction(StrEnum):
    CLICK = "click"
    TO_BE_VISIBLE = "to_be_visible"
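
# A sketch of a typical ``pre_actions`` value (the selectors here are
# hypothetical): first dismiss a cookie banner, then wait for the main
# content to appear.
#
#   pre_actions = [
#       ("#cookie-accept", PageElementAction.CLICK),
#       ("main", PageElementAction.TO_BE_VISIBLE),
#   ]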

async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    # https://www.python-httpx.org/advanced/ssl/
    async with httpx.AsyncClient(verify=ctx) as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None also triggers backoff retry if necessary
            return None

        response.raise_for_status()

        return response.text
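
# Note on the ``return None`` above: ``backoff.on_predicate`` (applied to
# ``download_page_async`` below) retries while the decorated function returns
# a falsy value. A minimal sketch of the same retry pattern (``poll`` is a
# hypothetical function):
#
#   @backoff.on_predicate(backoff.expo, max_tries=MAX_TRIES)
#   async def poll() -> str | None:
#       ...  # returning None schedules another attempt with backoff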

async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # The page must be created from this context for ``user_agent``
        # to take effect
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        await page.route(
            "**/*",
            lambda route: (
                route.abort()
                # https://playwright.dev/python/docs/api/class-request#request-resource-type
                if route.request.resource_type in {
                    "font",
                    "image",
                    "media",
                }
                else route.continue_()
            ),
        )
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None also triggers backoff retry if necessary
            return None

        for selector, action in pre_actions or []:
            with suppress(AssertionError, PlaywrightTimeoutError):
                match action:
                    case PageElementAction.CLICK:
                        await page.locator(selector).click(
                            timeout=PRE_ACTION_TIMEOUT,
                            # Allow clicking even if this element is covered
                            # by other elements. Otherwise, extra pre-actions
                            # would be needed before this one to dismiss the
                            # covering elements. However, dismissing them may
                            # be necessary for the page to function properly.
                            force=True,
                        )
                    case PageElementAction.TO_BE_VISIBLE:
                        await expect(page.locator(selector)).to_be_visible(
                            timeout=PRE_ACTION_TIMEOUT,
                        )

        html: str = await page.content()

        await browser.close()

        return html


@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
    page_html: str | None
    if use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if page_html is None:
        return None

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html
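
# A minimal async usage sketch (URL and selector are hypothetical):
#
#   page_text = await download_page_async(
#       "https://example.com",
#       use_browser=True,
#       pre_actions=[("#cookie-accept", PageElementAction.CLICK)],
#       text_only=True,
#   )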

def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
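

# A minimal usage sketch (the URL below is hypothetical): fetch a page and
# print its plain-text rendering when this module is run directly.
if __name__ == "__main__":
    print(download_page(
        "https://example.com",
        cleanup=True,
        text_only=True,
    ))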