Coverage for src/extratools_html/__init__.py: 40% (67 statements; coverage.py v7.8.0)

from __future__ import annotations

import asyncio
from collections.abc import Iterable
from contextlib import suppress
from datetime import timedelta
from enum import StrEnum
from http import HTTPStatus
from typing import Any

import backoff
import httpx
import truststore
from html2text import HTML2Text

# Playwright is an optional dependency; browser-based download is only
# available when it is installed.
with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright, expect

from .cleanup import cleanup_page

# Validate TLS certificates against the operating system's trust store.
truststore.inject_into_ssl()

MAX_TRIES: int = 3  # Maximum number of download attempts under backoff
MAX_TIMEOUT: int = 60  # Maximum total time for backoff retries, in seconds
REQUEST_TIMEOUT: int = 10  # Per-request timeout for plain HTTP downloads, in seconds


class PageElementAction(StrEnum):
    """Action to apply to a page element before the page content is captured."""

    CLICK = "click"
    TO_BE_VISIBLE = "to_be_visible"
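# A minimal sketch of a `pre_actions` sequence (the selectors are illustrative
# only): dismiss a hypothetical cookie banner, then wait for the main content.
#
#     pre_actions = [
#         ("#cookie-accept", PageElementAction.CLICK),
#         ("main#content", PageElementAction.TO_BE_VISIBLE),
#     ]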

async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning `None` makes `backoff.on_predicate` retry if necessary
            return None

        response.raise_for_status()

        return response.text

async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
    wait_for_loading: timedelta | None = None,
) -> str | None:
    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # Create the page inside this context so that the custom user agent,
        # if any, actually applies to it.
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning `None` makes `backoff.on_predicate` retry if necessary
            return None

        for selector, action in pre_actions or []:
            match action:
                case PageElementAction.CLICK:
                    await page.locator(selector).click()
                case PageElementAction.TO_BE_VISIBLE:
                    await expect(page.locator(selector)).to_be_visible()

        if wait_for_loading:
            # `timedelta.seconds` ignores days and sub-second precision;
            # `total_seconds()` reflects the full requested duration.
            await asyncio.sleep(wait_for_loading.total_seconds())

        html: str = await page.content()

        await browser.close()

        return html

# Retry with exponential backoff whenever the decorated function returns a
# falsy value, e.g. `None` after an HTTP 429 response.
@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
    wait_for_loading: timedelta | None = None,
) -> str | None:
    page_html: str | None
    if use_browser:
        # The browser-based path renders JavaScript and supports pre-actions
        # and extra wait time; it requires Playwright to be installed.
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
            wait_for_loading=wait_for_loading,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if page_html is None:
        return None

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        # Render the HTML as plain text, dropping images and links.
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html
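# A minimal usage sketch (the URL and selector are illustrative only),
# assuming Playwright is installed for the `use_browser=True` path:
#
#     text = await download_page_async(
#         "https://example.com/",
#         use_browser=True,
#         pre_actions=[("main", PageElementAction.TO_BE_VISIBLE)],
#         wait_for_loading=timedelta(seconds=2),
#         text_only=True,
#     )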

def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
    # Synchronous wrapper around `download_page_async`.
    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
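

if __name__ == "__main__":
    # A hedged demo, not part of the library API: fetch one page synchronously
    # and print its text-only rendering. The URL is illustrative only.
    print(download_page("https://example.com/", text_only=True))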