Coverage for src/extratools_html/__init__.py: 41%

61 statements  

coverage.py v7.8.0, created at 2025-05-01 06:58 -0700

from __future__ import annotations

import asyncio
from collections.abc import Iterable
from contextlib import suppress
from enum import StrEnum
from http import HTTPStatus
from typing import Any

import backoff
import httpx
import truststore
from html2text import HTML2Text

# Playwright is an optional dependency; browser-based downloads only work when it is installed
with suppress(ImportError):
    from playwright.async_api import Browser, async_playwright

from .cleanup import cleanup_page

# Trust the operating system's certificate store when verifying TLS connections
truststore.inject_into_ssl()

# Retry/backoff limits for download_page_async, plus the per-request timeout (seconds)
MAX_TRIES: int = 3
MAX_TIMEOUT: int = 60
REQUEST_TIMEOUT: int = 10


class PageElementAction(StrEnum):
    CLICK = "click"


async def __download_via_request(
    page_url: str,
    *,
    user_agent: str | None = None,
) -> str | None:
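    """Fetch the page's HTML over plain HTTP using httpx."""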

    async with httpx.AsyncClient() as client:
        response: httpx.Response = await client.get(
            page_url,
            follow_redirects=True,
            timeout=REQUEST_TIMEOUT,
            headers=(
                {
                    "User-Agent": user_agent,
                } if user_agent
                else {}
            ),
        )

        if response.status_code == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None here also triggers a backoff retry if necessary
            return None

        response.raise_for_status()

        return response.text


async def __download_via_browser(
    page_url: str,
    *,
    user_agent: str | None = None,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
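    """Render the page in headless Chromium via Playwright, applying any pre-actions before reading the HTML."""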

    async with async_playwright() as playwright:
        browser: Browser = await playwright.chromium.launch()
        # Create the page inside the new context so that any custom user agent is actually applied
        context = await browser.new_context(
            user_agent=user_agent,
        )

        page = await context.new_page()
        response = await page.goto(page_url)
        if not response:
            return None
        if response.status == HTTPStatus.TOO_MANY_REQUESTS:
            # Returning None here also triggers a backoff retry if necessary
            return None

        for selector, action in pre_actions or []:
            match action:
                case PageElementAction.CLICK:
                    await page.locator(selector).click()

        html: str = await page.content()

        await browser.close()

        return html


@backoff.on_predicate(
    backoff.expo,
    max_tries=MAX_TRIES,
    max_time=MAX_TIMEOUT,
)
async def download_page_async(
    page_url: str,
    *,
    cleanup: bool = False,
    text_only: bool = False,
    user_agent: str | None = None,
    use_browser: bool = False,
    pre_actions: Iterable[tuple[str, PageElementAction]] | None = None,
) -> str | None:
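    """Download a page asynchronously, optionally through a headless browser, and return its HTML (or plain text)."""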

    page_html: str | None
    if use_browser:
        page_html = await __download_via_browser(
            page_url,
            user_agent=user_agent,
            pre_actions=pre_actions,
        )
    else:
        page_html = await __download_via_request(
            page_url,
            user_agent=user_agent,
        )
    if page_html is None:
        return None

    if cleanup:
        page_html = await cleanup_page(page_html)

    if text_only:
        h = HTML2Text()
        h.ignore_images = True
        h.ignore_links = True
        return h.handle(page_html)

    return page_html


def download_page(
    page_url: str,
    **kwargs: Any,
) -> str | None:
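    """Synchronous wrapper around download_page_async."""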

    return asyncio.run(download_page_async(
        page_url,
        **kwargs,
    ))
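
A minimal usage sketch (the URL is hypothetical; assumes the package is importable as extratools_html):

    from extratools_html import download_page

    # Fetch a page, clean it up, and convert it to plain text
    text = download_page(
        "https://example.com",  # hypothetical URL
        cleanup=True,
        text_only=True,
    )
    print(text)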