Coverage for me2ai_mcp\tools\web.py: 0%

140 statements  

coverage.py v7.8.0, created at 2025-04-13 11:30 +0200

1""" 

2Web-related tools for ME2AI MCP servers. 

3 

4This module provides common tools for web content fetching, scraping, 

5and processing that can be used across different MCP servers. 

6""" 

7from typing import Dict, List, Any, Optional 

8import logging 

9import re 

10import urllib.parse 

11from dataclasses import dataclass 

12import requests 

13from ..base import BaseTool 

14 

15# Optional dependencies 

16try: 

17 from bs4 import BeautifulSoup 

18 BS4_AVAILABLE = True 

19except ImportError: 

20 BS4_AVAILABLE = False 

21 logging.warning("BeautifulSoup not available, some web tools will have limited functionality") 

22 

23# Configure logging 

24logger = logging.getLogger("me2ai-mcp-tools-web") 

25 

26 

@dataclass
class WebFetchTool(BaseTool):
    """Tool for fetching web content."""

    name: str = "fetch_webpage"
    description: str = "Fetch content from a web page"
    user_agent: str = "ME2AI Web Fetcher/1.0"
    timeout: int = 30
    max_content_length: int = 1024 * 1024  # 1MB

    async def execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch a webpage and return its content.

        Args:
            params: Dictionary containing:
                - url: URL to fetch
                - headers: Optional additional HTTP headers
                - timeout: Optional custom timeout in seconds

        Returns:
            Dictionary containing fetch results
        """
        url = params.get("url")
        if not url:
            return {
                "success": False,
                "error": "URL parameter is required"
            }

        # Validate URL
        if not url.startswith(("http://", "https://")):
            return {
                "success": False,
                "error": f"Invalid URL scheme: {url}"
            }

        # Prepare headers
        headers = {
            "User-Agent": self.user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5"
        }

        # Add custom headers if provided
        if "headers" in params and isinstance(params["headers"], dict):
            headers.update(params["headers"])

        # Get timeout
        timeout = params.get("timeout", self.timeout)

        try:
            # Fetch the URL
            response = requests.get(
                url,
                headers=headers,
                timeout=timeout,
                stream=True  # Use streaming to handle large responses
            )

            # Check status code
            response.raise_for_status()

            # Check content type
            content_type = response.headers.get("Content-Type", "").lower()
            if not any(ct in content_type for ct in ["text/html", "text/plain", "application/json", "application/xml"]):
                return {
                    "success": False,
                    "error": f"Unsupported content type: {content_type}"
                }

            # Check content length
            content_length = int(response.headers.get("Content-Length", 0))
            if content_length > self.max_content_length:
                return {
                    "success": False,
                    "error": f"Content too large: {content_length} bytes (max {self.max_content_length})"
                }

            # Get content (with reasonable size limit)
            content = response.text

            # Extract basic info
            title = ""
            if BS4_AVAILABLE and "text/html" in content_type:
                soup = BeautifulSoup(content, "html.parser")
                title_tag = soup.find("title")
                if title_tag:
                    title = title_tag.string

            # Return results
            return {
                "success": True,
                "url": url,
                "status_code": response.status_code,
                "content_type": content_type,
                "content_length": len(content),
                "title": title,
                "content": content,
                "headers": dict(response.headers)
            }

        except requests.RequestException as e:
            return {
                "success": False,
                "error": f"Request error: {str(e)}",
                "exception_type": type(e).__name__
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Error fetching webpage: {str(e)}",
                "exception_type": type(e).__name__
            }

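For illustration, a minimal usage sketch of fetch_webpage (not part of this module). It assumes the dataclass can be instantiated with its defaults and that execute() is awaited directly, rather than through whatever registration mechanism BaseTool provides.

import asyncio

from me2ai_mcp.tools.web import WebFetchTool  # import path inferred from the report header

async def demo_fetch() -> None:
    tool = WebFetchTool()  # default name, user agent, 30 s timeout, 1 MB size limit
    result = await tool.execute({
        "url": "https://example.com",          # required
        "headers": {"Accept-Language": "de"},  # optional extra headers, merged over the defaults
        "timeout": 10,                         # optional per-call timeout in seconds
    })
    if result["success"]:
        print(result["status_code"], result["title"], result["content_length"])
    else:
        print("fetch failed:", result["error"])

asyncio.run(demo_fetch())
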

@dataclass
class HTMLParserTool(BaseTool):
    """Tool for parsing and extracting information from HTML content."""

    name: str = "parse_html"
    description: str = "Parse and extract structured data from HTML content"

    async def execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Parse HTML and extract structured information.

        Args:
            params: Dictionary containing:
                - html: HTML content to parse
                - selectors: Optional dictionary of CSS selectors to extract
                - extract_metadata: Whether to extract metadata (default: True)
                - extract_text: Whether to extract main text (default: True)

        Returns:
            Dictionary containing parse results
        """
        if not BS4_AVAILABLE:
            return {
                "success": False,
                "error": "BeautifulSoup is not available"
            }

        html = params.get("html")
        if not html:
            return {
                "success": False,
                "error": "HTML parameter is required"
            }

        selectors = params.get("selectors", {})
        extract_metadata = params.get("extract_metadata", True)
        extract_text = params.get("extract_text", True)

        try:
            # Parse HTML
            soup = BeautifulSoup(html, "html.parser")

            result = {
                "success": True,
            }

            # Extract metadata if requested
            if extract_metadata:
                metadata = {}

                # Title
                title_tag = soup.find("title")
                if title_tag:
                    metadata["title"] = title_tag.string

                # Meta tags
                meta_tags = {}
                for meta in soup.find_all("meta"):
                    name = meta.get("name") or meta.get("property")
                    content = meta.get("content")
                    if name and content:
                        meta_tags[name] = content
                metadata["meta_tags"] = meta_tags

                result["metadata"] = metadata

            # Extract text if requested
            if extract_text:
                # Extract main content text (remove scripts, styles, etc.)
                for tag in soup(["script", "style", "noscript", "iframe"]):
                    tag.extract()

                text = soup.get_text(separator="\n", strip=True)
                result["text"] = text

                # Extract headings
                headings = []
                for level in range(1, 7):
                    for h in soup.find_all(f"h{level}"):
                        headings.append({
                            "level": level,
                            "text": h.get_text(strip=True)
                        })
                result["headings"] = headings

            # Extract data using provided selectors
            if selectors:
                extracted = {}
                for name, selector in selectors.items():
                    if isinstance(selector, str):
                        # Single element
                        element = soup.select_one(selector)
                        if element:
                            extracted[name] = element.get_text(strip=True)
                    elif isinstance(selector, dict) and "selector" in selector:
                        # Advanced configuration
                        elements = soup.select(selector["selector"])

                        if "attribute" in selector:
                            # Extract attribute value
                            attr_name = selector["attribute"]
                            values = [el.get(attr_name) for el in elements if el.get(attr_name)]
                        else:
                            # Extract text
                            values = [el.get_text(strip=True) for el in elements]

                        if "multiple" in selector and selector["multiple"]:
                            extracted[name] = values
                        elif values:
                            extracted[name] = values[0]

                result["extracted"] = extracted

            return result

        except Exception as e:
            return {
                "success": False,
                "error": f"Error parsing HTML: {str(e)}",
                "exception_type": type(e).__name__
            }

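A hedged sketch of how parse_html interprets the selectors parameter, under the same assumptions about direct instantiation and an async caller: a plain string yields the text of the first matching element, while the dict form can target attributes and return every match.

import asyncio

from me2ai_mcp.tools.web import HTMLParserTool  # import path inferred from the report header

async def demo_parse(page_html: str) -> None:
    parser = HTMLParserTool()
    result = await parser.execute({
        "html": page_html,          # e.g. the "content" field returned by fetch_webpage
        "extract_metadata": True,   # title and <meta> tags under result["metadata"]
        "extract_text": False,      # skip full-text and heading extraction
        "selectors": {
            # string form: text of the first element matching the CSS selector
            "headline": "h1",
            # dict form: attribute values from every match, returned as a list
            "image_urls": {"selector": "img", "attribute": "src", "multiple": True},
        },
    })
    if result["success"]:
        print(result["metadata"].get("title"))
        print(result["extracted"])  # {"headline": "Hello", "image_urls": ["/a.png"]}

asyncio.run(demo_parse("<html><head><title>Demo</title></head>"
                       "<body><h1>Hello</h1><img src='/a.png'></body></html>"))
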

@dataclass
class URLUtilsTool(BaseTool):
    """Tool for URL manipulation and processing."""

    name: str = "url_utils"
    description: str = "Utilities for URL manipulation and processing"

    async def execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Process and manipulate URLs.

        Args:
            params: Dictionary containing:
                - url: URL to process
                - operation: Operation to perform (parse, join, normalize)
                - base_url: Base URL for join operation
                - path: Path to join with base URL

        Returns:
            Dictionary containing operation results
        """
        url = params.get("url")
        operation = params.get("operation", "parse")

        try:
            if operation == "parse":
                if not url:
                    return {
                        "success": False,
                        "error": "URL parameter is required for parse operation"
                    }

                # Parse URL
                parsed = urllib.parse.urlparse(url)

                return {
                    "success": True,
                    "url": url,
                    "parsed": {
                        "scheme": parsed.scheme,
                        "netloc": parsed.netloc,
                        "path": parsed.path,
                        "params": parsed.params,
                        "query": parsed.query,
                        "fragment": parsed.fragment,
                        "username": parsed.username,
                        "password": parsed.password,
                        "hostname": parsed.hostname,
                        "port": parsed.port
                    },
                    "query_params": dict(urllib.parse.parse_qsl(parsed.query))
                }

            elif operation == "join":
                base_url = params.get("base_url")
                path = params.get("path")

                if not base_url or not path:
                    return {
                        "success": False,
                        "error": "base_url and path parameters are required for join operation"
                    }

                # Join URLs
                joined_url = urllib.parse.urljoin(base_url, path)

                return {
                    "success": True,
                    "base_url": base_url,
                    "path": path,
                    "joined_url": joined_url
                }

            elif operation == "normalize":
                if not url:
                    return {
                        "success": False,
                        "error": "URL parameter is required for normalize operation"
                    }

                # Normalize URL
                normalized_url = urllib.parse.urljoin(url, urllib.parse.urlparse(url).path)

                return {
                    "success": True,
                    "original_url": url,
                    "normalized_url": normalized_url
                }

            else:
                return {
                    "success": False,
                    "error": f"Unknown operation: {operation}"
                }

        except Exception as e:
            return {
                "success": False,
                "error": f"Error processing URL: {str(e)}",
                "exception_type": type(e).__name__
            }
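
Finally, a small sketch of the three url_utils operations, under the same assumptions about direct instantiation; the commented values follow from the urllib.parse calls used above.

import asyncio

from me2ai_mcp.tools.web import URLUtilsTool  # import path inferred from the report header

async def demo_urls() -> None:
    utils = URLUtilsTool()

    parsed = await utils.execute({
        "operation": "parse",
        "url": "https://user:pw@example.com:8080/a/b?x=1&y=2#frag",
    })
    # parsed["parsed"]["hostname"] == "example.com", parsed["parsed"]["port"] == 8080
    # parsed["query_params"] == {"x": "1", "y": "2"}

    joined = await utils.execute({
        "operation": "join",
        "base_url": "https://example.com/docs/",
        "path": "../api/v1",
    })
    # joined["joined_url"] == "https://example.com/api/v1"

    normalized = await utils.execute({
        "operation": "normalize",
        "url": "https://example.com/a/b?x=1#frag",
    })
    # normalized["normalized_url"] == "https://example.com/a/b"  (query and fragment dropped)

asyncio.run(demo_urls())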