Coverage for me2ai_mcp\tools\web.py: 0%
140 statements
coverage.py v7.8.0, created at 2025-04-13 11:30 +0200
"""
Web-related tools for ME2AI MCP servers.

This module provides common tools for web content fetching, scraping,
and processing that can be used across different MCP servers.
"""
from typing import Dict, List, Any, Optional
import logging
import re
import urllib.parse
from dataclasses import dataclass
import requests
from ..base import BaseTool

# Optional dependencies
try:
    from bs4 import BeautifulSoup
    BS4_AVAILABLE = True
except ImportError:
    BS4_AVAILABLE = False
    logging.warning("BeautifulSoup not available, some web tools will have limited functionality")

# Configure logging
logger = logging.getLogger("me2ai-mcp-tools-web")

@dataclass
class WebFetchTool(BaseTool):
    """Tool for fetching web content."""

    name: str = "fetch_webpage"
    description: str = "Fetch content from a web page"
    user_agent: str = "ME2AI Web Fetcher/1.0"
    timeout: int = 30
    max_content_length: int = 1024 * 1024  # 1MB

    async def execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch a webpage and return its content.

        Args:
            params: Dictionary containing:
                - url: URL to fetch
                - headers: Optional additional HTTP headers
                - timeout: Optional custom timeout in seconds

        Returns:
            Dictionary containing fetch results
        """
        url = params.get("url")
        if not url:
            return {
                "success": False,
                "error": "URL parameter is required"
            }

        # Validate URL
        if not url.startswith(("http://", "https://")):
            return {
                "success": False,
                "error": f"Invalid URL scheme: {url}"
            }

        # Prepare headers
        headers = {
            "User-Agent": self.user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5"
        }

        # Add custom headers if provided
        if "headers" in params and isinstance(params["headers"], dict):
            headers.update(params["headers"])

        # Get timeout
        timeout = params.get("timeout", self.timeout)

        try:
            # Fetch the URL
            response = requests.get(
                url,
                headers=headers,
                timeout=timeout,
                stream=True  # Use streaming to handle large responses
            )

            # Check status code
            response.raise_for_status()

            # Check content type
            content_type = response.headers.get("Content-Type", "").lower()
            if not any(ct in content_type for ct in ["text/html", "text/plain", "application/json", "application/xml"]):
                return {
                    "success": False,
                    "error": f"Unsupported content type: {content_type}"
                }

            # Check content length
            content_length = int(response.headers.get("Content-Length", 0))
            if content_length > self.max_content_length:
                return {
                    "success": False,
                    "error": f"Content too large: {content_length} bytes (max {self.max_content_length})"
                }

            # Read the response body (size was checked above via Content-Length when provided)
            content = response.text

            # Extract basic info
            title = ""
            if BS4_AVAILABLE and "text/html" in content_type:
                soup = BeautifulSoup(content, "html.parser")
                title_tag = soup.find("title")
                if title_tag:
                    title = title_tag.string

            # Return results
            return {
                "success": True,
                "url": url,
                "status_code": response.status_code,
                "content_type": content_type,
                "content_length": len(content),
                "title": title,
                "content": content,
                "headers": dict(response.headers)
            }

        except requests.RequestException as e:
            return {
                "success": False,
                "error": f"Request error: {str(e)}",
                "exception_type": type(e).__name__
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Error fetching webpage: {str(e)}",
                "exception_type": type(e).__name__
            }
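
# Usage sketch (editor's illustration, not part of the original module). It shows how the
# params dictionary documented in execute() is typically passed. It assumes BaseTool adds
# no required constructor arguments beyond the dataclass fields above, which this file does
# not confirm; "https://example.com" and demo_fetch are placeholders.
#
#     import asyncio
#
#     async def demo_fetch() -> None:
#         tool = WebFetchTool()
#         result = await tool.execute({"url": "https://example.com", "timeout": 10})
#         if result["success"]:
#             print(result["title"], result["content_length"])
#         else:
#             print("fetch failed:", result["error"])
#
#     asyncio.run(demo_fetch())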

@dataclass
class HTMLParserTool(BaseTool):
    """Tool for parsing and extracting information from HTML content."""

    name: str = "parse_html"
    description: str = "Parse and extract structured data from HTML content"

    async def execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Parse HTML and extract structured information.

        Args:
            params: Dictionary containing:
                - html: HTML content to parse
                - selectors: Optional dictionary of CSS selectors to extract
                - extract_metadata: Whether to extract metadata (default: True)
                - extract_text: Whether to extract main text (default: True)

        Returns:
            Dictionary containing parse results
        """
        if not BS4_AVAILABLE:
            return {
                "success": False,
                "error": "BeautifulSoup is not available"
            }

        html = params.get("html")
        if not html:
            return {
                "success": False,
                "error": "HTML parameter is required"
            }

        selectors = params.get("selectors", {})
        extract_metadata = params.get("extract_metadata", True)
        extract_text = params.get("extract_text", True)

        try:
            # Parse HTML
            soup = BeautifulSoup(html, "html.parser")

            result = {
                "success": True,
            }

            # Extract metadata if requested
            if extract_metadata:
                metadata = {}

                # Title
                title_tag = soup.find("title")
                if title_tag:
                    metadata["title"] = title_tag.string

                # Meta tags
                meta_tags = {}
                for meta in soup.find_all("meta"):
                    name = meta.get("name") or meta.get("property")
                    content = meta.get("content")
                    if name and content:
                        meta_tags[name] = content
                metadata["meta_tags"] = meta_tags

                result["metadata"] = metadata

            # Extract text if requested
            if extract_text:
                # Extract main content text (remove scripts, styles, etc.)
                for tag in soup(["script", "style", "noscript", "iframe"]):
                    tag.extract()

                text = soup.get_text(separator="\n", strip=True)
                result["text"] = text

                # Extract headings
                headings = []
                for level in range(1, 7):
                    for h in soup.find_all(f"h{level}"):
                        headings.append({
                            "level": level,
                            "text": h.get_text(strip=True)
                        })
                result["headings"] = headings

            # Extract data using provided selectors
            if selectors:
                extracted = {}
                for name, selector in selectors.items():
                    if isinstance(selector, str):
                        # Single element
                        element = soup.select_one(selector)
                        if element:
                            extracted[name] = element.get_text(strip=True)
                    elif isinstance(selector, dict) and "selector" in selector:
                        # Advanced configuration
                        elements = soup.select(selector["selector"])

                        if "attribute" in selector:
                            # Extract attribute value
                            attr_name = selector["attribute"]
                            values = [el.get(attr_name) for el in elements if el.get(attr_name)]
                        else:
                            # Extract text
                            values = [el.get_text(strip=True) for el in elements]

                        if "multiple" in selector and selector["multiple"]:
                            extracted[name] = values
                        elif values:
                            extracted[name] = values[0]

                result["extracted"] = extracted

            return result

        except Exception as e:
            return {
                "success": False,
                "error": f"Error parsing HTML: {str(e)}",
                "exception_type": type(e).__name__
            }
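
# Usage sketch (editor's illustration, not part of the original module). It shows the two
# selector forms execute() accepts: a plain CSS selector string for a single text value, and
# a dict with "selector" plus optional "attribute" and "multiple" keys. It assumes BaseTool
# needs no constructor arguments; the sample HTML and demo_parse are placeholders.
#
#     import asyncio
#
#     async def demo_parse() -> None:
#         html = (
#             "<html><head><title>Demo</title></head>"
#             "<body><h1>Hi</h1><a href='/a'>A</a><a href='/b'>B</a></body></html>"
#         )
#         tool = HTMLParserTool()
#         result = await tool.execute({
#             "html": html,
#             "selectors": {
#                 "first_heading": "h1",  # single element, text content
#                 "links": {"selector": "a", "attribute": "href", "multiple": True},
#             },
#         })
#         print(result["metadata"]["title"])   # "Demo"
#         print(result["extracted"]["links"])  # ["/a", "/b"]
#
#     asyncio.run(demo_parse())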

@dataclass
class URLUtilsTool(BaseTool):
    """Tool for URL manipulation and processing."""

    name: str = "url_utils"
    description: str = "Utilities for URL manipulation and processing"

    async def execute(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Process and manipulate URLs.

        Args:
            params: Dictionary containing:
                - url: URL to process
                - operation: Operation to perform (parse, join, normalize)
                - base_url: Base URL for join operation
                - path: Path to join with base URL

        Returns:
            Dictionary containing operation results
        """
        url = params.get("url")
        operation = params.get("operation", "parse")

        try:
            if operation == "parse":
                if not url:
                    return {
                        "success": False,
                        "error": "URL parameter is required for parse operation"
                    }

                # Parse URL
                parsed = urllib.parse.urlparse(url)

                return {
                    "success": True,
                    "url": url,
                    "parsed": {
                        "scheme": parsed.scheme,
                        "netloc": parsed.netloc,
                        "path": parsed.path,
                        "params": parsed.params,
                        "query": parsed.query,
                        "fragment": parsed.fragment,
                        "username": parsed.username,
                        "password": parsed.password,
                        "hostname": parsed.hostname,
                        "port": parsed.port
                    },
                    "query_params": dict(urllib.parse.parse_qsl(parsed.query))
                }

            elif operation == "join":
                base_url = params.get("base_url")
                path = params.get("path")

                if not base_url or not path:
                    return {
                        "success": False,
                        "error": "base_url and path parameters are required for join operation"
                    }

                # Join URLs
                joined_url = urllib.parse.urljoin(base_url, path)

                return {
                    "success": True,
                    "base_url": base_url,
                    "path": path,
                    "joined_url": joined_url
                }

            elif operation == "normalize":
                if not url:
                    return {
                        "success": False,
                        "error": "URL parameter is required for normalize operation"
                    }

                # Normalize URL (strip query string and fragment)
                normalized_url = urllib.parse.urljoin(url, urllib.parse.urlparse(url).path)

                return {
                    "success": True,
                    "original_url": url,
                    "normalized_url": normalized_url
                }

            else:
                return {
                    "success": False,
                    "error": f"Unknown operation: {operation}"
                }

        except Exception as e:
            return {
                "success": False,
                "error": f"Error processing URL: {str(e)}",
                "exception_type": type(e).__name__
            }
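
# Usage sketch (editor's illustration, not part of the original module). It exercises the three
# operations handled above; the expected values in the trailing comments follow from standard
# urllib.parse behavior. It assumes BaseTool needs no constructor arguments; the example.com
# URLs and demo_urls are placeholders.
#
#     import asyncio
#
#     async def demo_urls() -> None:
#         tool = URLUtilsTool()
#
#         parsed = await tool.execute({
#             "operation": "parse",
#             "url": "https://example.com/docs?page=2#top",
#         })
#         print(parsed["parsed"]["hostname"])  # "example.com"
#         print(parsed["query_params"])        # {"page": "2"}
#
#         joined = await tool.execute({
#             "operation": "join",
#             "base_url": "https://example.com/docs/",
#             "path": "../api/index.html",
#         })
#         print(joined["joined_url"])          # "https://example.com/api/index.html"
#
#         normalized = await tool.execute({
#             "operation": "normalize",
#             "url": "https://example.com/docs?page=2#top",
#         })
#         print(normalized["normalized_url"])  # "https://example.com/docs"
#
#     asyncio.run(demo_urls())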