Coverage for me2ai_mcp\utils.py: 0%

103 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-13 11:30 +0200

1""" 

2Utility functions for ME2AI MCP servers. 

3 

4This module provides common utility functions for processing inputs, 

5formatting responses, and handling data in MCP servers. 

6""" 

7from typing import Dict, List, Any, Optional, Union 

8import re 

9import json 

10import logging 

11from datetime import datetime 

12import textwrap 

13 

14# Optional dependencies 

15try: 

16 import bleach 

17 BLEACH_AVAILABLE = True 

18except ImportError: 

19 BLEACH_AVAILABLE = False 

20 

21try: 

22 from bs4 import BeautifulSoup 

23 BS4_AVAILABLE = True 

24except ImportError: 

25 BS4_AVAILABLE = False 

26 

27# Configure logging 

28logger = logging.getLogger("me2ai-mcp-utils") 

29 

30 

def sanitize_input(text: str, max_length: int = 10000) -> str:
    """Return a cleaned-up copy of ``text`` for safe processing.

    Non-string values are converted with ``str()`` (a warning is logged),
    the value is cut to ``max_length`` characters, ASCII control characters
    other than tab, line feed and carriage return are removed, and, when the
    optional ``bleach`` package was importable, the result is passed through
    ``bleach.clean(..., strip=True)``.

    Args:
        text: Input text
        max_length: Maximum allowed length

    Returns:
        Sanitized text
    """
    # Coerce anything that is not already a string
    if not isinstance(text, str):
        logger.warning(f"Expected string input, got {type(text)}")
        text = str(text)

    # Enforce the length limit before any further processing
    if len(text) > max_length:
        logger.warning(f"Input truncated from {len(text)} to {max_length} characters")
        text = text[:max_length]

    # Drop NUL and the remaining control characters (tab, LF and CR are kept)
    cleaned = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)

    # Without bleach there is no HTML sanitization step
    if not BLEACH_AVAILABLE:
        return cleaned

    return bleach.clean(cleaned, strip=True)

59 

60 

def format_response(
    data: Any,
    format_type: str = "auto",
    success: bool = True,
    error: Optional[str] = None
) -> Dict[str, Any]:
    """Format a standard MCP response.

    The response always contains ``success``, ``timestamp``, ``format`` and
    ``content``. ``error`` is added when an error message is supplied, and
    text responses additionally carry ``content_length`` and ``line_count``.

    Args:
        data: Response data
        format_type: Response format type (auto, text, json, html). Any
            other value is treated as ``text``.
        success: Whether the operation was successful
        error: Error message if unsuccessful; a non-empty message forces
            ``success`` to ``False``

    Returns:
        Formatted response dictionary
    """
    # Basic response structure
    response: Dict[str, Any] = {
        "success": success,
        "timestamp": datetime.now().isoformat()
    }

    # An error message always marks the response as unsuccessful
    if error:
        response["error"] = error
        response["success"] = False

    # Auto-detect format if needed
    if format_type == "auto":
        if isinstance(data, (dict, list)):
            format_type = "json"
        elif isinstance(data, str) and data.lstrip().lower().startswith(
            ("<html", "<!doctype")
        ):
            # HTML tag names and the doctype keyword are case-insensitive,
            # so compare against a lower-cased prefix.
            format_type = "html"
        else:
            format_type = "text"

    if format_type in ("json", "html"):
        # Structured and HTML payloads are passed through untouched
        response["format"] = format_type
        response["content"] = data
    else:
        # Text is the default, including for unrecognised format types, so
        # the text metadata is attached here rather than keyed off the
        # caller-supplied format_type value.
        content = str(data)
        response["format"] = "text"
        response["content"] = content
        response["content_length"] = len(content)
        response["line_count"] = len(content.split("\n"))

    return response

116 

117 

def extract_text(
    html_content: str,
    max_length: int = 10000,
    include_headings: bool = True,
    include_links: bool = True
) -> str:
    """Extract readable text from HTML content.

    When BeautifulSoup is available the document is parsed, non-content
    elements (script, style, noscript, iframe, footer) are removed, and the
    remaining text is returned, optionally preceded by a Markdown-style list
    of h1-h4 headings and followed by a list of links. Output longer than
    ``max_length`` is cut to ``max_length`` characters and ``"..."`` is
    appended.

    Without BeautifulSoup a regex fallback is used: the same non-content
    elements are dropped together with their contents, the remaining tags
    are stripped, whitespace is collapsed, and the result is cut to
    ``max_length`` characters (no ellipsis). ``include_headings`` and
    ``include_links`` have no effect in the fallback.

    Args:
        html_content: HTML content
        max_length: Maximum length of extracted text
        include_headings: Whether to include headings
        include_links: Whether to include link information

    Returns:
        Extracted text; an empty string for empty or missing input
    """
    # Nothing to extract from empty/None input
    if not html_content:
        return ""

    # Check if BeautifulSoup is available
    if not BS4_AVAILABLE:
        logger.warning("BeautifulSoup not available, falling back to basic HTML removal")
        # Remove non-content elements including their bodies first; stripping
        # only the tags would leak script/style source into the output.
        text = re.sub(
            r'<(script|style|noscript|iframe|footer)\b[^>]*>.*?</\1\s*>',
            ' ',
            html_content,
            flags=re.IGNORECASE | re.DOTALL,
        )
        # Basic HTML tag removal with regex
        text = re.sub(r'<[^>]*>', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text[:max_length]

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove elements that carry no readable content
    for element in soup(["script", "style", "noscript", "iframe", "footer"]):
        element.extract()

    # Get all text
    text = soup.get_text(separator='\n', strip=True)

    # Prepend headings with Markdown-style emphasis if requested
    if include_headings:
        headings = []
        for tag in ('h1', 'h2', 'h3', 'h4'):
            prefix = "#" * int(tag[1])  # h2 -> "##"
            for heading in soup.find_all(tag):
                heading_text = heading.get_text(strip=True)
                if heading_text:
                    headings.append(f"{prefix} {heading_text}")

        if headings:
            text = '\n\n'.join(headings) + '\n\n' + text

    # Append link information if requested
    if include_links:
        links = []
        for link in soup.find_all('a', href=True):
            link_text = link.get_text(strip=True)
            href = link['href']
            # Skip in-page anchors and javascript: pseudo-links
            if link_text and href and not href.startswith(('#', 'javascript:')):
                links.append(f"- [{link_text}]({href})")

        if links:
            text += '\n\n### Links:\n' + '\n'.join(links)

    # Normalize whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Truncate if necessary
    if len(text) > max_length:
        text = text[:max_length] + "..."

    return text

187 

188 

def summarize_text(text: str, max_length: int = 1000, preserve_sentences: bool = True) -> str:
    """Summarize text by truncation with sentence preservation.

    Text that already fits within ``max_length`` is returned unchanged.
    Otherwise, with ``preserve_sentences`` enabled, whole sentences (split
    after ``.``, ``!`` or ``?`` followed by whitespace) are kept, joined by
    single spaces, for as long as the result stays within ``max_length``.
    If not even the first sentence fits, or ``preserve_sentences`` is
    disabled, the text is cut to ``max_length`` characters and ``"..."``
    is appended.

    Args:
        text: Text to summarize
        max_length: Maximum length of summary
        preserve_sentences: Whether to preserve complete sentences

    Returns:
        Summarized text
    """
    if len(text) <= max_length:
        return text

    if preserve_sentences:
        # Split text into sentences
        sentences = re.split(r'(?<=[.!?])\s+', text)

        # Collect sentences until the next one would exceed the limit.
        # Only the separating spaces *between* sentences are counted, so a
        # sentence that fits exactly is not rejected for a trailing space.
        kept = []
        used = 0
        for sentence in sentences:
            needed = len(sentence) + (1 if kept else 0)
            if used + needed > max_length:
                break
            kept.append(sentence)
            used += needed

        summary = " ".join(kept).strip()
        if summary:
            return summary
        # No complete sentence fits: fall through to plain truncation rather
        # than returning an empty summary.

    # Simple truncation
    return text[:max_length] + "..."

219 

220 

def wrap_text_block(text: str, width: int = 80, prefix: str = "") -> str:
    """Re-flow each line of ``text`` to ``width`` columns, prefixing every line.

    The input is processed one newline-separated line at a time, so existing
    line breaks are kept. Lines that are empty or whitespace-only become
    empty lines in the output and do not receive the prefix.

    Args:
        text: Text to wrap
        width: Line width
        prefix: Prefix to add to each line

    Returns:
        Wrapped text
    """
    # One wrapper is shared by all lines; the prefix is used as the indent
    # for both the first and the continuation lines of each wrapped line.
    line_wrapper = textwrap.TextWrapper(
        width=width,
        initial_indent=prefix,
        subsequent_indent=prefix
    )

    output = []
    for raw_line in text.split('\n'):
        if raw_line.strip():
            output.append(line_wrapper.fill(raw_line))
        else:
            # Blank lines are preserved as-is, without the prefix
            output.append('')

    return '\n'.join(output)