Coverage for src/mcp_atlassian/preprocessing.py: 72%

105 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-08 18:10 +0900

1import logging 

2import re 

3import tempfile 

4import warnings 

5from pathlib import Path 

6 

7from bs4 import BeautifulSoup 

8from markdownify import markdownify as md 

9from md2conf.converter import ( 

10 ConfluenceConverterOptions, 

11 ConfluenceStorageFormatConverter, 

12 elements_from_string, 

13 elements_to_string, 

14 markdown_to_html, 

15) 

16 

17logger = logging.getLogger("mcp-atlassian") 

18 

19 

class TextPreprocessor:
    """Handles text preprocessing for Confluence and Jira content.

    Attributes:
        base_url: Site base URL with any trailing ``/`` removed; used to build
            canonical Jira issue links.
        confluence_client: Optional client used to resolve account IDs to
            display names. Only ``get_user_details_by_accountid`` is called
            on it.
    """

    # Matches ``[text|url|smart-link]``. Brackets and newlines are excluded
    # from both groups so a match can never start at an earlier, unrelated
    # ``[`` (e.g. ``[PROJ-1] and [PROJ-2|url|smart-link]``).
    _SMART_LINK_RE = re.compile(r"\[([^\[\]\n]*?)\|([^\[\]\n]*?)\|smart-link\]")
    # Jira issue URL: captures the issue key after ``browse/``.
    _ISSUE_KEY_RE = re.compile(r"browse/([A-Z]+-\d+)")
    # Confluence page URL: captures the title slug after the numeric page id.
    _CONFLUENCE_PAGE_RE = re.compile(r"wiki/spaces/.+?/pages/\d+/(.+?)(?:\?|$)")
    # Issue-key prefix that is stripped from the start of a page title.
    _LEADING_ISSUE_KEY_RE = re.compile(r"^[A-Z]+-\d+\s+")

    def __init__(self, base_url: str, confluence_client=None):
        """Store the normalized base URL and the optional Confluence client."""
        self.base_url = base_url.rstrip("/")
        self.confluence_client = confluence_client

    def process_html_content(self, html_content: str, space_key: str = "") -> tuple[str, str]:
        """Replace Confluence user references with @mentions and build markdown.

        Every ``ri:user`` element that carries an ``ri:account-id`` has its
        enclosing ``ac:link`` replaced by ``@<display name>``. Mentions are
        only rewritten when a Confluence client was supplied.

        Args:
            html_content: HTML to process.
            space_key: Currently unused; kept for interface compatibility.

        Returns:
            A ``(processed_html, processed_markdown)`` tuple.

        Raises:
            Exception: Any parsing/conversion error is logged and re-raised.
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")

            for user in soup.find_all("ri:user"):
                account_id = user.get("ri:account-id")
                if not (account_id and self.confluence_client):
                    continue

                try:
                    # Fetch user info using the Confluence API
                    user_info = self.confluence_client.get_user_details_by_accountid(account_id)
                    mention = f"@{user_info.get('displayName', account_id)}"
                except Exception as e:
                    logger.warning(f"Could not fetch user info for {account_id}: {e}")
                    # Fallback: just use the account ID
                    mention = f"@user_{account_id}"

                # Replace the entire ac:link structure with the mention text
                link_tag = user.find_parent("ac:link")
                if link_tag:
                    link_tag.replace_with(mention)

            processed_html = str(soup)
            processed_markdown = md(processed_html)
            return processed_html, processed_markdown

        except Exception as e:
            logger.error(f"Error in process_html_content: {str(e)}")
            raise

    def clean_jira_text(self, text: str) -> str:
        """Clean Jira text content.

        Steps, in order:
        1. Rewrite ``[~accountid:...]`` user mentions.
        2. Rewrite ``[text|url|smart-link]`` smart links as markdown links.
        3. Convert HTML to markdown when the text contains tags.

        Args:
            text: Raw Jira text; a falsy value yields ``""``.

        Returns:
            The cleaned text with surrounding whitespace stripped.
        """
        if not text:
            return ""

        mention_pattern = r"\[~accountid:(.*?)\]"
        text = self._process_mentions(text, mention_pattern)
        text = self._process_smart_links(text)
        text = self._convert_html_to_markdown(text)
        return text.strip()

    def _process_mentions(self, text: str, pattern: str) -> str:
        """Replace ``[~accountid:<id>]`` mentions with ``User:<id>``.

        Args:
            text: Text to scan.
            pattern: Regex whose first group captures the account id.

        Returns:
            The text with each found mention replaced.
        """
        for account_id in re.findall(pattern, text):
            # Note: This is a placeholder - actual user fetching should be injected
            text = text.replace(f"[~accountid:{account_id}]", f"User:{account_id}")
        return text

    def _process_smart_links(self, text: str) -> str:
        """Rewrite Jira/Confluence smart links as markdown links.

        - Jira issue URLs become ``[KEY](<base_url>/browse/KEY)``.
        - Confluence page URLs become ``[Title](url)``, where the title comes
          from the URL slug (``+`` turned into spaces, a leading issue key
          dropped).
        - Anything else becomes ``[text](url-without-query-string)``.

        Args:
            text: Text that may contain ``[text|url|smart-link]`` markup.

        Returns:
            The text with every smart link replaced.
        """

        def _to_markdown(match):
            link_text = match.group(1)
            link_url = match.group(2)

            issue_key_match = self._ISSUE_KEY_RE.search(link_url)
            if issue_key_match:
                issue_key = issue_key_match.group(1)
                return f"[{issue_key}]({self.base_url}/browse/{issue_key})"

            confluence_match = self._CONFLUENCE_PAGE_RE.search(link_url)
            if confluence_match:
                readable_title = confluence_match.group(1).replace("+", " ")
                readable_title = self._LEADING_ISSUE_KEY_RE.sub("", readable_title)
                return f"[{readable_title}]({link_url})"

            clean_url = link_url.split("?")[0]
            return f"[{link_text}]({clean_url})"

        # A callable replacement is inserted literally (no backslash escapes).
        return self._SMART_LINK_RE.sub(_to_markdown, text)

    def _convert_html_to_markdown(self, text: str) -> str:
        """Convert HTML content to markdown if the text contains any tag.

        Conversion is best-effort: on failure a warning is logged and the
        input text is returned unchanged.
        """
        if re.search(r"<[^>]+>", text):
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    # Wrap in a div so fragments parse under a single root.
                    soup = BeautifulSoup(f"<div>{text}</div>", "html.parser")
                    html = str(soup.div.decode_contents()) if soup.div else text
                    text = md(html)
            except Exception as e:
                logger.warning(f"Error converting HTML to markdown: {e}")
        return text

138 

139 

def markdown_to_confluence_storage(markdown_content: str) -> str:
    """
    Convert Markdown content to Confluence storage format (XHTML)

    This function uses the markdown-to-confluence library to properly convert
    Markdown to Confluence storage format. The library handles proper formatting of:
    - Headings with anchors
    - Text formatting (bold, italic, etc.)
    - Lists (ordered and unordered)
    - Code blocks with syntax highlighting
    - Tables and other Markdown elements

    If the storage-format conversion fails, the error is logged and the plain
    HTML rendering wrapped in ``<p>...</p>`` is returned instead.

    Args:
        markdown_content: The markdown content to convert

    Returns:
        String in Confluence storage format (XHTML with Confluence macros)
    """
    import shutil

    # Convert markdown to HTML once; the fallback below reuses this result
    # rather than running the conversion a second time.
    html_content = markdown_to_html(markdown_content)

    try:
        # Create a temporary directory for any potential attachments
        temp_dir = tempfile.mkdtemp()
        try:
            # Parse the HTML into an element tree
            root = elements_from_string(html_content)

            options = ConfluenceConverterOptions(
                ignore_invalid_url=True, heading_anchors=True, render_mermaid=False
            )
            converter = ConfluenceStorageFormatConverter(
                options=options,
                path=Path(temp_dir) / "temp.md",
                root_dir=Path(temp_dir),
                page_metadata={},
            )

            # Transform the tree in place, then serialize it back to a string
            converter.visit(root)
            return elements_to_string(root)
        finally:
            # Clean up the temporary directory; cleanup errors are ignored
            shutil.rmtree(temp_dir, ignore_errors=True)

    except Exception as e:
        # A single record carrying both the message and the traceback.
        logger.exception(f"Error converting markdown to Confluence storage format: {e}")

        # Fall back to a simpler method that doesn't rely on the HTML macro
        return f"""<p>{html_content}</p>"""

200 

201 return storage_format