Coverage for src/mcp_atlassian/preprocessing.py: 72%
105 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-08 18:10 +0900
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-08 18:10 +0900
1import logging
2import re
3import tempfile
4import warnings
5from pathlib import Path
7from bs4 import BeautifulSoup
8from markdownify import markdownify as md
9from md2conf.converter import (
10 ConfluenceConverterOptions,
11 ConfluenceStorageFormatConverter,
12 elements_from_string,
13 elements_to_string,
14 markdown_to_html,
15)
17logger = logging.getLogger("mcp-atlassian")
class TextPreprocessor:
    """Handles text preprocessing for Confluence and Jira content."""

    # Matches ``[text|url|smart-link]``. The character classes exclude square
    # brackets and newlines so that a match can never start at an earlier,
    # unrelated ``[...]`` span (e.g. ``[1]`` or a plain ``[text|url]`` link)
    # and swallow everything up to the smart link.
    _SMART_LINK_RE = re.compile(r"\[([^\[\]|\n]*)\|([^\[\]\n]*?)\|smart-link\]")
    # Jira issue key inside a ``/browse/`` URL; the project part may contain
    # digits and underscores after the leading letter (e.g. ``ABC2-77``).
    _ISSUE_KEY_RE = re.compile(r"browse/([A-Z][A-Z0-9_]*-\d+)")
    # Confluence page URL; captures the title slug after the numeric page id.
    _CONFLUENCE_PAGE_RE = re.compile(r"wiki/spaces/.+?/pages/\d+/(.+?)(?:\?|$)")
    # Leading issue-key prefix that is stripped from Confluence page titles.
    _TITLE_KEY_PREFIX_RE = re.compile(r"^[A-Z]+-\d+\s+")

    def __init__(self, base_url: str, confluence_client=None):
        """Store the base URL (without trailing slashes) and an optional client.

        Args:
            base_url: Base URL used when building Jira issue links.
            confluence_client: Optional client object; when set, its
                ``get_user_details_by_accountid`` method is used to resolve
                user mentions in ``process_html_content``.
        """
        self.base_url = base_url.rstrip("/")
        self.confluence_client = confluence_client

    def process_html_content(self, html_content: str, space_key: str = "") -> tuple[str, str]:
        """Replace user mentions in HTML and convert the result to markdown.

        Args:
            html_content: HTML to process.
            space_key: Currently unused; kept for interface compatibility.

        Returns:
            A ``(processed_html, processed_markdown)`` tuple.

        Raises:
            Exception: Any error raised while parsing or converting is logged
                and re-raised unchanged.
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")

            for user in soup.find_all("ri:user"):
                account_id = user.get("ri:account-id")
                # Mentions are only rewritten when there is both an id and a
                # client to resolve it with.
                if not (account_id and self.confluence_client):
                    continue

                try:
                    user_info = self.confluence_client.get_user_details_by_accountid(account_id)
                    mention = f"@{user_info.get('displayName', account_id)}"
                except Exception as e:
                    logger.warning(f"Could not fetch user info for {account_id}: {e}")
                    # Fallback: just use the account ID
                    mention = f"@user_{account_id}"

                # Replace the entire ac:link structure with the mention text.
                link_tag = user.find_parent("ac:link")
                if link_tag:
                    link_tag.replace_with(mention)

            processed_html = str(soup)
            processed_markdown = md(processed_html)
            return processed_html, processed_markdown

        except Exception as e:
            logger.error(f"Error in process_html_content: {str(e)}")
            raise

    def clean_jira_text(self, text: str) -> str:
        """Clean Jira text content.

        Steps, in order:
        1. Replace ``[~accountid:...]`` user mentions.
        2. Rewrite ``[text|url|smart-link]`` smart links as markdown links.
        3. Convert HTML to markdown when the text contains tags.

        Args:
            text: Raw Jira text; may be empty or ``None``.

        Returns:
            The cleaned text with surrounding whitespace stripped, or ``""``
            for falsy input.
        """
        if not text:
            return ""

        text = self._process_mentions(text, r"\[~accountid:(.*?)\]")
        text = self._process_smart_links(text)
        text = self._convert_html_to_markdown(text)
        return text.strip()

    def _process_mentions(self, text: str, pattern: str) -> str:
        """Replace ``[~accountid:ID]`` mentions with ``User:ID``.

        Args:
            text: Text to process.
            pattern: Regex whose captured group yields the account ids.

        Returns:
            The text with every found mention replaced.
        """
        # Note: This is a placeholder - actual user fetching should be injected.
        # dict.fromkeys de-duplicates ids while keeping first-seen order, since
        # str.replace already rewrites every occurrence of an id at once.
        for account_id in dict.fromkeys(re.findall(pattern, text)):
            text = text.replace(f"[~accountid:{account_id}]", f"User:{account_id}")
        return text

    def _process_smart_links(self, text: str) -> str:
        """Rewrite Jira/Confluence ``[text|url|smart-link]`` links as markdown.

        Args:
            text: Text that may contain smart links.

        Returns:
            The text with each smart link replaced by ``[label](url)``; all
            other content, including ordinary bracketed text, is left as is.
        """
        return self._SMART_LINK_RE.sub(self._format_smart_link, text)

    def _format_smart_link(self, match: "re.Match[str]") -> str:
        """Build the markdown replacement for one smart-link match."""
        link_text = match.group(1)
        link_url = match.group(2)

        # Jira issue link: label with the issue key and point at base_url.
        issue_key_match = self._ISSUE_KEY_RE.search(link_url)
        if issue_key_match:
            issue_key = issue_key_match.group(1)
            return f"[{issue_key}]({self.base_url}/browse/{issue_key})"

        # Confluence page link: derive a readable title from the URL slug.
        confluence_match = self._CONFLUENCE_PAGE_RE.search(link_url)
        if confluence_match:
            readable_title = confluence_match.group(1).replace("+", " ")
            readable_title = self._TITLE_KEY_PREFIX_RE.sub("", readable_title)
            return f"[{readable_title}]({link_url})"

        # Any other link: keep the label and drop the query string.
        clean_url = link_url.split("?")[0]
        return f"[{link_text}]({clean_url})"

    def _convert_html_to_markdown(self, text: str) -> str:
        """Convert HTML content to markdown if the text contains any tag.

        Args:
            text: Text that may contain HTML.

        Returns:
            Markdown produced from the HTML, or the input unchanged when it
            has no tags or the conversion fails (a warning is logged).
        """
        if not re.search(r"<[^>]+>", text):
            return text

        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=UserWarning)
                # Wrap in a div so fragments parse under a single root.
                soup = BeautifulSoup(f"<div>{text}</div>", "html.parser")
                html = str(soup.div.decode_contents()) if soup.div else text
                text = md(html)
        except Exception as e:
            logger.warning(f"Error converting HTML to markdown: {e}")
        return text
def markdown_to_confluence_storage(markdown_content):
    """
    Convert Markdown content to Confluence storage format (XHTML)

    The markdown is first rendered to HTML with ``markdown_to_html``, parsed
    into an element tree, transformed in place by
    ``ConfluenceStorageFormatConverter`` and serialized back to a string.
    If any step after the HTML rendering fails, the error is logged and the
    rendered HTML wrapped in ``<p>...</p>`` is returned instead.

    Args:
        markdown_content: The markdown content to convert

    Returns:
        String in Confluence storage format, or the ``<p>``-wrapped HTML
        fallback when the conversion fails.

    Raises:
        Exception: Errors raised by ``markdown_to_html`` itself propagate to
            the caller, since there is no HTML to fall back on.
    """
    import shutil

    # Render once; both the main path and the fallback use this HTML.
    html_content = markdown_to_html(markdown_content)

    temp_dir = None
    try:
        # The converter takes a file path and root directory; a throwaway
        # directory is supplied for them.
        temp_dir = tempfile.mkdtemp()

        # Parse the HTML into an element tree
        root = elements_from_string(html_content)

        options = ConfluenceConverterOptions(ignore_invalid_url=True, heading_anchors=True, render_mermaid=False)
        converter = ConfluenceStorageFormatConverter(
            options=options, path=Path(temp_dir) / "temp.md", root_dir=Path(temp_dir), page_metadata={}
        )

        # Transform the tree in place, then serialize it back to a string
        converter.visit(root)
        return elements_to_string(root)
    except Exception as e:
        # logger.exception records the message and the traceback in one entry.
        logger.exception(f"Error converting markdown to Confluence storage format: {e}")

        # Fall back to a simpler method that doesn't rely on the converter.
        # NOTE(review): html_content may already contain block-level tags, so
        # this wrapper can nest <p> elements - confirm Confluence accepts it.
        return f"""<p>{html_content}</p>"""
    finally:
        # Clean up the temporary directory
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)