Coverage for src/mcp_atlassian/preprocessing.py: 73%
239 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-10 03:26 +0900
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-10 03:26 +0900
1import logging
2import re
3import tempfile
4import warnings
5from pathlib import Path
6from typing import Any, Protocol
8import requests
9from bs4 import BeautifulSoup, Tag
10from markdownify import markdownify as md
11from md2conf.converter import (
12 ConfluenceConverterOptions,
13 ConfluenceStorageFormatConverter,
14 elements_from_string,
15 elements_to_string,
16 markdown_to_html,
17)
# Module logger, registered under the fixed name "mcp-atlassian" rather than
# ``__name__``.
logger = logging.getLogger("mcp-atlassian")
class ConfluenceClient(Protocol):
    """Structural type for the Confluence client used for user lookups.

    Any object that provides ``get_user_details_by_accountid`` with a
    compatible signature satisfies this protocol; no inheritance is needed.
    """

    def get_user_details_by_accountid(self, account_id: str) -> dict[str, Any]:
        """
        Look up a user's details from their account ID.

        Args:
            account_id: The account ID of the user to look up

        Returns:
            Dictionary of user details
        """
        ...
class TextPreprocessor:
    """Handles text preprocessing for Confluence and Jira content.

    Provides conversion between Jira wiki markup and Markdown in both
    directions, replacement of user mentions and smart links, and
    HTML-to-Markdown conversion of Confluence page content.
    """

    def __init__(
        self, base_url: str, confluence_client: ConfluenceClient | None = None
    ) -> None:
        """
        Initialize the text preprocessor.

        Args:
            base_url: Base URL for Confluence or Jira; trailing slashes are
                removed before it is stored
            confluence_client: Optional Confluence client for user lookups
        """
        self.base_url = base_url.rstrip("/")
        self.confluence_client = confluence_client

    def process_html_content(
        self, html_content: str, space_key: str = ""
    ) -> tuple[str, str]:
        """
        Process HTML content, replacing user mentions with readable text.

        Args:
            html_content: The HTML content to process
            space_key: Optional space key for context (not used by this method)

        Returns:
            Tuple of (processed_html, processed_markdown)

        Raises:
            Exception: Any error raised while parsing or converting is logged
                and then re-raised unchanged
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")

            # Mentions are rewritten in place inside the parsed tree
            self._process_user_mentions_in_soup(soup)

            processed_html = str(soup)
            processed_markdown = md(processed_html)

            return processed_html, processed_markdown

        except Exception as e:
            logger.error(f"Error in process_html_content: {str(e)}")
            raise

    def _process_user_mentions_in_soup(self, soup: BeautifulSoup) -> None:
        """
        Process user mentions in BeautifulSoup object.

        Every ``ac:link`` element that contains an ``ri:user`` child with a
        string ``ri:account-id`` attribute is replaced by a text mention. The
        presence of an ``ac:link-body`` does not affect the outcome.

        Args:
            soup: BeautifulSoup object containing HTML; modified in place
        """
        for user_element in soup.find_all("ac:link"):
            user_ref = user_element.find("ri:user")
            if user_ref is None:
                continue

            account_id = user_ref.get("ri:account-id")
            # Only a non-empty string can be used as an account ID
            if account_id and isinstance(account_id, str):
                self._replace_user_mention(user_element, account_id)

    def _replace_user_mention(self, user_element: Tag, account_id: str) -> None:
        """
        Replace a user mention with the user's display name.

        Falls back to ``@user_<account_id>`` when there is no Confluence
        client, when the lookup yields no display name, or when the lookup
        raises. Lookup errors are logged as warnings and never propagated.

        Args:
            user_element: The HTML element containing the user mention
            account_id: The user's account ID
        """
        try:
            # Only attempt to get user details if we have a valid confluence client
            if self.confluence_client is not None:
                user_details = self.confluence_client.get_user_details_by_accountid(
                    account_id
                )
                display_name = user_details.get("displayName", "")
                if display_name:
                    user_element.replace_with(f"@{display_name}")
                    return
            # No client, or the lookup produced no display name
            self._use_fallback_user_mention(user_element, account_id)
        except KeyError as e:
            logger.warning(f"Missing key in user details for {account_id}: {str(e)}")
            self._use_fallback_user_mention(user_element, account_id)
        except (AttributeError, TypeError) as e:
            logger.warning(f"Error parsing user data for {account_id}: {str(e)}")
            self._use_fallback_user_mention(user_element, account_id)
        except requests.RequestException as e:
            logger.warning(
                f"Network error fetching user details for {account_id}: {str(e)}"
            )
            self._use_fallback_user_mention(user_element, account_id)
        except Exception as e:  # noqa: BLE001 - Intentional fallback with logging
            logger.warning(f"Unexpected error processing user mention: {str(e)}")
            logger.debug("Full exception details for user mention:", exc_info=True)
            self._use_fallback_user_mention(user_element, account_id)

    def _use_fallback_user_mention(self, user_element: Tag, account_id: str) -> None:
        """
        Replace user mention with a fallback built from the account ID.

        Args:
            user_element: The HTML element containing the user mention
            account_id: The user's account ID
        """
        user_element.replace_with(f"@user_{account_id}")

    def clean_jira_text(self, text: str) -> str:
        """
        Clean Jira text content.

        The steps run in this order:
            1. Replace user mentions
            2. Rewrite smart links as Markdown links
            3. Convert Jira markup to Markdown
            4. Convert any remaining HTML to Markdown

        Args:
            text: Raw Jira text

        Returns:
            The cleaned text with surrounding whitespace stripped, or an empty
            string when ``text`` is empty
        """
        if not text:
            return ""

        mention_pattern = r"\[~accountid:(.*?)\]"
        text = self._process_mentions(text, mention_pattern)
        text = self._process_smart_links(text)

        # Jira markup first, so that the HTML it emits (<ins>, <sup>, ...) is
        # also seen by the HTML conversion below
        text = self.jira_to_markdown(text)
        text = self._convert_html_to_markdown(text)

        return text.strip()

    def _process_mentions(self, text: str, pattern: str) -> str:
        """
        Process user mentions in text.

        Args:
            text: The text containing mentions
            pattern: Regular expression pattern to match mentions; its capture
                group is taken as the account ID

        Returns:
            Text with each ``[~accountid:<id>]`` replaced by ``User:<id>``
        """
        mentions = re.findall(pattern, text)
        for account_id in mentions:
            try:
                # Note: This is a placeholder - actual user fetching should be injected
                display_name = f"User:{account_id}"
                text = text.replace(f"[~accountid:{account_id}]", display_name)
            except (TypeError, ValueError) as e:
                logger.error(f"Error formatting mention for {account_id}: {str(e)}")
            except re.error as e:
                logger.error(
                    f"Regex error processing mention for {account_id}: {str(e)}"
                )
            except Exception as e:  # noqa: BLE001 - Intentional fallback with logging
                logger.error(
                    f"Unexpected error processing mention for {account_id}: {str(e)}"
                )
                logger.debug(
                    "Full exception details for mention processing:", exc_info=True
                )
        return text

    def _process_smart_links(self, text: str) -> str:
        """
        Process Jira/Confluence smart links of the form ``[text|url|smart-link]``.

        Jira issue links become ``[KEY](<base_url>/browse/KEY)``, Confluence
        page links use the page title taken from the URL as link text, and any
        other link keeps its text and loses its query string.

        Args:
            text: Text that may contain smart links

        Returns:
            Text with smart links rewritten as Markdown links
        """
        link_pattern = r"\[(.*?)\|(.*?)\|smart-link\]"

        # finditer scans the original string object, so rebinding ``text``
        # inside the loop does not disturb the iteration
        for match in re.finditer(link_pattern, text):
            full_match = match.group(0)
            link_text = match.group(1)
            link_url = match.group(2)

            issue_key_match = re.search(r"browse/([A-Z]+-\d+)", link_url)
            confluence_match = re.search(
                r"wiki/spaces/.+?/pages/\d+/(.+?)(?:\?|$)", link_url
            )

            if issue_key_match:
                issue_key = issue_key_match.group(1)
                clean_url = f"{self.base_url}/browse/{issue_key}"
                text = text.replace(full_match, f"[{issue_key}]({clean_url})")
            elif confluence_match:
                url_title = confluence_match.group(1)
                readable_title = url_title.replace("+", " ")
                # Drop a leading issue key such as "PROJ-123 " from the title
                readable_title = re.sub(r"^[A-Z]+-\d+\s+", "", readable_title)
                text = text.replace(full_match, f"[{readable_title}]({link_url})")
            else:
                clean_url = link_url.split("?")[0]
                text = text.replace(full_match, f"[{link_text}]({clean_url})")

        return text

    def _convert_html_to_markdown(self, text: str) -> str:
        """
        Convert HTML content to markdown if needed.

        Text without anything that looks like a tag is returned untouched.
        Conversion errors are logged as warnings and the text is returned as
        it was before the failed conversion.

        Args:
            text: Text that may contain HTML

        Returns:
            Markdown text, or the input when no conversion happened
        """
        if re.search(r"<[^>]+>", text):
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    # Wrap in a <div> so fragments parse under a single root
                    soup = BeautifulSoup(f"<div>{text}</div>", "html.parser")
                    html = str(soup.div.decode_contents()) if soup.div else text
                    text = md(html)
            except (AttributeError, TypeError) as e:
                # Handle parsing errors in BeautifulSoup
                logger.warning(f"HTML parsing error during conversion to markdown: {e}")
            except ImportError as e:
                # Handle missing dependencies
                logger.warning(
                    f"Missing dependency for HTML to markdown conversion: {e}"
                )
            except (ValueError, NameError) as e:
                # Handle value or name errors
                logger.warning(
                    f"Error in values during HTML to markdown conversion: {e}"
                )
            except Exception as e:  # noqa: BLE001 - Intentional fallback with logging
                # Handle other unexpected errors
                logger.warning(f"Unexpected error converting HTML to markdown: {e}")
                logger.debug(
                    "Full exception details for HTML conversion:", exc_info=True
                )
        return text

    def jira_to_markdown(self, input_text: str) -> str:
        """
        Convert Jira markup to Markdown format.

        The substitutions are applied one after another on the whole text, so
        their order matters. Constructs without a Markdown equivalent
        (citation, inserted text, superscript, subscript, colored text) are
        emitted as inline HTML.

        Args:
            input_text: Text in Jira markup format

        Returns:
            Text in Markdown format, or an empty string for empty input
        """
        if not input_text:
            return ""

        def convert_emphasis(match: re.Match) -> str:
            # Jira *bold* -> Markdown **bold**; Jira _italic_ -> Markdown *italic*
            marker = "**" if match.group(1) == "*" else "*"
            return marker + match.group(2) + marker

        def convert_header(match: re.Match) -> str:
            return "#" * int(match.group(1)) + match.group(2)

        def convert_quote(match: re.Match) -> str:
            return "\n".join(f"> {line}" for line in match.group(1).split("\n"))

        # Block quotes
        output = re.sub(r"^bq\.(.*?)$", r"> \1\n", input_text, flags=re.MULTILINE)

        # Text formatting (bold, italic)
        output = re.sub(r"([*_])(.*?)\1", convert_emphasis, output)

        # Multi-level lists (numbered and bulleted)
        output = re.sub(
            r"^((?:#|-|\+|\*)+) (.*)$",
            self._convert_jira_list_to_markdown,
            output,
            flags=re.MULTILINE,
        )

        # Headers
        output = re.sub(
            r"^h([0-6])\.(.*)$", convert_header, output, flags=re.MULTILINE
        )

        # Inline code
        output = re.sub(r"\{\{([^}]+)\}\}", r"`\1`", output)

        # Citation
        output = re.sub(r"\?\?((?:.[^?]|[^?].)+)\?\?", r"<cite>\1</cite>", output)

        # Inserted text
        output = re.sub(r"\+([^+]*)\+", r"<ins>\1</ins>", output)

        # Superscript
        output = re.sub(r"\^([^^]*)\^", r"<sup>\1</sup>", output)

        # Subscript
        output = re.sub(r"~([^~]*)~", r"<sub>\1</sub>", output)

        # Strikethrough
        # NOTE(review): this writes back exactly what it matched, so it is a
        # no-op. Markdown strikethrough would be ~~text~~, but this pattern
        # also matches hyphens in ordinary prose (issue keys, dates), so the
        # behavior is deliberately left unchanged here.
        output = re.sub(r"-([^-]*)-", r"-\1-", output)

        # Code blocks with optional language specification
        output = re.sub(
            r"\{code(?::([a-z]+))?\}([\s\S]*?)\{code\}",
            r"```\1\n\2\n```",
            output,
            flags=re.MULTILINE,
        )

        # No format
        output = re.sub(r"\{noformat\}([\s\S]*?)\{noformat\}", r"```\n\1\n```", output)

        # Quote blocks; lazy so that two separate blocks are not merged into
        # one together with the text between them
        output = re.sub(
            r"\{quote\}([\s\S]*?)\{quote\}",
            convert_quote,
            output,
            flags=re.MULTILINE,
        )

        # Images with alt text
        output = re.sub(
            r"!([^|\n\s]+)\|([^\n!]*)alt=([^\n!\,]+?)(,([^\n!]*))?!",
            r"![\3](\1)",
            output,
        )

        # Images with other parameters (ignore them)
        output = re.sub(r"!([^|\n\s]+)\|([^\n!]*)!", r"![](\1)", output)

        # Images without parameters
        output = re.sub(r"!([^\n\s!]+)!", r"![](\1)", output)

        # Links
        output = re.sub(r"\[([^|]+)\|(.+?)\]", r"[\1](\2)", output)
        output = re.sub(r"\[(.+?)\]([^\(]+)", r"<\1>\2", output)

        # Colored text; the quotes must not be backslash-escaped in the
        # template, because re would copy the backslashes into the output
        output = re.sub(
            r"\{color:([^}]+)\}([\s\S]*?)\{color\}",
            r'<span style="color:\1">\2</span>',
            output,
            flags=re.MULTILINE,
        )

        # Convert Jira table headers (||) to markdown table format
        lines = output.split("\n")
        i = 0
        while i < len(lines):
            if "||" in lines[i]:
                lines[i] = lines[i].replace("||", "|")

                # Markdown needs a separator row right below the header row
                header_cells = lines[i].count("|") - 1
                if header_cells > 0:
                    lines.insert(i + 1, "|" + "---|" * header_cells)
                    i += 1  # Skip the newly inserted line in next iteration

            i += 1

        return "\n".join(lines)

    def markdown_to_jira(self, input_text: str) -> str:
        """
        Convert Markdown syntax to Jira markup syntax.

        Fenced code blocks and inline code are converted first and held
        behind placeholders until every other substitution has run, so their
        contents are never rewritten as if they were Markdown.

        Args:
            input_text: Text in Markdown format

        Returns:
            Text in Jira markup format, or an empty string for empty input
        """
        if not input_text:
            return ""

        # Converted code snippets, indexed by the number in their placeholder
        protected: list[str] = []

        def stash(jira_code: str) -> str:
            """Store converted code and return the placeholder standing for it."""
            protected.append(jira_code)
            # NUL delimiters cannot be touched by any substitution below
            return f"\x00CODE{len(protected) - 1}\x00"

        def save_code_block(match: re.Match) -> str:
            """Convert a fenced code block to {code} and stash it."""
            syntax = match.group(1) or ""
            opening = "{code:" + syntax + "}" if syntax else "{code}"
            return stash(opening + match.group(2) + "{code}")

        def save_inline_code(match: re.Match) -> str:
            """Convert inline code to {{...}} and stash it."""
            return stash("{{" + match.group(1) + "}}")

        def restore(match: re.Match) -> str:
            """Swap a placeholder back for the code it stands for."""
            index = int(match.group(1))
            # Text in the input that merely looks like a placeholder is kept
            return protected[index] if index < len(protected) else match.group(0)

        def convert_underlined_header(match: re.Match) -> str:
            level = 1 if match.group(2)[0] == "=" else 2
            return f"h{level}. {match.group(1)}"

        def convert_emphasis(match: re.Match) -> str:
            # A single marker is italic (_), anything longer is bold (*)
            marker = "_" if len(match.group(1)) == 1 else "*"
            return marker + match.group(2) + marker

        def convert_bullet(match: re.Match) -> str:
            leading = match.group(1)
            if not leading:
                return "* " + match.group(2)
            return " " * (len(leading) // 2) + "* " + match.group(2)

        def convert_numbered(match: re.Match) -> str:
            depth = int(len(match.group(1)) / 4) + 2
            return "#" * depth + " " + match.group(2)

        # Take code out of the text before anything else runs
        output = re.sub(r"```(\w*)\n([\s\S]+?)```", save_code_block, input_text)
        output = re.sub(r"`([^`]+)`", save_inline_code, output)

        # Headers with = or - underlines
        output = re.sub(
            r"^(.*?)\n([=-])+$",
            convert_underlined_header,
            output,
            flags=re.MULTILINE,
        )

        # Headers with # prefix
        output = re.sub(
            r"^([#]+)(.*?)$",
            lambda match: f"h{len(match.group(1))}." + match.group(2),
            output,
            flags=re.MULTILINE,
        )

        # Bold and italic
        output = re.sub(r"([*_]+)(.*?)\1", convert_emphasis, output)

        # Multi-level bulleted list
        output = re.sub(r"^(\s*)- (.*)$", convert_bullet, output, flags=re.MULTILINE)

        # Multi-level numbered list
        output = re.sub(
            r"^(\s+)1\. (.*)$", convert_numbered, output, flags=re.MULTILINE
        )

        # HTML formatting tags to Jira markup
        tag_map = {"cite": "??", "del": "-", "ins": "+", "sup": "^", "sub": "~"}

        for tag, replacement in tag_map.items():
            output = re.sub(
                rf"<{tag}>(.*?)<\/{tag}>", rf"{replacement}\1{replacement}", output
            )

        # Colored text
        output = re.sub(
            r"<span style=\"color:(#[^\"]+)\">([\s\S]*?)</span>",
            r"{color:\1}\2{color}",
            output,
            flags=re.MULTILINE,
        )

        # Strikethrough
        output = re.sub(r"~~(.*?)~~", r"-\1-", output)

        # Images without alt text
        output = re.sub(r"!\[\]\(([^)\n\s]+)\)", r"!\1!", output)

        # Images with alt text
        output = re.sub(r"!\[([^\]\n]+)\]\(([^)\n\s]+)\)", r"!\2|alt=\1!", output)

        # Links
        output = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"[\1|\2]", output)
        output = re.sub(r"<([^>]+)>", r"[\1]", output)

        # Convert markdown tables to Jira table format
        lines = output.split("\n")
        i = 0
        while i < len(lines):
            if i < len(lines) - 1 and re.match(r"\|[-\s|]+\|", lines[i + 1]):
                # The row above a separator row is the header row
                lines[i] = lines[i].replace("|", "||")
                # Jira has no separator row
                lines.pop(i + 1)
            i += 1

        output = "\n".join(lines)

        # Put the converted code back now that nothing else can modify it
        return re.sub(r"\x00CODE(\d+)\x00", restore, output)

    def _convert_jira_list_to_markdown(self, match: re.Match) -> str:
        """
        Helper method to convert Jira lists to Markdown format.

        Args:
            match: Regex match object whose group 1 is the run of Jira list
                symbols and whose group 2 is the item text

        Returns:
            Markdown-formatted list item
        """
        jira_bullets = match.group(1)
        content = match.group(2)

        # One symbol is top level; each further symbol indents by two spaces
        indent = " " * ((len(jira_bullets) - 1) * 2)

        # The last symbol decides the list type: "#" is numbered, all else bulleted
        prefix = "1." if jira_bullets[-1] == "#" else "-"

        return f"{indent}{prefix} {content}"
def markdown_to_confluence_storage(markdown_content: str) -> str:
    """
    Convert Markdown content to Confluence storage format (XHTML)

    The Markdown is rendered to HTML, parsed into an element tree, and
    transformed by ``ConfluenceStorageFormatConverter``. If any step fails,
    the error is logged and the plain HTML rendering wrapped in ``<p>`` is
    returned instead.

    Args:
        markdown_content: Markdown text to convert

    Returns:
        Confluence storage format (XHTML) string

    Raises:
        Exception: Only if the fallback HTML rendering itself fails; errors
            from the main conversion path are logged and not propagated
    """
    try:
        # First convert markdown to HTML
        html_content = markdown_to_html(markdown_content)

        # Temporary directory for any potential attachments. The context
        # manager removes it on every exit path, and cleanup failures are
        # ignored rather than masking the conversion result.
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
            # Parse the HTML into an element tree
            root = elements_from_string(html_content)

            options = ConfluenceConverterOptions(
                ignore_invalid_url=True, heading_anchors=True, render_mermaid=False
            )

            converter = ConfluenceStorageFormatConverter(
                options=options,
                path=Path(temp_dir) / "temp.md",
                root_dir=Path(temp_dir),
                page_metadata={},
            )

            # Transform the tree to Confluence storage format in place
            converter.visit(root)

            # Convert the element tree back to a string
            return str(elements_to_string(root))

    except Exception as e:
        logger.error(f"Error converting markdown to Confluence storage format: {e}")
        logger.exception(e)

        # Fall back to a simpler method if the conversion fails
        html_content = markdown_to_html(markdown_content)

        # Use a different approach that doesn't rely on the HTML macro
        storage_format = f"""<p>{html_content}</p>"""

        return str(storage_format)