Coverage for src/mcp_atlassian/preprocessing.py: 87%

83 statements  


import logging
import re
import warnings

from bs4 import BeautifulSoup
from markdownify import markdownify as md

logger = logging.getLogger("mcp-atlassian")


class TextPreprocessor:
    """Handles text preprocessing for Confluence and Jira content."""

    def __init__(self, base_url: str, confluence_client=None):
        self.base_url = base_url.rstrip("/")
        self.confluence_client = confluence_client

    def process_html_content(self, html_content: str, space_key: str = "") -> tuple[str, str]:
        """Replace user references with readable @mentions and return (html, markdown)."""
        try:
            soup = BeautifulSoup(html_content, "html.parser")

            # Process user mentions
            user_mentions = soup.find_all("ri:user")
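            # Confluence storage format nests mentions as (illustrative snippet):
            #   <ac:link><ri:user ri:account-id="..."/></ac:link>
            # which is why the replacement below targets the parent <ac:link> tag.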

            for user in user_mentions:
                account_id = user.get("ri:account-id")
                if account_id and self.confluence_client:
                    try:
                        # Fetch user info using the Confluence API
                        user_info = self.confluence_client.get_user_details_by_accountid(account_id)
                        display_name = user_info.get("displayName", account_id)

                        # Replace the entire ac:link structure with @mention
                        link_tag = user.find_parent("ac:link")
                        if link_tag:
                            link_tag.replace_with(f"@{display_name}")
                    except Exception as e:
                        logger.warning(f"Could not fetch user info for {account_id}: {e}")
                        # Fallback: just use the account ID
                        link_tag = user.find_parent("ac:link")
                        if link_tag:
                            link_tag.replace_with(f"@user_{account_id}")

            processed_html = str(soup)
            processed_markdown = md(processed_html)

            return processed_html, processed_markdown

        except Exception as e:
            logger.error(f"Error in process_html_content: {str(e)}")
            raise

    def clean_jira_text(self, text: str) -> str:
        """
        Clean Jira text content by:
        1. Processing user mentions and links
        2. Converting HTML/wiki markup to markdown
        """
        if not text:
            return ""

        # Process user mentions
        mention_pattern = r"\[~accountid:(.*?)\]"
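        # Illustrative example (hypothetical account id):
        #   "[~accountid:5b10ac8d82e05b22cc7d4ef5]" -> "User:5b10ac8d82e05b22cc7d4ef5"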

        text = self._process_mentions(text, mention_pattern)

        # Process Jira smart links
        text = self._process_smart_links(text)

        # Convert HTML to markdown if needed
        text = self._convert_html_to_markdown(text)

        return text.strip()

    def _process_mentions(self, text: str, pattern: str) -> str:
        """Process user mentions in text."""
        mentions = re.findall(pattern, text)
        for account_id in mentions:
            try:
                # Note: This is a placeholder - actual user fetching should be injected
                display_name = f"User:{account_id}"
                text = text.replace(f"[~accountid:{account_id}]", display_name)
            except Exception as e:
                logger.error(f"Error getting user info for {account_id}: {str(e)}")
        return text

    def _process_smart_links(self, text: str) -> str:
        """Process Jira/Confluence smart links."""
        # Pattern matches: [text|url|smart-link]
        link_pattern = r"\[(.*?)\|(.*?)\|smart-link\]"
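        # Illustrative example (hypothetical URL):
        #   "[Fix login|https://example.atlassian.net/browse/PROJ-123|smart-link]"
        #   -> "[PROJ-123]({base_url}/browse/PROJ-123)"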

        matches = re.finditer(link_pattern, text)

        for match in matches:
            full_match = match.group(0)
            link_text = match.group(1)
            link_url = match.group(2)

            # Extract issue key if it's a Jira issue link
            issue_key_match = re.search(r"browse/([A-Z]+-\d+)", link_url)
            # Check if it's a Confluence wiki link
            confluence_match = re.search(r"wiki/spaces/.+?/pages/\d+/(.+?)(?:\?|$)", link_url)

            if issue_key_match:
                issue_key = issue_key_match.group(1)
                clean_url = f"{self.base_url}/browse/{issue_key}"
                text = text.replace(full_match, f"[{issue_key}]({clean_url})")
            elif confluence_match:
                url_title = confluence_match.group(1)
                readable_title = url_title.replace("+", " ")
                readable_title = re.sub(r"^[A-Z]+-\d+\s+", "", readable_title)
                text = text.replace(full_match, f"[{readable_title}]({link_url})")
            else:
                clean_url = link_url.split("?")[0]
                text = text.replace(full_match, f"[{link_text}]({clean_url})")

        return text

    def _convert_html_to_markdown(self, text: str) -> str:
        """Convert HTML content to markdown if needed."""
        if re.search(r"<[^>]+>", text):
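            # Heuristic: any angle-bracket tag triggers conversion; plain text
            # passes through unchanged.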

            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    soup = BeautifulSoup(f"<div>{text}</div>", "html.parser")
                    html = str(soup.div.decode_contents()) if soup.div else text
                    text = md(html)
            except Exception as e:
                logger.warning(f"Error converting HTML to markdown: {e}")
        return text
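
A minimal usage sketch (the base URL, account id, and issue key below are hypothetical; any Confluence client object exposing get_user_details_by_accountid, as called above, can be injected):

preprocessor = TextPreprocessor(base_url="https://example.atlassian.net")

# Jira wiki markup: mentions and smart links become markdown
cleaned = preprocessor.clean_jira_text(
    "[~accountid:abc123] see [Fix login|https://example.atlassian.net/browse/PROJ-1|smart-link]"
)
# -> "User:abc123 see [PROJ-1](https://example.atlassian.net/browse/PROJ-1)"

# Confluence storage-format HTML: returns (processed_html, processed_markdown)
html, markdown = preprocessor.process_html_content("<p>Hello <b>world</b></p>")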