Coverage for src/mcp_atlassian/preprocessing.py: 73%

239 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-10 03:26 +0900

1import logging 

2import re 

3import tempfile 

4import warnings 

5from pathlib import Path 

6from typing import Any, Protocol 

7 

8import requests 

9from bs4 import BeautifulSoup, Tag 

10from markdownify import markdownify as md 

11from md2conf.converter import ( 

12 ConfluenceConverterOptions, 

13 ConfluenceStorageFormatConverter, 

14 elements_from_string, 

15 elements_to_string, 

16 markdown_to_html, 

17) 

18 

19logger = logging.getLogger("mcp-atlassian") 

20 

21 

class ConfluenceClient(Protocol):
    """Structural interface for the Confluence client used for user lookups.

    Any object exposing a compatible ``get_user_details_by_accountid`` method
    satisfies this protocol; no inheritance is required.
    """

    def get_user_details_by_accountid(self, account_id: str) -> dict[str, Any]:
        """
        Get user details by account ID.

        Args:
            account_id: The account ID of the user to look up

        Returns:
            Dictionary of user details
        """

28 

29 

class TextPreprocessor:
    """Handles text preprocessing for Confluence and Jira content.

    Attributes:
        base_url: Base URL of the Confluence or Jira site with any trailing
            "/" removed; used to build canonical issue links.
        confluence_client: Optional client used to resolve account IDs to
            display names. When it is None, user mentions are rendered with
            the ``@user_<account_id>`` fallback.
    """

    def __init__(
        self, base_url: str, confluence_client: "ConfluenceClient | None" = None
    ) -> None:
        """
        Initialize the text preprocessor.

        Args:
            base_url: Base URL for Confluence or Jira
            confluence_client: Optional Confluence client for user lookups
        """
        self.base_url = base_url.rstrip("/")
        self.confluence_client = confluence_client

    def process_html_content(
        self, html_content: str, space_key: str = ""
    ) -> tuple[str, str]:
        """
        Process HTML content to replace user refs and page links.

        Args:
            html_content: The HTML content to process
            space_key: Optional space key for context (currently unused)

        Returns:
            Tuple of (processed_html, processed_markdown)

        Raises:
            Exception: Any error raised while parsing or converting is logged
                and re-raised unchanged.
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")

            # Mentions are rewritten in place on the parsed tree
            self._process_user_mentions_in_soup(soup)

            processed_html = str(soup)
            processed_markdown = md(processed_html)

            return processed_html, processed_markdown

        except Exception as e:
            logger.error(f"Error in process_html_content: {str(e)}")
            raise

    def _process_user_mentions_in_soup(self, soup: "BeautifulSoup") -> None:
        """
        Process user mentions in BeautifulSoup object.

        Every ``ac:link`` element that wraps an ``ri:user`` reference carrying
        a string ``ri:account-id`` is replaced with a textual mention.

        Args:
            soup: BeautifulSoup object containing HTML (modified in place)
        """
        for user_element in soup.find_all("ac:link"):
            user_ref = user_element.find("ri:user")
            if not user_ref:
                continue

            account_id = user_ref.get("ri:account-id")
            # Only a non-empty string ID can be resolved; anything else is
            # left untouched.
            if account_id and isinstance(account_id, str):
                self._replace_user_mention(user_element, account_id)

    def _replace_user_mention(self, user_element: "Tag", account_id: str) -> None:
        """
        Replace a user mention with the user's display name.

        Falls back to ``@user_<account_id>`` when no client is configured, the
        lookup returns no display name, or the lookup fails. Failures are
        logged as warnings and never propagated.

        Args:
            user_element: The HTML element containing the user mention
            account_id: The user's account ID
        """
        try:
            # Only attempt to get user details if we have a valid confluence client
            if self.confluence_client is not None:
                user_details = self.confluence_client.get_user_details_by_accountid(
                    account_id
                )
                display_name = user_details.get("displayName", "")
                if display_name:
                    user_element.replace_with(f"@{display_name}")
                    return
            # No client, or no usable display name: use the fallback
            self._use_fallback_user_mention(user_element, account_id)
        except KeyError as e:
            logger.warning(f"Missing key in user details for {account_id}: {str(e)}")
            self._use_fallback_user_mention(user_element, account_id)
        except (AttributeError, TypeError) as e:
            logger.warning(f"Error parsing user data for {account_id}: {str(e)}")
            self._use_fallback_user_mention(user_element, account_id)
        except requests.RequestException as e:
            logger.warning(
                f"Network error fetching user details for {account_id}: {str(e)}"
            )
            self._use_fallback_user_mention(user_element, account_id)
        except Exception as e:  # noqa: BLE001 - Intentional fallback with logging
            logger.warning(f"Unexpected error processing user mention: {str(e)}")
            logger.debug("Full exception details for user mention:", exc_info=True)
            self._use_fallback_user_mention(user_element, account_id)

    def _use_fallback_user_mention(self, user_element: "Tag", account_id: str) -> None:
        """
        Replace user mention with a fallback when the API call fails.

        Args:
            user_element: The HTML element containing the user mention
            account_id: The user's account ID
        """
        # Fallback: just use the account ID
        user_element.replace_with(f"@user_{account_id}")

    def clean_jira_text(self, text: str) -> str:
        """
        Clean Jira text content by:
        1. Processing user mentions and links
        2. Converting Jira markup to markdown
        3. Converting HTML/wiki markup to markdown

        Args:
            text: Raw Jira text; a falsy value yields ""

        Returns:
            The cleaned text with surrounding whitespace stripped
        """
        if not text:
            return ""

        # Process user mentions
        mention_pattern = r"\[~accountid:(.*?)\]"
        text = self._process_mentions(text, mention_pattern)

        # Smart links must be handled before the generic Jira link conversion
        text = self._process_smart_links(text)

        # First convert any Jira markup to Markdown
        text = self.jira_to_markdown(text)

        # Then convert any remaining HTML to markdown
        text = self._convert_html_to_markdown(text)

        return text.strip()

    def _process_mentions(self, text: str, pattern: str) -> str:
        """
        Process user mentions in text.

        Args:
            text: The text containing mentions
            pattern: Regular expression pattern to match mentions; its capture
                group is taken as the account ID

        Returns:
            Text with each ``[~accountid:<id>]`` replaced by ``User:<id>``
        """
        # dict.fromkeys de-duplicates while keeping first-seen order; one
        # str.replace already rewrites every occurrence of a given ID.
        for account_id in dict.fromkeys(re.findall(pattern, text)):
            try:
                # Note: This is a placeholder - actual user fetching should be injected
                display_name = f"User:{account_id}"
                text = text.replace(f"[~accountid:{account_id}]", display_name)
            except Exception as e:  # noqa: BLE001 - Intentional fallback with logging
                logger.error(
                    f"Unexpected error processing mention for {account_id}: {str(e)}"
                )
                logger.debug(
                    "Full exception details for mention processing:", exc_info=True
                )
        return text

    def _process_smart_links(self, text: str) -> str:
        """
        Process Jira/Confluence smart links.

        Rewrites ``[text|url|smart-link]`` as a Markdown link:
        - Jira issue URLs become ``[KEY](<base_url>/browse/KEY)``
        - Confluence page URLs use the page title from the URL as link text
        - any other URL keeps its text and drops the query string

        Args:
            text: Text that may contain smart links

        Returns:
            Text with smart links replaced by Markdown links
        """
        # Neither group may contain brackets or newlines, so a match cannot
        # start at an earlier, ordinary "[text|url]" link and swallow it.
        link_pattern = r"\[([^\[\]|\n]*)\|([^\[\]\n]*?)\|smart-link\]"

        def to_markdown_link(match: re.Match) -> str:
            link_text = match.group(1)
            link_url = match.group(2)

            # Extract issue key if it's a Jira issue link
            issue_key_match = re.search(r"browse/([A-Z]+-\d+)", link_url)
            if issue_key_match:
                issue_key = issue_key_match.group(1)
                clean_url = f"{self.base_url}/browse/{issue_key}"
                return f"[{issue_key}]({clean_url})"

            # Check if it's a Confluence wiki link
            confluence_match = re.search(
                r"wiki/spaces/.+?/pages/\d+/(.+?)(?:\?|$)", link_url
            )
            if confluence_match:
                readable_title = confluence_match.group(1).replace("+", " ")
                # Drop a leading issue key such as "PROJ-123 " from the title
                readable_title = re.sub(r"^[A-Z]+-\d+\s+", "", readable_title)
                return f"[{readable_title}]({link_url})"

            clean_url = link_url.split("?")[0]
            return f"[{link_text}]({clean_url})"

        return re.sub(link_pattern, to_markdown_link, text)

    def _convert_html_to_markdown(self, text: str) -> str:
        """
        Convert HTML content to markdown if needed.

        Text without anything that looks like a tag is returned unchanged. On
        any conversion error a warning is logged and the input text is
        returned as-is.
        """
        if re.search(r"<[^>]+>", text):
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    # Wrap in a div so fragments parse under a single root
                    soup = BeautifulSoup(f"<div>{text}</div>", "html.parser")
                    html = str(soup.div.decode_contents()) if soup.div else text
                    text = md(html)
            except (AttributeError, TypeError) as e:
                # Handle parsing errors in BeautifulSoup
                logger.warning(f"HTML parsing error during conversion to markdown: {e}")
            except ImportError as e:
                # Handle missing dependencies
                logger.warning(
                    f"Missing dependency for HTML to markdown conversion: {e}"
                )
            except (ValueError, NameError) as e:
                # Handle value or name errors
                logger.warning(
                    f"Error in values during HTML to markdown conversion: {e}"
                )
            except Exception as e:  # noqa: BLE001 - Intentional fallback with logging
                # Handle other unexpected errors
                logger.warning(f"Unexpected error converting HTML to markdown: {e}")
                logger.debug(
                    "Full exception details for HTML conversion:", exc_info=True
                )
        return text

    def jira_to_markdown(self, input_text: str) -> str:
        """
        Convert Jira markup to Markdown format.

        The conversion is a fixed sequence of regex substitutions, so the
        order of the steps below matters.

        Args:
            input_text: Text in Jira markup format

        Returns:
            Text in Markdown format ("" for falsy input)
        """
        if not input_text:
            return ""

        def emphasis(match: re.Match) -> str:
            # Jira *x* is bold (Markdown **x**); Jira _x_ is italic (Markdown *x*)
            marker = "**" if match.group(1) == "*" else "*"
            return marker + match.group(2) + marker

        def quote_block(match: re.Match) -> str:
            return "\n".join(f"> {line}" for line in match.group(1).split("\n"))

        # Block quotes
        output = re.sub(r"^bq\.(.*?)$", r"> \1\n", input_text, flags=re.MULTILINE)

        # Multi-level lists. These run before text formatting so that nested
        # "**" / "##" bullet prefixes are not mistaken for bold markers.
        output = re.sub(
            r"^((?:#|-|\+|\*)+) (.*)$",
            self._convert_jira_list_to_markdown,
            output,
            flags=re.MULTILINE,
        )

        # Text formatting (bold, italic)
        output = re.sub(r"([*_])(.*?)\1", emphasis, output)

        # Headers
        output = re.sub(
            r"^h([0-6])\.(.*)$",
            lambda match: "#" * int(match.group(1)) + match.group(2),
            output,
            flags=re.MULTILINE,
        )

        # Inline code
        output = re.sub(r"\{\{([^}]+)\}\}", r"`\1`", output)

        # Citation
        output = re.sub(r"\?\?((?:.[^?]|[^?].)+)\?\?", r"<cite>\1</cite>", output)

        # Inserted text
        output = re.sub(r"\+([^+]*)\+", r"<ins>\1</ins>", output)

        # Superscript
        output = re.sub(r"\^([^^]*)\^", r"<sup>\1</sup>", output)

        # Subscript
        output = re.sub(r"~([^~]*)~", r"<sub>\1</sub>", output)

        # Strikethrough: Jira -text- becomes Markdown ~~text~~. The
        # lookarounds require the dashes to hug the text and not touch a word
        # character, so "PROJ-123", "well-known" and "a - b - c" are untouched.
        output = re.sub(
            r"(?<![\w-])-([^\s-](?:[^-\n]*[^\s-])?)-(?![\w-])", r"~~\1~~", output
        )

        # Code blocks with optional language specification
        output = re.sub(
            r"\{code(?::([a-z]+))?\}([\s\S]*?)\{code\}",
            r"```\1\n\2\n```",
            output,
            flags=re.MULTILINE,
        )

        # No format
        output = re.sub(r"\{noformat\}([\s\S]*?)\{noformat\}", r"```\n\1\n```", output)

        # Quote blocks (non-greedy so each {quote}...{quote} pair is converted
        # on its own instead of merging everything up to the last marker)
        output = re.sub(
            r"\{quote\}([\s\S]*?)\{quote\}",
            quote_block,
            output,
            flags=re.MULTILINE,
        )

        # Images with alt text
        output = re.sub(
            r"!([^|\n\s]+)\|([^\n!]*)alt=([^\n!\,]+?)(,([^\n!]*))?!",
            r"![\3](\1)",
            output,
        )

        # Images with other parameters (ignore them)
        output = re.sub(r"!([^|\n\s]+)\|([^\n!]*)!", r"![](\1)", output)

        # Images without parameters
        output = re.sub(r"!([^\n\s!]+)!", r"![](\1)", output)

        # Links
        output = re.sub(r"\[([^|]+)\|(.+?)\]", r"[\1](\2)", output)
        output = re.sub(r"\[(.+?)\]([^\(]+)", r"<\1>\2", output)

        # Colored text. The template is single-quoted so the attribute quotes
        # need no backslashes; re.sub would copy a \" escape through verbatim.
        output = re.sub(
            r"\{color:([^}]+)\}([\s\S]*?)\{color\}",
            r'<span style="color:\1">\2</span>',
            output,
            flags=re.MULTILINE,
        )

        # Convert Jira table headers (||) to markdown table format
        converted_lines = []
        for line in output.split("\n"):
            if "||" not in line:
                converted_lines.append(line)
                continue

            header_line = line.replace("||", "|")
            converted_lines.append(header_line)

            # Markdown needs a separator row directly below the header row
            header_cells = header_line.count("|") - 1
            if header_cells > 0:
                converted_lines.append("|" + "---|" * header_cells)

        return "\n".join(converted_lines)

    def markdown_to_jira(self, input_text: str) -> str:
        """
        Convert Markdown syntax to Jira markup syntax.

        Fenced code blocks and inline code are converted first, replaced by
        placeholders while the remaining rules run, and restored at the end
        so their contents are never rewritten as headers, lists or emphasis.

        Args:
            input_text: Text in Markdown format

        Returns:
            Text in Jira markup format ("" for falsy input)
        """
        if not input_text:
            return ""

        # Converted code sections, indexed by the number in their placeholder
        code_blocks: list[str] = []
        inline_codes: list[str] = []

        def save_code_block(match: re.Match) -> str:
            """Convert a fenced block to {code} and return its placeholder."""
            syntax = match.group(1) or ""
            opening = "{code:" + syntax + "}" if syntax else "{code}"
            code_blocks.append(opening + match.group(2) + "{code}")
            # NUL-delimited token: contains nothing the later rules match on
            return f"\x00B{len(code_blocks) - 1}\x00"

        def save_inline_code(match: re.Match) -> str:
            """Convert inline code to {{...}} and return its placeholder."""
            inline_codes.append("{{" + match.group(1) + "}}")
            return f"\x00I{len(inline_codes) - 1}\x00"

        def restore_from(saved: list[str]):
            """Build a re.sub callback that swaps placeholders for saved text."""

            def restore(match: re.Match) -> str:
                index = int(match.group(1))
                # Leave look-alike tokens from the input itself untouched
                return saved[index] if index < len(saved) else match.group(0)

            return restore

        # Save code sections temporarily
        output = re.sub(r"```(\w*)\n([\s\S]+?)```", save_code_block, input_text)
        output = re.sub(r"`([^`]+)`", save_inline_code, output)

        # Headers with = or - underlines
        output = re.sub(
            r"^(.*?)\n([=-])+$",
            lambda match: f"h{1 if match.group(2)[0] == '=' else 2}. {match.group(1)}",
            output,
            flags=re.MULTILINE,
        )

        # Headers with # prefix
        output = re.sub(
            r"^([#]+)(.*?)$",
            lambda match: f"h{len(match.group(1))}." + match.group(2),
            output,
            flags=re.MULTILINE,
        )

        # Bold and italic: a single marker is italic (_), longer runs are bold (*)
        output = re.sub(
            r"([*_]+)(.*?)\1",
            lambda match: ("_" if len(match.group(1)) == 1 else "*")
            + match.group(2)
            + ("_" if len(match.group(1)) == 1 else "*"),
            output,
        )

        # Multi-level bulleted list
        output = re.sub(
            r"^(\s*)- (.*)$",
            lambda match: "* " + match.group(2)
            if not match.group(1)
            else " " * (len(match.group(1)) // 2) + "* " + match.group(2),
            output,
            flags=re.MULTILINE,
        )

        # Multi-level numbered list
        output = re.sub(
            r"^(\s+)1\. (.*)$",
            lambda match: "#" * (int(len(match.group(1)) / 4) + 2)
            + " "
            + match.group(2),
            output,
            flags=re.MULTILINE,
        )

        # HTML formatting tags to Jira markup
        tag_map = {"cite": "??", "del": "-", "ins": "+", "sup": "^", "sub": "~"}

        for tag, replacement in tag_map.items():
            output = re.sub(
                rf"<{tag}>(.*?)<\/{tag}>", rf"{replacement}\1{replacement}", output
            )

        # Colored text
        output = re.sub(
            r"<span style=\"color:(#[^\"]+)\">([\s\S]*?)</span>",
            r"{color:\1}\2{color}",
            output,
            flags=re.MULTILINE,
        )

        # Strikethrough
        output = re.sub(r"~~(.*?)~~", r"-\1-", output)

        # Images without alt text
        output = re.sub(r"!\[\]\(([^)\n\s]+)\)", r"!\1!", output)

        # Images with alt text
        output = re.sub(r"!\[([^\]\n]+)\]\(([^)\n\s]+)\)", r"!\2|alt=\1!", output)

        # Links
        output = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"[\1|\2]", output)
        output = re.sub(r"<([^>]+)>", r"[\1]", output)

        # Convert markdown tables to Jira table format
        lines = output.split("\n")
        i = 0
        while i < len(lines):
            if i < len(lines) - 1 and re.match(r"\|[-\s|]+\|", lines[i + 1]):
                # Convert header row to Jira format
                lines[i] = lines[i].replace("|", "||")
                # Remove the separator line
                lines.pop(i + 1)
            i += 1

        output = "\n".join(lines)

        # Restore code last. Inline code goes first because an inline span
        # may wrap a block placeholder, never the other way round.
        output = re.sub(r"\x00I(\d+)\x00", restore_from(inline_codes), output)
        output = re.sub(r"\x00B(\d+)\x00", restore_from(code_blocks), output)

        return output

    def _convert_jira_list_to_markdown(self, match: re.Match) -> str:
        """
        Helper method to convert Jira lists to Markdown format.

        Args:
            match: Regex match object containing the Jira list markup; group 1
                is the run of bullet characters, group 2 the item text

        Returns:
            Markdown-formatted list item
        """
        jira_bullets = match.group(1)
        content = match.group(2)

        # One extra bullet character means one more level: two spaces each
        indent = " " * ((len(jira_bullets) - 1) * 2)

        # The last bullet character decides the list type: "#" is numbered
        prefix = "1." if jira_bullets[-1] == "#" else "-"

        return f"{indent}{prefix} {content}"

572 

573 

def markdown_to_confluence_storage(markdown_content: str) -> str:
    """
    Convert Markdown content to Confluence storage format (XHTML)

    The Markdown is rendered to HTML, parsed into an element tree, and
    rewritten in place by ``ConfluenceStorageFormatConverter``. If any step
    fails, the error is logged and the plain HTML rendering wrapped in a
    ``<p>`` element is returned instead.

    Args:
        markdown_content: Markdown text to convert

    Returns:
        Confluence storage format (XHTML) string
    """
    try:
        # First convert markdown to HTML
        html_content = markdown_to_html(markdown_content)

        # The converter is given a path and root directory; use a scratch
        # directory that is removed on exit (cleanup errors are ignored).
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as temp_dir:
            scratch_root = Path(temp_dir)

            # Parse the HTML into an element tree
            root = elements_from_string(html_content)

            options = ConfluenceConverterOptions(
                ignore_invalid_url=True, heading_anchors=True, render_mermaid=False
            )
            converter = ConfluenceStorageFormatConverter(
                options=options,
                path=scratch_root / "temp.md",
                root_dir=scratch_root,
                page_metadata={},
            )

            # Transform the HTML to Confluence storage format (mutates root)
            converter.visit(root)

            # Convert the element tree back to a string
            return str(elements_to_string(root))

    except Exception as e:
        logger.error(f"Error converting markdown to Confluence storage format: {e}")
        logger.exception(e)

        # Fall back to a simpler method if the conversion fails.
        # NOTE(review): if markdown_to_html itself was the failing step, this
        # second call raises again and the error propagates to the caller.
        html_content = markdown_to_html(markdown_content)

        # Use a different approach that doesn't rely on the HTML macro
        storage_format = f"""<p>{html_content}</p>"""

        return str(storage_format)

631 

632 return str(storage_format)