Coverage for src/mcp_atlassian/confluence.py: 100%

77 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-02-22 16:34 +0900

1import logging 

2import os 

3 

4from atlassian import Confluence 

5 

6from .config import ConfluenceConfig 

7from .document_types import Document 

8from .preprocessing import TextPreprocessor 

9 

# Package-wide logger; every module logs under the shared "mcp-atlassian" name
logger = logging.getLogger(name="mcp-atlassian")

12 

13 

class ConfluenceFetcher:
    """Handles fetching and parsing content from Confluence.

    Wraps an ``atlassian.Confluence`` client (created with ``cloud=True``)
    that is configured from environment variables, and converts pages,
    comments and search hits into ``Document`` objects. Page and comment
    bodies are passed through ``TextPreprocessor`` before being returned.
    """

    def __init__(self):
        """Create the Confluence client from environment variables.

        Reads ``CONFLUENCE_URL``, ``CONFLUENCE_USERNAME`` and
        ``CONFLUENCE_API_TOKEN``.

        Raises:
            ValueError: If any of the three variables is unset or empty.
        """
        url = os.getenv("CONFLUENCE_URL")
        username = os.getenv("CONFLUENCE_USERNAME")
        token = os.getenv("CONFLUENCE_API_TOKEN")

        if not all([url, username, token]):
            raise ValueError("Missing required Confluence environment variables")

        self.config = ConfluenceConfig(url=url, username=username, api_token=token)
        self.confluence = Confluence(
            url=self.config.url,
            username=self.config.username,
            password=self.config.api_token,  # API token is used as password
            cloud=True,
        )
        self.preprocessor = TextPreprocessor(self.config.url, self.confluence)

    def _process_html_content(self, html_content: str, space_key: str) -> tuple[str, str]:
        """Delegate to the preprocessor; returns ``(processed_html, processed_markdown)``."""
        return self.preprocessor.process_html_content(html_content, space_key)

    def _render_body(self, html_content: str, space_key: str, clean_html: bool) -> str:
        """Preprocess a body and return its markdown (``clean_html``) or HTML form."""
        processed_html, processed_markdown = self._process_html_content(html_content, space_key)
        return processed_markdown if clean_html else processed_html

    def _absolute_url(self, relative_url: str) -> str:
        """Join the configured base URL and a relative path with exactly one slash."""
        # A base URL configured with a trailing slash must not yield "//spaces/...".
        return f"{self.config.url.rstrip('/')}/{relative_url.lstrip('/')}"

    def _page_url(self, space_key: str, page_id: str) -> str:
        """Build the URL of a page from its space key and page id."""
        return self._absolute_url(f"/spaces/{space_key}/pages/{page_id}")

    def get_spaces(self, start: int = 0, limit: int = 10):
        """Get one window of available spaces.

        Args:
            start: Offset of the first space to return.
            limit: Maximum number of spaces to return.

        Returns:
            Whatever the client's ``get_all_spaces`` returns for that window.
        """
        return self.confluence.get_all_spaces(start=start, limit=limit)

    def get_page_content(self, page_id: str, clean_html: bool = True) -> Document:
        """Get content of a specific page.

        Args:
            page_id: Identifier of the page to fetch.
            clean_html: Return the markdown form when true, otherwise the
                processed HTML.

        Returns:
            A ``Document`` with the page body and metadata (ids, title,
            version, URL, space and author information, last-modified time).

        Raises:
            KeyError: If the response lacks the storage body or the title.
        """
        page = self.confluence.get_page_by_id(page_id=page_id, expand="body.storage,version,space")
        space = page.get("space", {})
        space_key = space.get("key", "")

        page_text = self._render_body(page["body"]["storage"]["value"], space_key, clean_html)

        # Author information lives on the version, not on the page itself
        version = page.get("version", {})
        author = version.get("by", {})

        metadata = {
            "page_id": page_id,
            "title": page["title"],
            "version": version.get("number"),
            "url": self._page_url(space_key, page_id),
            "space_key": space_key,
            "author_name": author.get("displayName"),
            "space_name": space.get("name", ""),
            "last_modified": version.get("when"),
        }

        return Document(page_content=page_text, metadata=metadata)

    def get_page_by_title(self, space_key: str, title: str, clean_html: bool = True) -> Document | None:
        """Get page content by space key and title.

        Args:
            space_key: Key of the space to look in.
            title: Title of the page.
            clean_html: Return the markdown form when true, otherwise the
                processed HTML.

        Returns:
            The page as a ``Document``, or ``None`` when no page is found or
            when fetching/processing fails (the failure is logged).
        """
        try:
            page = self.confluence.get_page_by_title(space=space_key, title=title, expand="body.storage,version")

            if not page:
                return None

            page_text = self._render_body(page["body"]["storage"]["value"], space_key, clean_html)

            metadata = {
                "page_id": page["id"],
                "title": page["title"],
                "version": page.get("version", {}).get("number"),
                "space_key": space_key,
                "url": self._page_url(space_key, page["id"]),
            }

            return Document(page_content=page_text, metadata=metadata)

        except Exception as e:
            # Best-effort lookup: report the failure (with traceback) and signal "not found".
            logger.exception("Error fetching page: %s", e)
            return None

    def get_space_pages(
        self, space_key: str, start: int = 0, limit: int = 10, clean_html: bool = True
    ) -> list[Document]:
        """Get one window of pages from a specific space.

        Args:
            space_key: Key of the space to list.
            start: Offset of the first page to return.
            limit: Maximum number of pages to return.
            clean_html: Return the markdown form when true, otherwise the
                processed HTML.

        Returns:
            One ``Document`` per page returned by the client, in the same order.
        """
        pages = self.confluence.get_all_pages_from_space(
            space=space_key, start=start, limit=limit, expand="body.storage"
        )

        documents = []
        for page in pages:
            page_text = self._render_body(page["body"]["storage"]["value"], space_key, clean_html)

            metadata = {
                "page_id": page["id"],
                "title": page["title"],
                "space_key": space_key,
                "version": page.get("version", {}).get("number"),
                "url": self._page_url(space_key, page["id"]),
            }

            documents.append(Document(page_content=page_text, metadata=metadata))

        return documents

    def get_page_comments(self, page_id: str, clean_html: bool = True) -> list[Document]:
        """Get the comments of a specific page.

        Args:
            page_id: Identifier of the page whose comments are fetched.
            clean_html: Return the markdown form when true, otherwise the
                processed HTML.

        Returns:
            One ``Document`` per comment; an empty list when the response
            carries no ``results``.
        """
        page = self.confluence.get_page_by_id(page_id=page_id, expand="space")
        space = page.get("space", {})
        space_key = space.get("key", "")
        space_name = space.get("name", "")

        response = self.confluence.get_page_comments(
            content_id=page_id, expand="body.view.value,version", depth="all"
        )
        # Tolerate an empty response or a missing "results" key instead of raising.
        comments = (response or {}).get("results", [])

        comment_documents = []
        for comment in comments:
            comment_text = self._render_body(comment["body"]["view"]["value"], space_key, clean_html)

            # Author information comes from version.by rather than an author field
            version = comment.get("version", {})
            author = version.get("by", {})

            metadata = {
                "page_id": page_id,
                "comment_id": comment["id"],
                "last_modified": version.get("when"),
                "type": "comment",
                "author_name": author.get("displayName"),
                "space_key": space_key,
                "space_name": space_name,
            }

            comment_documents.append(Document(page_content=comment_text, metadata=metadata))

        return comment_documents

    def search(self, cql: str, limit: int = 10) -> list[Document]:
        """Search content using Confluence Query Language (CQL).

        Args:
            cql: The CQL query string.
            limit: Maximum number of results requested from the client.

        Returns:
            One ``Document`` per result whose content type is ``"page"``, with
            the result excerpt as its text. An empty list when the search or
            result handling fails (the failure is logged).
        """
        try:
            results = self.confluence.cql(cql=cql, limit=limit)
            documents = []

            for result in results.get("results", []):
                content = result.get("content", {})
                if content.get("type") != "page":
                    continue

                metadata = {
                    "page_id": content["id"],
                    "title": result["title"],
                    "space": result.get("resultGlobalContainer", {}).get("title"),
                    "url": self._absolute_url(result["url"]),
                    "last_modified": result.get("lastModified"),
                    "type": content["type"],
                }

                # Use the excerpt as page_content since it's already a good summary
                documents.append(Document(page_content=result.get("excerpt", ""), metadata=metadata))

            return documents
        except Exception as e:
            # Best-effort search: report the failure (with traceback) and return no hits.
            logger.exception("Search failed with error: %s", e)
            return []