Coverage for src/paperap/plugins/collect_test_data.py: 51%

112 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-12 23:40 -0400

1""" 

2Usage example: 

3 test_dir = Path(__file__).parent.parent.parent.parent / "tests/sample_data" 

4 collector = TestDataCollector(client, test_dir) 

5 

6---------------------------------------------------------------------------- 

7 

8 METADATA: 

9 

10 File: collect_test_data.py 

11 Project: paperap 

12 Created: 2025-03-04 

13 Version: 0.0.5 

14 Author: Jess Mann 

15 Email: jess@jmann.me 

16 Copyright (c) 2025 Jess Mann 

17 

18---------------------------------------------------------------------------- 

19 

20 LAST MODIFIED: 

21 

22 2025-03-04 By Jess Mann 

23 

24""" 

25 

26from __future__ import annotations 

27 

28import datetime 

29import json 

30import logging 

31import re 

32from decimal import Decimal 

33from pathlib import Path 

34from typing import TYPE_CHECKING, Any, override 

35 

36from faker import Faker 

37 

38from paperap.models import StandardModel 

39from paperap.plugins.base import Plugin 

40from paperap.signals import SignalPriority, registry 

41 

42if TYPE_CHECKING: 

43 from paperap.client import PaperlessClient 

44 

logger = logging.getLogger(__name__)

# Matches every character that is NOT a letter, digit, underscore or hyphen.
# Used to turn endpoint and parameter names into safe file-name fragments.
sanitize_pattern = re.compile(r"[^a-zA-Z0-9_-]")

# Response keys whose values may contain personal information. Values stored
# under these keys are replaced with generated dummy words before being saved.
# Each key appears exactly once; membership is all that matters.
SANITIZE_KEYS = [
    "email",
    "first_name",
    "last_name",
    "name",
    "phone",
    "username",
    "content",
    "filename",
    "title",
    "slug",
    "original_file_name",
    "archived_file_name",
    "task_file_name",
]

65 

66 

class TestDataCollector(Plugin):
    """
    Plugin to collect test data from API responses.

    Signal handlers registered in ``setup`` receive response payloads, sanitize them
    (values stored under any key in ``SANITIZE_KEYS`` are replaced with generated words)
    and write them as JSON files into ``test_dir``. A file that already exists is never
    overwritten.
    """

    name = "test_data_collector"
    description = "Collects sample data from API responses for testing purposes"
    version = "0.0.2"
    # Shared generator for the dummy words that replace sensitive values.
    fake = Faker()
    # Directory that receives the collected JSON files.
    test_dir: Path

    def __init__(self, client: "PaperlessClient", test_dir: Path | str | None = None, **kwargs: Any) -> None:
        """
        Resolve and create the output directory, then run the base initializer.

        Args:
            client: Client instance handed through to ``Plugin.__init__``.
            test_dir: Directory for the collected files, as a ``Path`` or string. When
                falsy, the ``test_dir`` config entry is used, falling back to
                ``tests/sample_data``.
            **kwargs: Forwarded unchanged to ``Plugin.__init__``.

        """
        # Convert string path to Path object if needed
        if test_dir and isinstance(test_dir, str):
            test_dir = Path(test_dir)

        # NOTE(review): this runs before super().__init__(); if the base initializer is
        # what populates ``config`` (not visible from here — confirm), reading
        # ``self.config`` directly would raise AttributeError, so fall back to an empty
        # mapping instead of crashing.
        config = getattr(self, "config", None) or {}

        self.test_dir = test_dir or Path(config.get("test_dir", "tests/sample_data"))
        self.test_dir.mkdir(parents=True, exist_ok=True)
        super().__init__(client, **kwargs)

    @override
    def setup(self) -> None:
        """Register signal handlers."""
        registry.connect("resource._handle_response:after", self.save_list_response, SignalPriority.LOW)
        registry.connect("resource._handle_results:before", self.save_first_item, SignalPriority.LOW)
        registry.connect("client.request:after", self.save_parsed_response, SignalPriority.LOW)

    @override
    def teardown(self) -> None:
        """Unregister signal handlers."""
        registry.disconnect("resource._handle_response:after", self.save_list_response)
        registry.disconnect("resource._handle_results:before", self.save_first_item)
        registry.disconnect("client.request:after", self.save_parsed_response)

    @staticmethod
    def _json_serializer(obj: Any) -> Any:
        """
        Serialize objects that are not natively serializable.

        Passed as the ``default`` hook when encoding JSON.

        Args:
            obj: The value the JSON encoder could not handle on its own.

        Returns:
            A JSON-compatible representation of ``obj``.

        Raises:
            TypeError: If ``obj`` is of an unsupported type.

        """
        # datetime.datetime subclasses datetime.date, so this covers datetimes as well.
        if isinstance(obj, (datetime.date, datetime.time)):
            return obj.isoformat()
        if isinstance(obj, Path):
            return str(obj)
        if isinstance(obj, Decimal):
            return float(obj)
        if isinstance(obj, StandardModel):
            return obj.to_dict()
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        if isinstance(obj, bytes):
            return obj.decode("utf-8")
        raise TypeError(f"Type {type(obj).__name__} is not JSON serializable")

    def _sanitize_response(self, **response: Any) -> dict[str, Any]:
        """
        Sanitize the response data to replace any strings with potentially personal information with dummy data

        Args:
            **response: The response payload, expanded into keyword arguments.

        Returns:
            A new dict with the same top-level keys and sanitized values.

        """
        sanitized = {key: self._sanitize_value_recursive(key, value) for key, value in response.items()}

        # Replace the domain of pagination links using regex so the real host is not
        # written to disk. "previous" is handled like "next"
        # (presumably both appear in paginated responses — verify against the API).
        for link_key in ("next", "previous"):
            link = response.get(link_key)
            if isinstance(link, str):
                sanitized[link_key] = re.sub(r"https?://.*?/", "https://example.com/", link)

        return sanitized

    def _sanitize_value_recursive(self, key: str, value: Any) -> Any:
        """
        Recursively sanitize the value to replace any strings with potentially personal information with dummy data

        Args:
            key: The key under which ``value`` was found.
            value: The value to sanitize.

        Returns:
            The sanitized value; values that need no sanitizing are returned unchanged.

        """
        if isinstance(value, dict):
            return {k: self._sanitize_value_recursive(k, v) for k, v in value.items()}

        if key in SANITIZE_KEYS:
            if isinstance(value, str):
                return self.fake.word()
            if isinstance(value, list):
                # Every element of a sensitive list is replaced, whatever its type.
                return [self.fake.word() for _ in value]
        elif isinstance(value, list):
            # Descend into lists so dicts nested inside them are sanitized too;
            # without this, a list of records would be saved verbatim.
            return [self._sanitize_value_recursive(key, element) for element in value]

        return value

    def save_response(self, filepath: Path, response: dict[str, Any], **kwargs: Any) -> None:
        """
        Save the response to a JSON file.

        Does nothing when ``filepath`` already exists. Failures are logged, never raised.

        Args:
            filepath: Destination file.
            response: Payload to sanitize and write.
            **kwargs: Accepted and ignored.

        """
        if filepath.exists():
            return

        try:
            sanitized = self._sanitize_response(**response)
            # Encode fully in memory before touching the file: a serialization error
            # must not leave a truncated file behind, because the exists() guard above
            # would then keep that broken file forever.
            payload = json.dumps(
                sanitized,
                indent=4,
                sort_keys=True,
                ensure_ascii=False,
                default=self._json_serializer,
            )
            # ensure_ascii=False emits raw non-ASCII characters, so fix the encoding
            # rather than depending on the platform default.
            filepath.write_text(payload, encoding="utf-8")
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Don't allow the plugin to interfere with normal operations in the event of failure.
            # ValueError also covers UnicodeDecodeError (undecodable bytes) and circular references.
            logger.error("Error saving response to file: %s", e)

    def save_list_response(self, sender: Any, response: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
        """
        Save the list response to a JSON file.

        Args:
            sender: The signal sender (unused).
            response: The payload to save.
            **kwargs: Signal keyword arguments; ``resource`` supplies the file-name prefix.

        Returns:
            ``response``, unchanged.

        """
        if not response or not (resource_name := kwargs.get("resource")):
            return response

        filepath = self.test_dir / f"{resource_name}_list.json"
        self.save_response(filepath, response)

        return response

    def save_first_item(self, sender: Any, item: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
        """
        Save the first item from a list to a JSON file.

        Args:
            sender: The signal sender (unused).
            item: The payload to save.
            **kwargs: Signal keyword arguments; ``resource`` supplies the file-name prefix.

        Returns:
            ``item``, unchanged.

        """
        resource_name = kwargs.get("resource")
        if not resource_name:
            return item

        filepath = self.test_dir / f"{resource_name}_item.json"
        self.save_response(filepath, item)

        # Disable this handler after saving the first item
        registry.disable("resource._handle_results:before", self.save_first_item)

        return item

    def save_parsed_response(
        self,
        parsed_response: dict[str, Any],
        method: str,
        params: dict[str, Any] | None,
        json_response: bool,
        endpoint: str,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """
        Save the request data to a JSON file.

        Connects to client.request:after signal.

        Nothing is saved unless ``json_response`` is truthy and ``params`` is non-empty.

        Args:
            parsed_response: The payload to save.
            method: HTTP method name; anything other than GET is prefixed to the file name.
            params: Request parameters; only their names are used, for the file name.
            json_response: Whether the response is JSON.
            endpoint: The requested endpoint; its last two ``/``-separated parts name the file.
            **kwargs: Accepted and ignored.

        Returns:
            ``parsed_response``, unchanged.

        """
        if not json_response or not params:
            return parsed_response

        # Reduce the url to its last two "/"-separated parts (a trailing slash
        # yields an empty final part), then make the result file-name safe.
        resource_name = ".".join(endpoint.split("/")[-2:])
        resource_name = sanitize_pattern.sub("_", resource_name)

        # Coerce keys to str so an unexpected key type cannot raise from this handler.
        params_str = "|".join(map(str, params))
        params_str = sanitize_pattern.sub("_", params_str)

        verb = method.lower()
        filename_prefix = "" if verb == "get" else f"{verb}__"
        filename = f"{filename_prefix}{resource_name}__{params_str}.json"

        filepath = self.test_dir / filename
        self.save_response(filepath, parsed_response)

        return parsed_response

    @override
    @classmethod
    def get_config_schema(cls) -> dict[str, Any]:
        """Define the configuration schema for this plugin."""
        return {
            "test_dir": {
                "type": "string",
                "description": "Directory to save test data files",
                "required": False,
            }
        }