Coverage for src/paperap/plugins/collect_test_data.py: 98%

124 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-20 13:17 -0400

1""" 

2 

3---------------------------------------------------------------------------- 

4 

5 METADATA: 

6 

7 File: collect_test_data.py 

8 Project: paperap 

9 Created: 2025-03-04 

10 Version: 0.0.8 

11 Author: Jess Mann 

12 Email: jess@jmann.me 

13 Copyright (c) 2025 Jess Mann 

14 

15---------------------------------------------------------------------------- 

16 

17 LAST MODIFIED: 

18 

19 2025-03-04 By Jess Mann 

20 

21""" 

22 

23from __future__ import annotations 

24 

25import datetime 

26import json 

27import logging 

28import re 

29from decimal import Decimal 

30from pathlib import Path 

31from typing import TYPE_CHECKING, Any, override 

32 

33from faker import Faker 

34from pydantic import field_validator 

35 

36from paperap.exceptions import ModelValidationError 

37from paperap.models import StandardModel 

38from paperap.plugins.base import Plugin 

39from paperap.signals import SignalPriority, registry 

40 

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches every character that is NOT a letter, digit, or one of ``| . = _ -``;
# used to replace such characters when building file names.
sanitize_pattern = re.compile(r"[^a-zA-Z0-9|.=_-]")

# Response keys whose values may hold personal information and are therefore
# replaced with dummy data before a response is written to disk.
# (The original list contained "filename" twice; the duplicate was removed.)
SANITIZE_KEYS = [
    "email",
    "first_name",
    "last_name",
    "name",
    "phone",
    "username",
    "content",
    "filename",
    "title",
    "slug",
    "original_file_name",
    "archived_file_name",
    "task_file_name",
]

61 

62 

class SampleDataCollector(Plugin):
    """
    Plugin to collect test data from API responses.

    Connects handlers to three signals and writes each observed response, after
    sanitizing keys listed in ``SANITIZE_KEYS``, to a JSON file below
    ``test_dir``. A file that already exists is never overwritten.
    """

    name = "test_data_collector"
    description = "Collects sample data from API responses for testing purposes"
    version = "0.0.3"
    # Shared Faker instance used to generate replacement words.
    fake: Faker = Faker()
    # Directory that receives the collected JSON files.
    test_dir: Path = Path("tests/sample_data")

    @field_validator("test_dir", mode="before")
    @classmethod
    def validate_test_dir(cls, value: Any) -> Path:
        """
        Validate the test directory path and make sure it exists.

        Args:
            value: The raw value supplied for ``test_dir``. A falsy value falls
                back to ``tests/sample_data``; a string is converted to a ``Path``.

        Returns:
            The resolved directory, which has been created if it was missing.

        Raises:
            ModelValidationError: If the value is neither a string nor a ``Path``.

        """
        if not value:
            value = Path("tests/sample_data")

        # Convert string path to Path object if needed
        if isinstance(value, str):
            value = Path(value)

        if not isinstance(value, Path):
            raise ModelValidationError("Test directory must be a string or Path object")

        if not value.is_absolute():
            # Make it relative to project root.
            # NOTE(review): parents[4] is four levels above the directory that
            # contains this file. For a ``<root>/src/paperap/plugins/`` layout
            # that would be one level above ``<root>`` -- confirm this is intended.
            project_root = Path(__file__).parents[4]
            value = project_root / value

        value.mkdir(parents=True, exist_ok=True)
        return value

    @override
    def setup(self) -> None:
        """Register signal handlers."""
        registry.connect("resource._handle_response:after", self.save_list_response, SignalPriority.LOW)
        registry.connect("resource._handle_results:before", self.save_first_item, SignalPriority.LOW)
        registry.connect("client.request:after", self.save_parsed_response, SignalPriority.LOW)

    @override
    def teardown(self) -> None:
        """Unregister signal handlers."""
        registry.disconnect("resource._handle_response:after", self.save_list_response)
        registry.disconnect("resource._handle_results:before", self.save_first_item)
        registry.disconnect("client.request:after", self.save_parsed_response)

    @staticmethod
    def _json_serializer(obj: Any) -> Any:
        """
        Serialize objects that are not natively serializable.

        Intended as the ``default`` hook for ``json.dump``/``json.dumps``.

        Raises:
            TypeError: If the object's type is not handled here.

        """
        if isinstance(obj, datetime.datetime):
            return obj.isoformat()
        if isinstance(obj, Path):
            return str(obj)
        if isinstance(obj, Decimal):
            return float(obj)
        # The original had a second, unreachable StandardModel branch calling
        # model_dump(); only the reachable to_dict() branch is kept.
        if isinstance(obj, StandardModel):
            return obj.to_dict()
        if isinstance(obj, set):
            return list(obj)
        if isinstance(obj, bytes):
            return obj.decode("utf-8")
        raise TypeError(f"Type {type(obj).__name__} is not JSON serializable")

    def _sanitize_response(self, **response: Any) -> dict[str, Any]:
        """
        Sanitize the response data to replace any strings with potentially personal information with dummy data

        Args:
            **response: The top-level keys and values of the response.

        Returns:
            A new dict with sanitized values; the host of the ``next`` and
            ``previous`` URLs is replaced with ``example.com``.

        """
        sanitized = {key: self._sanitize_value_recursive(key, value) for key, value in response.items()}

        # Replace the domain of pagination links using regex. "previous" is
        # handled like "next" because it would otherwise keep the real host.
        for link_key in ("next", "previous"):
            link = response.get(link_key)
            if link and isinstance(link, str):
                sanitized[link_key] = re.sub(r"https?://.*?/", "https://example.com/", link)

        return sanitized

    def _sanitize_value_recursive(self, key: str, value: Any) -> Any:
        """
        Recursively sanitize the value to replace any strings with potentially personal information with dummy data

        Args:
            key: The key under which ``value`` was found.
            value: The value to sanitize.

        Returns:
            The sanitized value. Dicts and lists are rebuilt; other values that
            need no sanitizing are returned unchanged.

        """
        if isinstance(value, dict):
            return {k: self._sanitize_value_recursive(k, v) for k, v in value.items()}

        if key in SANITIZE_KEYS:
            if isinstance(value, str):
                return self.fake.word()
            if isinstance(value, list):
                return [self.fake.word() for _ in value]
            return value

        if isinstance(value, list):
            # Descend into lists under non-sensitive keys (e.g. a list of
            # result dicts) so sensitive keys nested inside are sanitized too.
            return [self._sanitize_value_recursive(key, item) for item in value]

        return value

    def save_response(self, filepath: Path, response: dict[str, Any], **kwargs: Any) -> None:
        """
        Save the response to a JSON file.

        Does nothing if ``filepath`` already exists. Failures are logged and
        swallowed so that the plugin never breaks the calling code.

        Args:
            filepath: Destination file.
            response: The response data to sanitize and write.
            **kwargs: Ignored.

        """
        if filepath.exists():
            return

        try:
            response = self._sanitize_response(**response)
            # Serialize completely before touching the file system, so that a
            # serialization failure cannot leave a truncated file behind (which
            # would never be rewritten because of the exists() check above).
            payload = json.dumps(
                response, indent=4, sort_keys=True, ensure_ascii=False, default=self._json_serializer
            )
            filepath.parent.mkdir(parents=True, exist_ok=True)
            # ensure_ascii=False may emit non-ASCII characters, so do not rely
            # on the platform's default encoding.
            with filepath.open("w", encoding="utf-8") as f:
                f.write(payload)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Don't allow the plugin to interfere with normal operations in the event of failure.
            # ValueError covers circular references and undecodable bytes.
            logger.error("Error saving response to file (%s): %s", filepath.absolute(), e)

    def save_list_response(self, sender: Any, response: dict[str, Any] | None, **kwargs: Any) -> dict[str, Any] | None:
        """
        Save the list response to a JSON file.

        The file is named ``<resource>_list.json``, where ``resource`` is taken
        from ``kwargs``. Nothing is saved if the response is empty or no
        resource name was passed.

        Returns:
            The response, unchanged.

        """
        if not response or not (resource_name := kwargs.get("resource")):
            return response

        filepath = self.test_dir / f"{resource_name}_list.json"
        self.save_response(filepath, response)

        return response

    def save_first_item(self, sender: Any, item: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
        """
        Save the first item from a list to a JSON file.

        The file is named ``<resource>_item.json``, where ``resource`` is taken
        from ``kwargs``. Nothing is saved if no resource name was passed.

        Returns:
            The item, unchanged.

        """
        resource_name = kwargs.get("resource")
        if not resource_name:
            return item

        filepath = self.test_dir / f"{resource_name}_item.json"
        self.save_response(filepath, item)

        # Disable this handler after saving the first item
        registry.disable("resource._handle_results:before", self.save_first_item)

        return item

    def save_parsed_response(
        self,
        parsed_response: dict[str, Any],
        method: str,
        params: dict[str, Any] | None,
        json_response: bool,
        endpoint: str,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """
        Save the request data to a JSON file.

        Connects to client.request:after signal.

        The file name is built from the HTTP method (omitted for GET), the last
        two ``/``-separated segments of the endpoint, and the request
        parameters; characters matched by ``sanitize_pattern`` become ``_``.
        Nothing is saved for endpoints containing ``example.com``, for non-JSON
        responses, or for requests without parameters.

        Returns:
            The parsed response, unchanged.

        """
        # Normalize once so the same string is used for the check and the split.
        endpoint_str = str(endpoint)

        # If endpoint contains "example.com", we're testing, so skip it
        if "example.com" in endpoint_str:
            return parsed_response

        if not json_response or not params:
            return parsed_response

        # Keep only the last two path segments of the url
        resource_name = ".".join(endpoint_str.split("/")[-2:])

        params_str = "|".join(f"{k}={v}" for k, v in params.items())
        filename_prefix = ""
        if method.lower() != "get":
            filename_prefix = f"{method.lower()}__"
        filename = f"{filename_prefix}{resource_name}__{params_str}.json"
        filename = sanitize_pattern.sub("_", filename)

        filepath = self.test_dir / filename
        self.save_response(filepath, parsed_response)

        return parsed_response

    @override
    @classmethod
    def get_config_schema(cls) -> dict[str, Any]:
        """Define the configuration schema for this plugin."""
        return {
            "test_dir": {
                "type": str,
                "description": "Directory to save test data files",
                "required": False,
            }
        }

245 }