Coverage for src/paperap/plugins/collect_test_data.py: 98%

121 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-18 12:26 -0400

1""" 

2 

3---------------------------------------------------------------------------- 

4 

5 METADATA: 

6 

7 File: collect_test_data.py 

8 Project: paperap 

9 Created: 2025-03-04 

10 Version: 0.0.7 

11 Author: Jess Mann 

12 Email: jess@jmann.me 

13 Copyright (c) 2025 Jess Mann 

14 

15---------------------------------------------------------------------------- 

16 

17 LAST MODIFIED: 

18 

19 2025-03-04 By Jess Mann 

20 

21""" 

22 

23from __future__ import annotations 

24 

25import datetime 

26import json 

27import logging 

28import re 

29from decimal import Decimal 

30from pathlib import Path 

31from typing import TYPE_CHECKING, Any, override 

32 

33from faker import Faker 

34from pydantic import field_validator 

35 

36from paperap.exceptions import ModelValidationError 

37from paperap.models import StandardModel 

38from paperap.plugins.base import Plugin 

39from paperap.signals import SignalPriority, registry 

40 

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches every character that is NOT a letter, digit, or one of "|", ".", "=", "_", "-".
# Used to turn arbitrary endpoint/parameter text into a safe file name.
sanitize_pattern = re.compile(r"[^a-zA-Z0-9|.=_-]")

# Keys whose values are replaced with fake data before a response is written to disk.
# Each key is listed exactly once.
SANITIZE_KEYS = [
    "email",
    "first_name",
    "last_name",
    "name",
    "phone",
    "username",
    "content",
    "filename",
    "title",
    "slug",
    "original_file_name",
    "archived_file_name",
    "task_file_name",
]

61 

62 

class SampleDataCollector(Plugin):
    """
    Plugin to collect test data from API responses.

    The plugin connects three signal handlers (see ``setup``). Each handler
    passes the payload it receives to ``save_response``, which sanitizes it and
    writes it as JSON below ``test_dir``. Files that already exist are never
    overwritten, and every handler returns its payload unchanged.
    """

    name = "test_data_collector"
    description = "Collects sample data from API responses for testing purposes"
    version = "0.0.3"
    # Shared Faker instance used to generate replacement values.
    fake: Faker = Faker()
    # Directory that receives the collected JSON files.
    test_dir: Path = Path("tests/sample_data")

    @field_validator("test_dir", mode="before")
    @classmethod
    def validate_test_dir(cls, value: Any) -> Path:
        """
        Validate the test directory path and make sure it exists.

        Args:
            value: The configured directory. Falsy values fall back to
                ``tests/sample_data``; strings are converted to ``Path``.

        Returns:
            The directory as a ``Path``. The directory (and any missing
            parents) is created as a side effect.

        Raises:
            ModelValidationError: If the value is neither a string nor a Path.

        """
        if not value:
            value = Path("tests/sample_data")

        # Convert string path to Path object if needed
        if isinstance(value, str):
            value = Path(value)

        if not isinstance(value, Path):
            raise ModelValidationError("Test directory must be a string or Path object")

        value.mkdir(parents=True, exist_ok=True)
        return value

    @override
    def setup(self) -> None:
        """Register signal handlers."""
        registry.connect("resource._handle_response:after", self.save_list_response, SignalPriority.LOW)
        registry.connect("resource._handle_results:before", self.save_first_item, SignalPriority.LOW)
        registry.connect("client.request:after", self.save_parsed_response, SignalPriority.LOW)

    @override
    def teardown(self) -> None:
        """Unregister the signal handlers connected in ``setup``."""
        registry.disconnect("resource._handle_response:after", self.save_list_response)
        registry.disconnect("resource._handle_results:before", self.save_first_item)
        registry.disconnect("client.request:after", self.save_parsed_response)

    @staticmethod
    def _json_serializer(obj: Any) -> Any:
        """
        Serialize objects that are not natively JSON serializable.

        Intended as the ``default`` callback of ``json.dumps``.

        Args:
            obj: The object the JSON encoder could not handle.

        Returns:
            A JSON-compatible representation of ``obj``.

        Raises:
            TypeError: If ``obj`` is of an unsupported type.

        """
        # datetime.datetime is a subclass of datetime.date, so it is covered here too.
        if isinstance(obj, (datetime.date, datetime.time)):
            return obj.isoformat()
        if isinstance(obj, Path):
            return str(obj)
        if isinstance(obj, Decimal):
            return float(obj)
        if isinstance(obj, StandardModel):
            return obj.to_dict()
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        if isinstance(obj, bytes):
            # Sample data is best-effort: undecodable bytes become U+FFFD instead of raising.
            return obj.decode("utf-8", errors="replace")
        raise TypeError(f"Type {type(obj).__name__} is not JSON serializable")

    def _sanitize_response(self, **response: Any) -> dict[str, Any]:
        """
        Sanitize the response data to replace any strings with potentially personal information with dummy data

        Args:
            **response: The top-level key/value pairs of the response.

        Returns:
            A new dict with sanitized values. The host of the ``next`` and
            ``previous`` URLs (when present as strings) is replaced with
            ``example.com``.

        """
        sanitized = {key: self._sanitize_value_recursive(key, value) for key, value in response.items()}

        # Replace the domain of pagination links using regex
        for link_key in ("next", "previous"):
            link = response.get(link_key)
            if isinstance(link, str):
                sanitized[link_key] = re.sub(r"https?://.*?/", "https://example.com/", link)

        return sanitized

    def _sanitize_value_recursive(self, key: str, value: Any) -> Any:
        """
        Recursively sanitize the value to replace any strings with potentially personal information with dummy data

        Args:
            key: The key under which ``value`` is stored.
            value: The value to sanitize.

        Returns:
            The sanitized value. Dicts are rebuilt key by key. For keys listed
            in ``SANITIZE_KEYS`` a string becomes a fake word and a list becomes
            a list of fake words of the same length. Lists under any other key
            are rebuilt element by element so nested dicts are sanitized too.
            Everything else is returned unchanged.

        """
        if isinstance(value, dict):
            return {k: self._sanitize_value_recursive(k, v) for k, v in value.items()}

        if key in SANITIZE_KEYS:
            if isinstance(value, str):
                return self.fake.word()
            if isinstance(value, list):
                return [self.fake.word() for _ in value]
            return value

        if isinstance(value, list):
            # Descend into lists (e.g. a list of result dicts) so nested sensitive keys are not missed.
            return [self._sanitize_value_recursive(key, element) for element in value]

        return value

    def save_response(self, filepath: Path, response: dict[str, Any], **kwargs: Any) -> None:
        """
        Save the response to a JSON file.

        Does nothing when ``filepath`` already exists. Failures are logged and
        swallowed so the plugin never breaks the operation it observes.

        Args:
            filepath: Destination file.
            response: The response data to sanitize and store.
            **kwargs: Accepted and ignored.

        """
        if filepath.exists():
            return

        try:
            sanitized = self._sanitize_response(**response)
            # Serialize fully before touching the disk: a serialization error must not leave
            # a truncated file behind, because the exists() check above would then block retries.
            payload = json.dumps(
                sanitized,
                indent=4,
                sort_keys=True,
                ensure_ascii=False,
                default=self._json_serializer,
            )
            filepath.parent.mkdir(parents=True, exist_ok=True)
            # ensure_ascii=False emits non-ASCII characters, so the encoding must be explicit.
            with filepath.open("w", encoding="utf-8") as f:
                f.write(payload)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Don't allow the plugin to interfere with normal operations in the event of failure
            logger.error("Error saving response to file (%s): %s", filepath.absolute(), e)

    def save_list_response(self, sender: Any, response: dict[str, Any] | None, **kwargs: Any) -> dict[str, Any] | None:
        """
        Save the list response to a JSON file.

        Connected to ``resource._handle_response:after``. The file is named
        ``<resource>_list.json``, where ``resource`` is read from ``kwargs``.
        Nothing is saved when the response is empty or no resource is given.

        Returns:
            ``response``, unchanged.

        """
        if not response or not (resource_name := kwargs.get("resource")):
            return response

        filepath = self.test_dir / f"{resource_name}_list.json"
        self.save_response(filepath, response)

        return response

    def save_first_item(self, sender: Any, item: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
        """
        Save the first item from a list to a JSON file.

        Connected to ``resource._handle_results:before``. The file is named
        ``<resource>_item.json``, where ``resource`` is read from ``kwargs``.

        Returns:
            ``item``, unchanged.

        """
        resource_name = kwargs.get("resource")
        if not resource_name:
            return item

        filepath = self.test_dir / f"{resource_name}_item.json"
        self.save_response(filepath, item)

        # Disable this handler after saving the first item
        # NOTE(review): registry.disable is not visible here; presumably this stops further calls
        # for every resource, not only the current one - confirm.
        registry.disable("resource._handle_results:before", self.save_first_item)

        return item

    def save_parsed_response(
        self,
        parsed_response: dict[str, Any],
        method: str,
        params: dict[str, Any] | None,
        json_response: bool,
        endpoint: str,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """
        Save the request data to a JSON file.

        Connects to client.request:after signal.

        The file name is built from the HTTP method (omitted for GET), the last
        two ``/``-separated segments of the endpoint joined with ``.``, and the
        request parameters as ``key=value`` pairs joined with ``|``. Characters
        not allowed by ``sanitize_pattern`` are replaced with ``_``.

        Nothing is saved when the endpoint contains ``example.com``, when the
        response is not JSON, or when there are no parameters.

        Returns:
            ``parsed_response``, unchanged.

        """
        endpoint_str = str(endpoint)

        # If endpoint contains "example.com", we're testing, so skip it
        if "example.com" in endpoint_str:
            return parsed_response

        if not json_response or not params:
            return parsed_response

        # Strip url to its last two path segments
        resource_name = ".".join(endpoint_str.split("/")[-2:])

        params_str = "|".join(f"{k}={v}" for k, v in params.items())
        filename_prefix = ""
        if method.lower() != "get":
            filename_prefix = f"{method.lower()}__"
        filename = f"{filename_prefix}{resource_name}__{params_str}.json"
        filename = sanitize_pattern.sub("_", filename)

        filepath = self.test_dir / filename
        self.save_response(filepath, parsed_response)

        return parsed_response

    @override
    @classmethod
    def get_config_schema(cls) -> dict[str, Any]:
        """Define the configuration schema for this plugin."""
        return {
            "test_dir": {
                "type": str,
                "description": "Directory to save test data files",
                "required": False,
            }
        }