Coverage for src/paperap/plugins/collect_test_data.py: 98%
124 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-20 13:17 -0400
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-20 13:17 -0400
1"""
3----------------------------------------------------------------------------
5 METADATA:
7 File: collect_test_data.py
8 Project: paperap
9 Created: 2025-03-04
10 Version: 0.0.8
11 Author: Jess Mann
12 Email: jess@jmann.me
13 Copyright (c) 2025 Jess Mann
15----------------------------------------------------------------------------
17 LAST MODIFIED:
19 2025-03-04 By Jess Mann
21"""
23from __future__ import annotations
25import datetime
26import json
27import logging
28import re
29from decimal import Decimal
30from pathlib import Path
31from typing import TYPE_CHECKING, Any, override
33from faker import Faker
34from pydantic import field_validator
36from paperap.exceptions import ModelValidationError
37from paperap.models import StandardModel
38from paperap.plugins.base import Plugin
39from paperap.signals import SignalPriority, registry
# Module-level logger; failures while saving sample data are reported through it.
logger = logging.getLogger(__name__)

# Matches every character that is NOT allowed in a generated sample-data
# filename. Allowed: ASCII letters, digits, and the characters | . = _ -
sanitize_pattern = re.compile(r"[^a-zA-Z0-9|.=_-]")

# Dictionary keys whose string values (or lists of values) are replaced with
# fake words before a response is written to disk.
SANITIZE_KEYS = [
    "email",
    "first_name",
    "last_name",
    "name",
    "phone",
    "username",
    "content",
    "filename",
    "title",
    "slug",
    "original_file_name",
    "archived_file_name",
    "task_file_name",
]
class SampleDataCollector(Plugin):
    """
    Plugin to collect test data from API responses.

    On setup it connects three handlers to the signal registry. Each handler
    writes a sanitized JSON copy of the data it receives into ``test_dir`` and
    returns that data unchanged. Files that already exist are never overwritten.
    """

    name = "test_data_collector"
    description = "Collects sample data from API responses for testing purposes"
    version = "0.0.3"
    # Shared generator for the replacement words used during sanitization.
    fake: Faker = Faker()
    # Directory that receives the JSON files; normalized by validate_test_dir.
    test_dir: Path = Path("tests/sample_data")

    @field_validator("test_dir", mode="before")
    @classmethod
    def validate_test_dir(cls, value: Any) -> Path:
        """
        Validate the test directory path.

        Falsy input falls back to ``tests/sample_data``. Strings are converted
        to ``Path``. Relative paths are resolved against a directory derived
        from this file's location, and the directory is created if missing.

        Raises:
            ModelValidationError: If the value is neither a string nor a Path.

        """
        if not value:
            value = Path("tests/sample_data")

        # Convert string path to Path object if needed
        if isinstance(value, str):
            value = Path(value)

        if not isinstance(value, Path):
            raise ModelValidationError("Test directory must be a string or Path object")

        if not value.is_absolute():
            # Make it relative to project root.
            # NOTE(review): for a module at src/paperap/plugins/<file>.py,
            # parents[3] is the directory containing ``src``; parents[4] is one
            # level above that. Confirm that depth 4 is intended.
            project_root = Path(__file__).parents[4]
            value = project_root / value

        value.mkdir(parents=True, exist_ok=True)
        return value

    @override
    def setup(self) -> None:
        """Register signal handlers."""
        registry.connect("resource._handle_response:after", self.save_list_response, SignalPriority.LOW)
        registry.connect("resource._handle_results:before", self.save_first_item, SignalPriority.LOW)
        registry.connect("client.request:after", self.save_parsed_response, SignalPriority.LOW)

    @override
    def teardown(self) -> None:
        """Unregister signal handlers."""
        registry.disconnect("resource._handle_response:after", self.save_list_response)
        registry.disconnect("resource._handle_results:before", self.save_first_item)
        registry.disconnect("client.request:after", self.save_parsed_response)

    @staticmethod
    def _json_serializer(obj: Any) -> Any:
        """
        Serialize objects that are not natively serializable.

        Used as the ``default`` hook for the json encoder.

        Raises:
            TypeError: If the object's type is not handled here.
            UnicodeDecodeError: If a ``bytes`` value is not valid UTF-8.

        """
        # datetime.datetime is a subclass of datetime.date, so this single
        # check covers both plain dates and full timestamps.
        if isinstance(obj, datetime.date):
            return obj.isoformat()
        if isinstance(obj, Path):
            return str(obj)
        if isinstance(obj, Decimal):
            return float(obj)
        if isinstance(obj, StandardModel):
            return obj.to_dict()
        if isinstance(obj, set):
            return list(obj)
        if isinstance(obj, bytes):
            return obj.decode("utf-8")
        raise TypeError(f"Type {type(obj).__name__} is not JSON serializable")

    def _sanitize_response(self, **response: Any) -> dict[str, Any]:
        """
        Sanitize the response data to replace any strings with potentially personal information with dummy data

        Args:
            **response: The top-level key/value pairs of the response.

        Returns:
            A new dict; the input mapping is not modified.

        """
        sanitized = {key: self._sanitize_value_recursive(key, value) for key, value in response.items()}

        # Pagination links embed the server's host name; point them at a
        # placeholder domain instead.
        for link_key in ("next", "previous"):
            link = response.get(link_key)
            if link and isinstance(link, str):
                sanitized[link_key] = re.sub(r"https?://.*?/", "https://example.com/", link)

        return sanitized

    def _sanitize_value_recursive(self, key: str, value: Any) -> Any:
        """
        Recursively sanitize the value to replace any strings with potentially personal information with dummy data

        Args:
            key: The dictionary key the value was stored under.
            value: The value to sanitize.

        Returns:
            A sanitized copy for dicts and lists; other values are returned
            unchanged unless they are strings stored under a sensitive key.

        """
        if isinstance(value, dict):
            return {k: self._sanitize_value_recursive(k, v) for k, v in value.items()}

        if key in SANITIZE_KEYS:
            if isinstance(value, str):
                return self.fake.word()
            if isinstance(value, list):
                # Every element under a sensitive key is replaced, whatever its type.
                return [self.fake.word() for _ in value]
            return value

        if isinstance(value, list):
            # Lists under non-sensitive keys may hold dicts that carry
            # sensitive keys of their own, so each element is visited.
            return [self._sanitize_value_recursive(key, item) for item in value]

        return value

    def save_response(self, filepath: Path, response: dict[str, Any], **kwargs: Any) -> None:
        """
        Save the response to a JSON file.

        Does nothing when ``filepath`` already exists. Errors are logged and
        swallowed so that a failure here never reaches the caller.

        Args:
            filepath: Destination file.
            response: Data to sanitize and write.
            **kwargs: Accepted and ignored.

        """
        try:
            if filepath.exists():
                return

            sanitized = self._sanitize_response(**response)
            # Serialize fully before touching the file so that a failure
            # cannot leave a truncated file behind; the exists() guard above
            # would otherwise never replace it.
            payload = json.dumps(
                sanitized,
                indent=4,
                sort_keys=True,
                ensure_ascii=False,
                default=self._json_serializer,
            )
            filepath.parent.mkdir(parents=True, exist_ok=True)
            # ensure_ascii=False may emit non-ASCII text, so the encoding is
            # fixed rather than left to the platform default.
            filepath.write_text(payload, encoding="utf-8")
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Don't allow the plugin to interfere with normal operations in the event of failure
            logger.error("Error saving response to file (%s): %s", filepath.absolute(), e)

    def save_list_response(self, sender: Any, response: dict[str, Any] | None, **kwargs: Any) -> dict[str, Any] | None:
        """
        Save the list response to a JSON file.

        Writes ``<resource>_list.json`` when both a response and a ``resource``
        keyword argument are present. Always returns ``response`` unchanged.
        """
        if not response or not (resource_name := kwargs.get("resource")):
            return response

        filepath = self.test_dir / f"{resource_name}_list.json"
        self.save_response(filepath, response)

        return response

    def save_first_item(self, sender: Any, item: dict[str, Any], **kwargs: Any) -> dict[str, Any]:
        """
        Save the first item from a list to a JSON file.

        Writes ``<resource>_item.json`` and then disables this handler on the
        registry. Without a ``resource`` keyword argument nothing is written
        and the handler stays enabled. Always returns ``item`` unchanged.
        """
        resource_name = kwargs.get("resource")
        if not resource_name:
            return item

        filepath = self.test_dir / f"{resource_name}_item.json"
        self.save_response(filepath, item)

        # Disable this handler after saving the first item
        registry.disable("resource._handle_results:before", self.save_first_item)

        return item

    def save_parsed_response(
        self,
        parsed_response: dict[str, Any],
        method: str,
        params: dict[str, Any] | None,
        json_response: bool,
        endpoint: str,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """
        Save the request data to a JSON file.

        Connects to client.request:after signal.

        The filename is built from the HTTP method (omitted for GET), the last
        two path segments of the endpoint, and the query parameters. Nothing
        is written for non-JSON responses or requests without params. Always
        returns ``parsed_response`` unchanged.
        """
        endpoint_str = str(endpoint)

        # If endpoint contains "example.com", we're testing, so skip it
        if "example.com" in endpoint_str:
            return parsed_response

        if not json_response or not params:
            return parsed_response

        # Keep only the last two path segments of the url, joined with "."
        resource_name = ".".join(endpoint_str.split("/")[-2:])

        params_str = "|".join(f"{k}={v}" for k, v in params.items())
        filename_prefix = ""
        if method.lower() != "get":
            filename_prefix = f"{method.lower()}__"
        filename = f"{filename_prefix}{resource_name}__{params_str}.json"
        # Replace every character outside the allowed set with "_"
        filename = sanitize_pattern.sub("_", filename)

        filepath = self.test_dir / filename
        self.save_response(filepath, parsed_response)

        return parsed_response

    @override
    @classmethod
    def get_config_schema(cls) -> dict[str, Any]:
        """Define the configuration schema for this plugin."""
        return {
            "test_dir": {
                "type": str,
                "description": "Directory to save test data files",
                "required": False,
            }
        }
245 }