# Source code for paperap.plugins.collect_test_data

"""

----------------------------------------------------------------------------

   METADATA:

       File:    collect_test_data.py
       Project: paperap
       Created: 2025-03-04
       Version: 0.0.10
       Author:  Jess Mann
       Email:   jess@jmann.me
       Copyright (c) 2025 Jess Mann

----------------------------------------------------------------------------

   LAST MODIFIED:

       2025-03-04     By Jess Mann

"""

from __future__ import annotations

import datetime
import json
import logging
import re
from decimal import Decimal
from pathlib import Path
from typing import TYPE_CHECKING, Any, override

from faker import Faker
from pydantic import HttpUrl, field_validator

from paperap.exceptions import ModelValidationError
from paperap.models import StandardModel
from paperap.plugins.base import Plugin
from paperap.signals import SignalPriority, registry

logger = logging.getLogger(__name__)

sanitize_pattern = re.compile(r"[^a-zA-Z0-9|.=_-]")

SANITIZE_KEYS = [
    "email",
    "first_name",
    "last_name",
    "name",
    "phone",
    "username",
    "content",
    "filename",
    "title",
    "slug",
    "original_filename",
    "archived_file_name",
    "task_file_name",
    "filename",
]

type ClientResponse = dict[str, Any] | list[dict[str, Any]]


[docs] class SampleDataCollector(Plugin): """ Plugin to collect test data from API responses. """ name = "test_data_collector" description = "Collects sample data from API responses for testing purposes" version = "0.0.3" fake: Faker = Faker() test_dir: Path = Path("tests/sample_data")
[docs] @field_validator("test_dir", mode="before") @classmethod def validate_test_dir(cls, value: Any) -> Path | None: """Validate the test directory path.""" # Convert string path to Path object if needed if not value: value = Path("tests/sample_data") if isinstance(value, str): value = Path(value) if not isinstance(value, Path): raise ModelValidationError("Test directory must be a string or Path object") if not value.is_absolute(): # Make it relative to project root project_root = Path(__file__).parents[4] value = project_root / value value.mkdir(parents=True, exist_ok=True) return value
[docs] @override def setup(self) -> None: """Register signal handlers.""" registry.connect("resource._handle_response:after", self.save_list_response, SignalPriority.LOW) registry.connect("resource._handle_results:before", self.save_first_item, SignalPriority.LOW) registry.connect("client.request:after", self.save_parsed_response, SignalPriority.LOW)
[docs] @override def teardown(self) -> None: """Unregister signal handlers.""" registry.disconnect("resource._handle_response:after", self.save_list_response) registry.disconnect("resource._handle_results:before", self.save_first_item) registry.disconnect("client.request:after", self.save_parsed_response)
@staticmethod def _json_serializer(obj: Any) -> Any: """Serialize objects that are not natively serializable.""" if isinstance(obj, datetime.datetime): return obj.isoformat() if isinstance(obj, Path): return str(obj) if isinstance(obj, Decimal): return float(obj) if isinstance(obj, StandardModel): return obj.to_dict() if isinstance(obj, StandardModel): return obj.model_dump() if isinstance(obj, set): return list(obj) if isinstance(obj, bytes): return obj.decode("utf-8") raise TypeError(f"Type {type(obj).__name__} is not JSON serializable") def _sanitize_list_response[R: list[dict[str, Any]]](self, response: R) -> R: """ Sanitize the response data to replace any strings with potentially personal information with dummy data """ sanitized_list: R = [] # type: ignore for item in response: sanitized_item = self._sanitize_value_recursive("", item) sanitized_list.append(sanitized_item) # type: ignore return sanitized_list def _sanitize_dict_response[R: dict[str, Any]](self, **response: R) -> R: """ Sanitize the response data to replace any strings with potentially personal information with dummy data """ sanitized: dict[str, Any] = {} for key, value in response.items(): sanitized[key] = self._sanitize_value_recursive(key, value) # Replace "next" domain using regex if (next_page := response.get("next", None)) and isinstance(next_page, str): sanitized["next"] = re.sub(r"https?://.*?/", "https://example.com/", next_page) return sanitized # type: ignore def _sanitize_value_recursive(self, key: str, value: Any) -> Any: """ Recursively sanitize the value to replace any strings with potentially personal information with dummy data """ if isinstance(value, dict): return {k: self._sanitize_value_recursive(k, v) for k, v in value.items()} if key in SANITIZE_KEYS: if isinstance(value, str): return self.fake.word() if isinstance(value, list): return [self.fake.word() for _ in value] return value
[docs] def save_response(self, filepath: Path, response: ClientResponse | None, **kwargs: Any) -> None: """ Save the response to a JSON file. """ if not response or filepath.exists(): return try: if isinstance(response, list): response = self._sanitize_list_response(response) else: response = self._sanitize_dict_response(**response) filepath.parent.mkdir(parents=True, exist_ok=True) with filepath.open("w") as f: json.dump(response, f, indent=4, sort_keys=True, ensure_ascii=False, default=self._json_serializer) except (TypeError, OverflowError, OSError) as e: # Don't allow the plugin to interfere with normal operations in the event of failure logger.error("Error saving response to file (%s): %s", filepath.absolute(), e)
[docs] def save_list_response[R: ClientResponse | None](self, sender: Any, response: R, **kwargs: Any) -> R: """Save the list response to a JSON file.""" if not response or not (resource_name := kwargs.get("resource")): return response filepath = self.test_dir / f"{resource_name}_list.json" self.save_response(filepath, response) return response
[docs] def save_first_item[R: dict[str, Any]](self, sender: Any, item: R, **kwargs: Any) -> R: """Save the first item from a list to a JSON file.""" resource_name = kwargs.get("resource") if not resource_name: return item filepath = self.test_dir / f"{resource_name}_item.json" self.save_response(filepath, item) # Disable this handler after saving the first item registry.disable("resource._handle_results:before", self.save_first_item) return item
[docs] def save_parsed_response( self, parsed_response: dict[str, Any], method: str, params: dict[str, Any] | None, json_response: bool, endpoint: str | HttpUrl, **kwargs: Any, ) -> dict[str, Any]: """ Save the request data to a JSON file. Connects to client.request:after signal. """ if not endpoint: raise ValueError("Endpoint is required to save parsed response") endpoint = str(endpoint) # If endpoint contains "example.com", we're testing, so skip it if "example.com" in str(endpoint): return parsed_response if not json_response or not params: return parsed_response # Strip url to final path segment resource_name = ".".join(endpoint.split("/")[-2:]) combined_params = list(f"{k}={v}" for k, v in params.items()) params_str = "|".join(combined_params) filename_prefix = "" if method.lower() != "get": filename_prefix = f"{method.lower()}__" filename = f"{filename_prefix}{resource_name}__{params_str}.json" filename = sanitize_pattern.sub("_", filename) filename = filename[:100] # Limit filename length filepath = self.test_dir / filename self.save_response(filepath, parsed_response) return parsed_response
[docs] @override @classmethod def get_config_schema(cls) -> dict[str, Any]: """Define the configuration schema for this plugin.""" return { "test_dir": { "type": str, "description": "Directory to save test data files", "required": False, } }