Coverage for src/paperap/scripts/describe.py: 85%
343 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-20 13:17 -0400
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-20 13:17 -0400
1"""
6----------------------------------------------------------------------------
8METADATA:
10File: describe.py
11 Project: paperap
12Created: 2025-03-18
13 Version: 0.0.8
14Author: Jess Mann
15Email: jess@jmann.me
16 Copyright (c) 2025 Jess Mann
18----------------------------------------------------------------------------
20LAST MODIFIED:
222025-03-18 By Jess Mann
24"""
26from __future__ import annotations
28import argparse
29import base64
30import json
31import logging
32import os
33import re
34import sys
35from datetime import date, datetime
36from enum import StrEnum
37from functools import singledispatchmethod
38from io import BytesIO
39from pathlib import Path
40from typing import Any, Collection, Dict, Iterator, List, Optional, TypeVar, Union, cast
42import dateparser
43import fitz # type: ignore
44import openai
45import openai.types.chat
46import requests
47from alive_progress import alive_bar # type: ignore
48from dotenv import load_dotenv
49from jinja2 import Environment, FileSystemLoader
50from openai import OpenAI
51from PIL import Image, UnidentifiedImageError
52from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, field_validator
54from paperap.client import PaperlessClient
55from paperap.exceptions import DocumentParsingError, NoImagesError
56from paperap.models.document import Document
57from paperap.models.document.queryset import DocumentQuerySet
58from paperap.models.tag import Tag
59from paperap.scripts.utils import ProgressBar, setup_logging
60from paperap.settings import Settings
62logger = logging.getLogger(__name__)
# File extensions this script will attempt to describe (PDFs have their
# embedded images extracted first; TIFF/BMP are converted to PNG).
DESCRIBE_ACCEPTED_FORMATS = ["png", "jpg", "jpeg", "gif", "tif", "tiff", "bmp", "webp", "pdf"]
# Formats the OpenAI vision endpoint accepts directly, without conversion.
OPENAI_ACCEPTED_FORMATS = ["png", "jpg", "jpeg", "gif", "webp", "pdf"]
# Maps a file extension to its MIME type.
MIME_TYPES = {
    "png": "image/png",
    "jpeg": "image/jpeg",
    "jpg": "image/jpeg",
    "gif": "image/gif",
    "webp": "image/webp",
}
class ScriptDefaults(StrEnum):
    """Default tag names and model name used by the describe script."""

    NEEDS_DESCRIPTION = "needs-description"  # tag marking documents awaiting a description
    DESCRIBED = "described"  # tag applied once a description has been added
    NEEDS_TITLE = "needs-title"  # tag marking documents whose title should be replaced
    NEEDS_DATE = "needs-date"  # tag marking documents whose created date should be inferred
    MODEL = "gpt-4o-mini"  # fallback OpenAI model when none is configured in settings


# Embedded in every generated description for traceability (see process_response).
SCRIPT_VERSION = "0.2.2"
class DescribePhotos(BaseModel):
    """
    Describes photos in the Paperless NGX instance using an LLM (such as OpenAI's GPT-4o-mini model).
    """

    # Maximum worker threads; 0 lets validate_max_threads derive a default from the CPU count.
    max_threads: int = 0
    # Tag used to select which documents to describe.
    paperless_tag: str | None = Field(default=ScriptDefaults.NEEDS_DESCRIPTION)
    # Optional prompt override; when None, a jinja template renders the prompt.
    prompt: str | None = Field(None)
    # Paperless NGX API client (also carries the OpenAI settings).
    client: PaperlessClient
    # Lazily-created collaborators — see the corresponding properties below.
    _jinja_env: Environment | None = PrivateAttr(default=None)
    _progress_bar: ProgressBar | None = PrivateAttr(default=None)
    _progress_message: str | None = PrivateAttr(default=None)
    _openai: OpenAI | None = PrivateAttr(default=None)

    # Needed so pydantic accepts non-pydantic types (PaperlessClient, OpenAI, ...).
    model_config = ConfigDict(arbitrary_types_allowed=True)
102 @property
103 def progress_bar(self) -> ProgressBar:
104 if not self._progress_bar:
105 self._progress_bar = alive_bar(title="Running", unknown="waves") # type: ignore
106 return self._progress_bar # type: ignore # pyright not handling the protocol correctly, not sure why
    @property
    def openai_url(self) -> str | None:
        """Custom OpenAI-compatible endpoint URL from client settings, if any."""
        return self.client.settings.openai_url

    @property
    def openai_key(self) -> str | None:
        """OpenAI API key from client settings, if any."""
        return self.client.settings.openai_key

    @property
    def openai_model(self) -> str:
        """Configured model name, falling back to the script default."""
        return self.client.settings.openai_model or ScriptDefaults.MODEL
120 @property
121 def openai(self) -> OpenAI:
122 if not self._openai:
123 if self.openai_url:
124 logger.info("Using custom OpenAI URL: %s", self.openai_url)
125 self._openai = OpenAI(api_key=self.openai_key, base_url=self.openai_url)
126 else:
127 logger.info("Using default OpenAI URL")
128 self._openai = OpenAI()
129 return self._openai
131 @field_validator("max_threads", mode="before")
132 @classmethod
133 def validate_max_threads(cls, value) -> int:
134 # Sensible default
135 if not value:
136 # default is between 1-4 threads. More than 4 presumptively stresses the HDD non-optimally.
137 if not (cpu_count := os.cpu_count()):
138 cpu_count = 1
139 return max(1, min(4, round(cpu_count / 2)))
141 if value < 1:
142 raise ValueError("max_threads must be a positive integer.")
143 return value
145 @property
146 def jinja_env(self) -> Environment:
147 if not self._jinja_env:
148 templates_path = Path(__file__).parent / "templates"
149 self._jinja_env = Environment(loader=FileSystemLoader(str(templates_path)), autoescape=True)
150 return self._jinja_env
152 def choose_template(self, document: Document) -> str:
153 """
154 Choose a jinja template for a document
155 """
156 return "photo.jinja"
158 def get_prompt(self, document: Document) -> str:
159 """
160 Generate a prompt to sent to openai using a jinja template.
161 """
162 if self.prompt:
163 return self.prompt
165 template_name = self.choose_template(document)
166 template_path = f"templates/{template_name}"
167 logger.debug("Using template: %s", template_path)
168 template = self.jinja_env.get_template(template_path)
170 if not (description := template.render(document=document)):
171 raise ValueError("Failed to generate prompt.")
173 return description
    def extract_images_from_pdf(self, pdf_bytes: bytes, max_images: int = 2) -> list[bytes]:
        """
        Extract up to ``max_images`` embedded images from a PDF file.

        Args:
            pdf_bytes (bytes): The PDF file content as bytes.
            max_images (int): Maximum number of images to extract. Defaults to 2.

        Returns:
            list[bytes]: Raw bytes of each extracted image, in page order.

        Raises:
            NoImagesError: If the PDF contains no images at all.
            DocumentParsingError: If the PDF cannot be parsed, or images exist
                but none could be extracted.
        """
        results: list[bytes] = []
        # Counts every image encountered, including ones that failed to extract,
        # so we can distinguish "no images" from "extraction failed" below.
        image_count = 0
        try:
            # Open the PDF from bytes
            pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")

            for page_number in range(len(pdf_document)):
                if len(results) >= max_images:
                    break

                page = pdf_document[page_number]
                images = page.get_images(full=True)

                if not images:
                    continue

                for image in images:
                    image_count += 1
                    if len(results) >= max_images:
                        break

                    try:
                        # image[0] is the xref of the image object in the PDF.
                        xref = image[0]
                        base_image = pdf_document.extract_image(xref)
                        image_bytes = base_image["image"]
                        results.append(image_bytes)
                        logger.debug(f"Extracted image from page {page_number + 1} of the PDF.")
                    except Exception as e:
                        count = len(results)
                        logger.error(
                            "Failed to extract one image from page %s of PDF. Result count %s: %s",
                            page_number + 1,
                            count,
                            e,
                        )
                        # Abort only when nothing has been extracted yet; the
                        # re-raise is wrapped into DocumentParsingError below.
                        if count < 1:
                            raise

        except Exception as e:
            logger.error(f"extract_images_from_pdf: Error extracting image from PDF: {e}")
            raise DocumentParsingError("Error extracting image from PDF.") from e

        if not results:
            if image_count < 1:
                raise NoImagesError("No images found in the PDF")
            raise DocumentParsingError("Unable to extract images from PDF.")

        return results
235 def parse_date(self, date_str: str) -> date | None:
236 """
237 Parse a date string.
239 Args:
240 date_str (str): The date string to parse.
242 Returns:
243 date: The parsed date.
245 """
246 if not (parsed_date := self.parse_datetime(date_str)):
247 return None
248 return parsed_date.date()
250 def parse_datetime(self, date_str: str) -> datetime | None:
251 """
252 Parse a date string.
254 Args:
255 date_str (str): The date string to parse.
257 Returns:
258 date: The parsed date.
260 """
261 if not date_str:
262 return None
264 date_str = str(date_str).strip()
266 # "Date unknown" or "Unknown date" or "No date"
267 if re.match(r"(date unknown|unknown date|no date|none|unknown|n/?a)$", date_str, re.IGNORECASE):
268 return None
270 # Handle "circa 1950"
271 if matches := re.match(r"((around|circa|mid|early|late|before|after) *)?(\d{4})s?$", date_str, re.IGNORECASE):
272 date_str = f"{matches.group(3)}-01-01"
274 parsed_date = dateparser.parse(date_str)
275 if not parsed_date:
276 raise ValueError(f"Invalid date format: {date_str=}")
277 return parsed_date
279 def standardize_image_contents(self, content: bytes) -> list[str]:
280 """
281 Standardize image contents to base64-encoded PNG format.
282 """
283 try:
284 return [self._convert_to_png(content)]
285 except Exception as e:
286 logger.debug(f"Failed to convert contents to png, will try other methods: {e}")
288 # Interpret it as a pdf
289 if image_contents_list := self.extract_images_from_pdf(content):
290 return [self._convert_to_png(image) for image in image_contents_list]
292 return []
294 def _convert_to_png(self, content: bytes) -> str:
295 img = Image.open(BytesIO(content))
297 # Resize large images
298 if img.size[0] > 1024 or img.size[1] > 1024:
299 img.thumbnail((1024, 1024))
301 # Re-save it as PNG in-memory
302 buf = BytesIO()
303 img.save(buf, format="PNG")
304 buf.seek(0)
306 # Convert to base64
307 return base64.b64encode(buf.read()).decode("utf-8")
    def _send_describe_request(self, content: bytes | list[bytes], document: Document) -> str | None:
        """
        Send an image description request to OpenAI.

        Args:
            content: Document content as bytes or list of bytes.
            document: The document to describe.

        Returns:
            str | None: The description generated by OpenAI, or None when a
                recoverable error was logged (bad file data, bad image format,
                prompt failure).

        Raises:
            NoImagesError: If no images could be produced from the content.
            openai.APIConnectionError: If the OpenAI endpoint is unreachable
                (re-raised after logging — likely a configuration problem).
        """
        description: str | None = None
        if not isinstance(content, list):
            content = [content]

        try:
            # Convert all images to standardized format (base64-encoded PNGs)
            images = []
            for image_content in content:
                images.extend(self.standardize_image_contents(image_content))

            if not images:
                raise NoImagesError("No images found to describe.")

            # First message part is the text prompt; one image_url part per image follows.
            message_contents: List[Dict[str, Any]] = [
                {
                    "type": "text",
                    "text": self.get_prompt(document),
                }
            ]

            for image in images:
                message_contents.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image}"},
                    }
                )

            response = self.openai.chat.completions.create(
                model=self.openai_model,
                messages=[
                    {"role": "user", "content": message_contents}  # type: ignore
                ],
                max_tokens=500,
            )
            description = response.choices[0].message.content
            logger.debug(f"Generated description: {description}")

        # Corrupt or unreadable file data (raised by fitz during PDF extraction):
        # log and fall through to return None so the caller can skip this document.
        except fitz.FileDataError as fde:
            logger.error(
                "Failed to generate description due to error reading file #%s: %s -> %s",
                document.id,
                document.original_file_name,
                fde,
            )

        # E.g. get_prompt rendering an empty prompt; recoverable, returns None.
        except ValueError as ve:
            logger.warning(
                "Failed to generate description for document #%s: %s. Continuing with next image -> %s",
                document.id,
                document.original_file_name,
                ve,
            )

        # PIL could not identify the image format; recoverable, returns None.
        except UnidentifiedImageError as uii:
            logger.warning(
                "Failed to identify image format for document #%s: %s. Continuing with next image -> %s",
                document.id,
                document.original_file_name,
                uii,
            )

        # Connection failures usually mean a misconfigured URL/model: fatal, re-raise.
        except openai.APIConnectionError as ace:
            logger.error(
                "API Connection Error. Is the OpenAI API URL correct? URL: %s, model: %s -> %s",
                self.openai_url,
                self.openai_model,
                ace,
            )
            raise

        return description
394 def convert_image_to_jpg(self, bytes_content: bytes) -> bytes:
395 """
396 Convert an image to JPEG format.
398 Args:
399 bytes_content (bytes): The image content as bytes.
401 Returns:
402 bytes: The image content as JPEG.
404 """
405 try:
406 img = Image.open(BytesIO(bytes_content))
407 buf = BytesIO()
408 img.save(buf, format="JPEG")
409 buf.seek(0)
410 return buf.read()
411 except Exception as e:
412 logger.error(f"Failed to convert image to JPEG: {e}")
413 raise
    def describe_document(self, document: Document) -> bool:
        """
        Describe a single document using OpenAI's GPT-4o model.

        The document object passed in will be updated with the description.

        Args:
            document: The document to describe.

        Returns:
            bool: True if the document was successfully described.

        Raises:
            requests.RequestException: Re-raised after logging (network-level
                failures abort the run rather than being skipped).
        """
        response = None
        try:
            logger.debug(f"Describing document {document.id} using OpenAI...")

            if not (content := document.content):
                logger.error("Document content is empty for document #%s", document.id)
                return False

            # Ensure accepted format
            original_file_name = (document.original_file_name or "").lower()
            if not any(original_file_name.endswith(ext) for ext in DESCRIBE_ACCEPTED_FORMATS):
                logger.error(f"Document {document.id} has unsupported extension: {original_file_name}")
                return False

            try:
                # Convert content to bytes if it's a string
                # NOTE(review): document.content arriving as a str suggests it may be
                # extracted text rather than the original file bytes, yet it is fed
                # to the image pipeline here — confirm `content` holds binary data.
                content_bytes = content if isinstance(content, bytes) else content.encode("utf-8")
                if not (response := self._send_describe_request(content_bytes, document)):
                    logger.error(f"OpenAI returned empty description for document {document.id}.")
                    return False
            except NoImagesError as nie:
                # Not an error for this workflow: some documents simply contain no images.
                logger.debug(f"No images found in document {document.id}: {nie}")
                return False
            except DocumentParsingError as dpe:
                logger.error(f"Failed to parse document {document.id}: {dpe}")
                return False
            except openai.BadRequestError as e:
                # invalid_image_format is expected occasionally -> debug log only;
                # anything else is logged as an error. Either way, skip the document.
                if "invalid_image_format" not in str(e):
                    logger.error(
                        "Failed to generate description for document #%s: %s -> %s",
                        document.id,
                        document.original_file_name,
                        e,
                    )
                    return False

                logger.debug("Bad format for document #%s: %s -> %s", document.id, document.original_file_name, e)
                return False

            # Process the response
            self.process_response(response, document)
        except requests.RequestException as e:
            logger.error(f"Failed to describe document {document.id}. {response=} => {e}")
            raise

        return True
475 def process_response(self, response: str, document: Document) -> Document:
476 """
477 Process the response from OpenAI and update the document.
479 Args:
480 response (str): The response from OpenAI
481 document (Document): The document to update
483 Returns:
484 Document: The updated document
486 """
487 # Attempt to parse response as json
488 try:
489 if not (parsed_response := json.loads(response)):
490 logger.debug("Unable to process response after failed json parsing")
491 return document
492 except json.JSONDecodeError as jde:
493 logger.error("Failed to parse response as JSON: %s", jde)
494 return document
496 # Check if parsed_response is a dictionary
497 if not isinstance(parsed_response, dict):
498 logger.error(
499 "Parsed response not a dictionary. Saving response raw to document.content. Document #%s: %s",
500 document.id,
501 document.original_file_name,
502 )
503 document.append_content(response)
504 return document
506 # Attempt to grab "title", "description", "tags", "date" from parsed_response
507 title = parsed_response.get("title", None)
508 description = parsed_response.get("description", None)
509 summary = parsed_response.get("summary", None)
510 content = parsed_response.get("content", None)
511 tags = parsed_response.get("tags", None)
512 date = parsed_response.get("date", None)
513 full_description = f"""AI IMAGE DESCRIPTION (v{SCRIPT_VERSION}):
514 The following description was provided by an Artificial Intelligence (GPT-4o by OpenAI).
515 It may not be fully accurate. Its purpose is to provide keywords and context
516 so that the document can be more easily searched.
517 Suggested Title: {title}
518 Inferred Date: {date}
519 Suggested Tags: {tags}
520 Previous Title: {document.title}
521 Previous Date: {document.created}
522 """
524 if summary:
525 full_description += f"\n\nSummary: {summary}"
526 if content:
527 full_description += f"\n\nContent: {content}"
528 if description:
529 full_description += f"\n\nDescription: {description}"
530 if not any([description, summary, content]):
531 full_description += f"\n\nFull AI Response: {parsed_response}"
533 if title and ScriptDefaults.NEEDS_TITLE in document.tag_names:
534 try:
535 document.title = str(title)
536 document.remove_tag(ScriptDefaults.NEEDS_TITLE)
537 except Exception as e:
538 logger.error(
539 "Failed to update document title. Document #%s: %s -> %s",
540 document.id,
541 document.original_file_name,
542 e,
543 )
545 if date and "ScriptDefaults.NEEDS_DATE" in document.tag_names:
546 try:
547 document.created = date # type: ignore # pydantic will handle casting
548 document.remove_tag("ScriptDefaults.NEEDS_DATE")
549 except Exception as e:
550 logger.error(
551 "Failed to update document date. Document #%s: %s -> %s",
552 document.id,
553 document.original_file_name,
554 e,
555 )
557 # Append the description to the document
558 document.content = full_description
559 document.remove_tag("ScriptDefaults.NEEDS_DESCRIPTION")
560 document.add_tag("described")
562 logger.debug(f"Successfully described document {document.id}")
563 return document
565 def describe_documents(self, documents: list[Document] | None = None) -> list[Document]:
566 """
567 Describe a list of documents using OpenAI's GPT-4o model.
569 Args:
570 documents (list[Document]): The documents to describe.
572 Returns:
573 list[Document]: The documents with the descriptions added.
575 """
576 logger.info("Fetching documents to describe...")
577 if documents is None:
578 documents = list(self.client.documents().filter(tag_name=self.paperless_tag))
580 total = len(documents)
581 logger.info(f"Found {total} documents to describe")
583 results = []
584 with alive_bar(total=total, title="Describing documents", bar="classic") as self._progress_bar:
585 for document in documents:
586 if self.describe_document(document):
587 results.append(document)
588 self.progress_bar()
589 return results
class ArgNamespace(argparse.Namespace):
    """
    A custom namespace class for argparse.

    Declares the attribute names and types that main() expects after parsing.
    """

    url: str  # base URL of the Paperless NGX instance
    key: str  # API token for the Paperless NGX instance
    model: str | None = None  # OpenAI model name override
    openai_url: str | None = None  # custom OpenAI-compatible endpoint URL
    tag: str  # tag used to filter documents
    prompt: str | None = None  # prompt override (skips jinja templating)
    verbose: bool = False  # enable debug logging
def main():
    """
    Run the script: parse CLI arguments, build the Paperless client, and
    describe all documents carrying the configured tag.

    Exits with status 1 on missing configuration or unexpected errors,
    status 0 on user cancellation (Ctrl-C).
    """
    logger = setup_logging()
    try:
        # Load .env before reading env vars so the fallbacks below can see it.
        load_dotenv()

        parser = argparse.ArgumentParser(description="Describe documents with AI in Paperless-ngx")
        # BUG FIX: --url/--key previously defaulted to None while the error
        # messages below referenced PAPERLESS_URL/PAPERLESS_KEY — the script
        # exited before Settings could ever read those env vars. Use them as
        # argparse defaults so either the flag or the env var works.
        parser.add_argument(
            "--url",
            type=str,
            default=os.getenv("PAPERLESS_URL"),
            help="The base URL of the Paperless NGX instance",
        )
        parser.add_argument(
            "--key",
            type=str,
            default=os.getenv("PAPERLESS_KEY"),
            help="The API token for the Paperless NGX instance",
        )
        parser.add_argument("--model", type=str, default=None, help="The OpenAI model to use")
        parser.add_argument("--openai-url", type=str, default=None, help="The base URL for the OpenAI API")
        parser.add_argument("--tag", type=str, default=ScriptDefaults.NEEDS_DESCRIPTION, help="Tag to filter documents")
        parser.add_argument("--prompt", type=str, default=None, help="Prompt to use for OpenAI")
        parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

        args = parser.parse_args(namespace=ArgNamespace())

        if args.verbose:
            logger.setLevel(logging.DEBUG)

        if not args.url:
            logger.error("PAPERLESS_URL environment variable is not set.")
            sys.exit(1)

        if not args.key:
            logger.error("PAPERLESS_KEY environment variable is not set.")
            sys.exit(1)

        # Exclude None, so pydantic settings loads from defaults for an unset param
        config = {
            k: v
            for k, v in {
                "base_url": args.url,
                "token": args.key,
                "openai_url": args.openai_url,
                "openai_model": args.model,
            }.items()
            if v is not None
        }
        # Cast to Any to avoid type checking issues with **kwargs
        settings = Settings(**cast(Any, config))
        client = PaperlessClient(settings)

        paperless = DescribePhotos(client=client, prompt=args.prompt)

        logger.info(f"Starting document description process with model: {paperless.openai_model}")
        results = paperless.describe_documents()

        if results:
            logger.info(f"Successfully described {len(results)} documents")
        else:
            logger.info("No documents described.")

    except KeyboardInterrupt:
        logger.info("Script cancelled by user.")
        sys.exit(0)
    except Exception as e:
        logger.error(f"An error occurred: {e}", exc_info=True)
        sys.exit(1)
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()