Coverage for src/paperap/scripts/describe.py: 85%

343 statements  

coverage.py v7.6.12, created at 2025-03-20 13:17 -0400

1""" 

2 

3 

4 

5 

6---------------------------------------------------------------------------- 

7 

8METADATA: 

9 

10File: describe.py 

11 Project: paperap 

12Created: 2025-03-18 

13 Version: 0.0.8 

14Author: Jess Mann 

15Email: jess@jmann.me 

16 Copyright (c) 2025 Jess Mann 

17 

18---------------------------------------------------------------------------- 

19 

20LAST MODIFIED: 

21 

222025-03-18 By Jess Mann 

23 

24""" 

from __future__ import annotations

import argparse
import base64
import json
import logging
import os
import re
import sys
from datetime import date, datetime
from enum import StrEnum
from functools import singledispatchmethod
from io import BytesIO
from pathlib import Path
from typing import Any, Collection, Dict, Iterator, List, Optional, TypeVar, Union, cast

import dateparser
import fitz  # type: ignore
import openai
import openai.types.chat
import requests
from alive_progress import alive_bar  # type: ignore
from dotenv import load_dotenv
from jinja2 import Environment, FileSystemLoader
from openai import OpenAI
from PIL import Image, UnidentifiedImageError
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, field_validator

from paperap.client import PaperlessClient
from paperap.exceptions import DocumentParsingError, NoImagesError
from paperap.models.document import Document
from paperap.models.document.queryset import DocumentQuerySet
from paperap.models.tag import Tag
from paperap.scripts.utils import ProgressBar, setup_logging
from paperap.settings import Settings

logger = logging.getLogger(__name__)

DESCRIBE_ACCEPTED_FORMATS = ["png", "jpg", "jpeg", "gif", "tif", "tiff", "bmp", "webp", "pdf"]
OPENAI_ACCEPTED_FORMATS = ["png", "jpg", "jpeg", "gif", "webp", "pdf"]
MIME_TYPES = {
    "png": "image/png",
    "jpeg": "image/jpeg",
    "jpg": "image/jpeg",
    "gif": "image/gif",
    "webp": "image/webp",
}
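
# Note: DESCRIBE_ACCEPTED_FORMATS gates which file extensions describe_document()
# will process; every input is normalized to base64 PNG by
# standardize_image_contents() (PDFs go through extract_images_from_pdf() first)
# before it is sent to OpenAI, which is why it is a superset of
# OPENAI_ACCEPTED_FORMATS.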


class ScriptDefaults(StrEnum):
    NEEDS_DESCRIPTION = "needs-description"
    DESCRIBED = "described"
    NEEDS_TITLE = "needs-title"
    NEEDS_DATE = "needs-date"
    MODEL = "gpt-4o-mini"


SCRIPT_VERSION = "0.2.2"


class DescribePhotos(BaseModel):
    """
    Describes photos in the Paperless NGX instance using an LLM (such as OpenAI's GPT-4o-mini model).
    """

    max_threads: int = 0
    paperless_tag: str | None = Field(default=ScriptDefaults.NEEDS_DESCRIPTION)
    prompt: str | None = Field(None)
    client: PaperlessClient
    _jinja_env: Environment | None = PrivateAttr(default=None)
    _progress_bar: ProgressBar | None = PrivateAttr(default=None)
    _progress_message: str | None = PrivateAttr(default=None)
    _openai: OpenAI | None = PrivateAttr(default=None)

    model_config = ConfigDict(arbitrary_types_allowed=True)
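
    # Usage sketch (illustrative; assumes a configured PaperlessClient named `client`):
    #
    #   describer = DescribePhotos(client=client)
    #   updated = describer.describe_documents()
    #
    # With no explicit document list, describe_documents() pulls every document
    # tagged `needs-description` and updates each one in place via OpenAI.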

    @property
    def progress_bar(self) -> ProgressBar:
        if not self._progress_bar:
            self._progress_bar = alive_bar(title="Running", unknown="waves")  # type: ignore
        return self._progress_bar  # type: ignore # pyright not handling the protocol correctly, not sure why

    @property
    def openai_url(self) -> str | None:
        return self.client.settings.openai_url

    @property
    def openai_key(self) -> str | None:
        return self.client.settings.openai_key

    @property
    def openai_model(self) -> str:
        return self.client.settings.openai_model or ScriptDefaults.MODEL

    @property
    def openai(self) -> OpenAI:
        if not self._openai:
            if self.openai_url:
                logger.info("Using custom OpenAI URL: %s", self.openai_url)
                self._openai = OpenAI(api_key=self.openai_key, base_url=self.openai_url)
            else:
                logger.info("Using default OpenAI URL")
                self._openai = OpenAI()
        return self._openai

    @field_validator("max_threads", mode="before")
    @classmethod
    def validate_max_threads(cls, value: Any) -> int:
        # Apply a sensible default when no value is provided: between 1 and 4
        # threads; more than 4 tends to stress the disk without improving throughput.
        if not value:
            if not (cpu_count := os.cpu_count()):
                cpu_count = 1
            return max(1, min(4, round(cpu_count / 2)))

        if value < 1:
            raise ValueError("max_threads must be a positive integer.")
        return value
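
    # Worked example of the default: with os.cpu_count() == 8 the validator
    # returns max(1, min(4, round(8 / 2))) == 4; with a single core it returns
    # max(1, min(4, round(1 / 2))) == 1, so the default always lands between 1 and 4.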

    @property
    def jinja_env(self) -> Environment:
        if not self._jinja_env:
            templates_path = Path(__file__).parent / "templates"
            self._jinja_env = Environment(loader=FileSystemLoader(str(templates_path)), autoescape=True)
        return self._jinja_env

    def choose_template(self, document: Document) -> str:
        """
        Choose a jinja template for a document.
        """
        return "photo.jinja"

    def get_prompt(self, document: Document) -> str:
        """
        Generate a prompt to send to OpenAI using a jinja template.
        """
        if self.prompt:
            return self.prompt

        template_name = self.choose_template(document)
        logger.debug("Using template: %s", template_name)
        # The Jinja environment is rooted at the templates directory,
        # so templates are loaded by bare name.
        template = self.jinja_env.get_template(template_name)

        if not (description := template.render(document=document)):
            raise ValueError("Failed to generate prompt.")

        return description
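
    # Illustrative sketch (assumes `describer` and `document` as in the usage
    # sketch above): with self.prompt unset, get_prompt() renders photo.jinja
    # with the document as context.
    #
    #   prompt_text = describer.get_prompt(document)
    #
    # Passing prompt="..." at construction time bypasses the template entirely.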

    def extract_images_from_pdf(self, pdf_bytes: bytes, max_images: int = 2) -> list[bytes]:
        """
        Extract up to `max_images` images from a PDF file.

        Args:
            pdf_bytes (bytes): The PDF file content as bytes.
            max_images (int): The maximum number of images to extract.

        Returns:
            list[bytes]: The extracted images as bytes.

        Raises:
            NoImagesError: If the PDF contains no images.
            DocumentParsingError: If images exist but none could be extracted.

        """
        results: list[bytes] = []
        image_count = 0
        try:
            # Open the PDF from bytes
            pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")

            for page_number in range(len(pdf_document)):
                if len(results) >= max_images:
                    break

                page = pdf_document[page_number]
                images = page.get_images(full=True)

                if not images:
                    continue

                for image in images:
                    image_count += 1
                    if len(results) >= max_images:
                        break

                    try:
                        xref = image[0]
                        base_image = pdf_document.extract_image(xref)
                        image_bytes = base_image["image"]
                        results.append(image_bytes)
                        logger.debug(f"Extracted image from page {page_number + 1} of the PDF.")
                    except Exception as e:
                        count = len(results)
                        logger.error(
                            "Failed to extract one image from page %s of PDF. Result count %s: %s",
                            page_number + 1,
                            count,
                            e,
                        )
                        if count < 1:
                            raise

        except Exception as e:
            logger.error(f"extract_images_from_pdf: Error extracting image from PDF: {e}")
            raise DocumentParsingError("Error extracting image from PDF.") from e

        if not results:
            if image_count < 1:
                raise NoImagesError("No images found in the PDF")
            raise DocumentParsingError("Unable to extract images from PDF.")

        return results
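
    # Illustrative example (file name and variable names are placeholders):
    # feeding a PDF read from disk through the extractor.
    #
    #   pdf_bytes = Path("scan.pdf").read_bytes()
    #   first_images = describer.extract_images_from_pdf(pdf_bytes, max_images=2)
    #   # -> list of raw image payloads in whatever encoding was embedded in the PDF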

    def parse_date(self, date_str: str) -> date | None:
        """
        Parse a date string.

        Args:
            date_str (str): The date string to parse.

        Returns:
            date | None: The parsed date, or None if the string carries no usable date.

        """
        if not (parsed_date := self.parse_datetime(date_str)):
            return None
        return parsed_date.date()

    def parse_datetime(self, date_str: str) -> datetime | None:
        """
        Parse a date string into a datetime.

        Args:
            date_str (str): The date string to parse.

        Returns:
            datetime | None: The parsed datetime, or None if the string carries no usable date.

        Raises:
            ValueError: If the string looks like a date but cannot be parsed.

        """
        if not date_str:
            return None

        date_str = str(date_str).strip()

        # "Date unknown" or "Unknown date" or "No date"
        if re.match(r"(date unknown|unknown date|no date|none|unknown|n/?a)$", date_str, re.IGNORECASE):
            return None

        # Handle "circa 1950"
        if matches := re.match(r"((around|circa|mid|early|late|before|after) *)?(\d{4})s?$", date_str, re.IGNORECASE):
            date_str = f"{matches.group(3)}-01-01"

        parsed_date = dateparser.parse(date_str)
        if not parsed_date:
            raise ValueError(f"Invalid date format: {date_str=}")
        return parsed_date
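
    # Worked examples: "circa 1950" and "1950s" are normalized to "1950-01-01"
    # before dateparser runs; "Date unknown", "n/a", "none", etc. return None;
    # anything else dateparser cannot parse raises ValueError.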

    def standardize_image_contents(self, content: bytes) -> list[str]:
        """
        Standardize image contents to base64-encoded PNG format.
        """
        try:
            return [self._convert_to_png(content)]
        except Exception as e:
            logger.debug(f"Failed to convert contents to png, will try other methods: {e}")

        # Interpret it as a pdf
        if image_contents_list := self.extract_images_from_pdf(content):
            return [self._convert_to_png(image) for image in image_contents_list]

        return []
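
    # Fallback chain: the content is first treated as a single image and
    # re-encoded to PNG; if PIL cannot open it, it is treated as a PDF and each
    # extracted image is converted instead. In practice extract_images_from_pdf()
    # either returns at least one image or raises, so the final `return []` is a
    # defensive fallback.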

    def _convert_to_png(self, content: bytes) -> str:
        img = Image.open(BytesIO(content))

        # Resize large images
        if img.size[0] > 1024 or img.size[1] > 1024:
            img.thumbnail((1024, 1024))

        # Re-save it as PNG in-memory
        buf = BytesIO()
        img.save(buf, format="PNG")
        buf.seek(0)

        # Convert to base64
        return base64.b64encode(buf.read()).decode("utf-8")

    def _send_describe_request(self, content: bytes | list[bytes], document: Document) -> str | None:
        """
        Send an image description request to OpenAI.

        Args:
            content: Document content as bytes or list of bytes.
            document: The document to describe.

        Returns:
            str | None: The description generated by OpenAI, or None if no description was produced.

        """
        description: str | None = None
        if not isinstance(content, list):
            content = [content]

        try:
            # Convert all images to standardized format
            images = []
            for image_content in content:
                images.extend(self.standardize_image_contents(image_content))

            if not images:
                raise NoImagesError("No images found to describe.")

            message_contents: List[Dict[str, Any]] = [
                {
                    "type": "text",
                    "text": self.get_prompt(document),
                }
            ]

            for image in images:
                message_contents.append(
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{image}"},
                    }
                )

            response = self.openai.chat.completions.create(
                model=self.openai_model,
                messages=[
                    {"role": "user", "content": message_contents}  # type: ignore
                ],
                max_tokens=500,
            )
            description = response.choices[0].message.content
            logger.debug(f"Generated description: {description}")

        except fitz.FileDataError as fde:
            logger.error(
                "Failed to generate description due to error reading file #%s: %s -> %s",
                document.id,
                document.original_file_name,
                fde,
            )

        except ValueError as ve:
            logger.warning(
                "Failed to generate description for document #%s: %s. Continuing with next image -> %s",
                document.id,
                document.original_file_name,
                ve,
            )

        except UnidentifiedImageError as uii:
            logger.warning(
                "Failed to identify image format for document #%s: %s. Continuing with next image -> %s",
                document.id,
                document.original_file_name,
                uii,
            )

        except openai.APIConnectionError as ace:
            logger.error(
                "API Connection Error. Is the OpenAI API URL correct? URL: %s, model: %s -> %s",
                self.openai_url,
                self.openai_model,
                ace,
            )
            raise

        return description
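
    # Shape of the request built above: a single user message mixing one text
    # part (the rendered prompt) with one image_url part per extracted image,
    # each image inlined as a data URL:
    #
    #   {"role": "user", "content": [
    #       {"type": "text", "text": "<prompt>"},
    #       {"type": "image_url", "image_url": {"url": "data:image/png;base64,<...>"}},
    #   ]}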

    def convert_image_to_jpg(self, bytes_content: bytes) -> bytes:
        """
        Convert an image to JPEG format.

        Args:
            bytes_content (bytes): The image content as bytes.

        Returns:
            bytes: The image content as JPEG.

        """
        try:
            img = Image.open(BytesIO(bytes_content))
            buf = BytesIO()
            img.save(buf, format="JPEG")
            buf.seek(0)
            return buf.read()
        except Exception as e:
            logger.error(f"Failed to convert image to JPEG: {e}")
            raise

    def describe_document(self, document: Document) -> bool:
        """
        Describe a single document using the configured OpenAI model.

        The document object passed in will be updated with the description.

        Args:
            document: The document to describe.

        Returns:
            bool: True if the document was successfully described.

        """
        response = None
        try:
            logger.debug(f"Describing document {document.id} using OpenAI...")

            if not (content := document.content):
                logger.error("Document content is empty for document #%s", document.id)
                return False

            # Ensure accepted format
            original_file_name = (document.original_file_name or "").lower()
            if not any(original_file_name.endswith(ext) for ext in DESCRIBE_ACCEPTED_FORMATS):
                logger.error(f"Document {document.id} has unsupported extension: {original_file_name}")
                return False

            try:
                # Convert content to bytes if it's a string
                content_bytes = content if isinstance(content, bytes) else content.encode("utf-8")
                if not (response := self._send_describe_request(content_bytes, document)):
                    logger.error(f"OpenAI returned empty description for document {document.id}.")
                    return False
            except NoImagesError as nie:
                logger.debug(f"No images found in document {document.id}: {nie}")
                return False
            except DocumentParsingError as dpe:
                logger.error(f"Failed to parse document {document.id}: {dpe}")
                return False
            except openai.BadRequestError as e:
                if "invalid_image_format" not in str(e):
                    logger.error(
                        "Failed to generate description for document #%s: %s -> %s",
                        document.id,
                        document.original_file_name,
                        e,
                    )
                    return False

                logger.debug("Bad format for document #%s: %s -> %s", document.id, document.original_file_name, e)
                return False

            # Process the response
            self.process_response(response, document)
        except requests.RequestException as e:
            logger.error(f"Failed to describe document {document.id}. {response=} => {e}")
            raise

        return True
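
    # Illustrative example (assumes `client` and `describer` from the usage
    # sketch above): describing a single document outside the tag-driven loop.
    #
    #   docs = list(client.documents().filter(tag_name="needs-description"))
    #   if docs and describer.describe_document(docs[0]):
    #       ...  # docs[0] has been updated in place with the AI description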

    def process_response(self, response: str, document: Document) -> Document:
        """
        Process the response from OpenAI and update the document.

        Args:
            response (str): The response from OpenAI.
            document (Document): The document to update.

        Returns:
            Document: The updated document.

        """
        # Attempt to parse response as json
        try:
            if not (parsed_response := json.loads(response)):
                logger.debug("Unable to process response after failed json parsing")
                return document
        except json.JSONDecodeError as jde:
            logger.error("Failed to parse response as JSON: %s", jde)
            return document

        # Check if parsed_response is a dictionary
        if not isinstance(parsed_response, dict):
            logger.error(
                "Parsed response not a dictionary. Saving response raw to document.content. Document #%s: %s",
                document.id,
                document.original_file_name,
            )
            document.append_content(response)
            return document

        # Attempt to grab "title", "description", "tags", "date" from parsed_response
        title = parsed_response.get("title", None)
        description = parsed_response.get("description", None)
        summary = parsed_response.get("summary", None)
        content = parsed_response.get("content", None)
        tags = parsed_response.get("tags", None)
        date = parsed_response.get("date", None)
        full_description = f"""AI IMAGE DESCRIPTION (v{SCRIPT_VERSION}):
        The following description was provided by an Artificial Intelligence (GPT-4o by OpenAI).
        It may not be fully accurate. Its purpose is to provide keywords and context
        so that the document can be more easily searched.
        Suggested Title: {title}
        Inferred Date: {date}
        Suggested Tags: {tags}
        Previous Title: {document.title}
        Previous Date: {document.created}
        """

        if summary:
            full_description += f"\n\nSummary: {summary}"
        if content:
            full_description += f"\n\nContent: {content}"
        if description:
            full_description += f"\n\nDescription: {description}"
        if not any([description, summary, content]):
            full_description += f"\n\nFull AI Response: {parsed_response}"

        if title and ScriptDefaults.NEEDS_TITLE in document.tag_names:
            try:
                document.title = str(title)
                document.remove_tag(ScriptDefaults.NEEDS_TITLE)
            except Exception as e:
                logger.error(
                    "Failed to update document title. Document #%s: %s -> %s",
                    document.id,
                    document.original_file_name,
                    e,
                )

        if date and ScriptDefaults.NEEDS_DATE in document.tag_names:
            try:
                document.created = date  # type: ignore # pydantic will handle casting
                document.remove_tag(ScriptDefaults.NEEDS_DATE)
            except Exception as e:
                logger.error(
                    "Failed to update document date. Document #%s: %s -> %s",
                    document.id,
                    document.original_file_name,
                    e,
                )

        # Append the description to the document
        document.content = full_description
        document.remove_tag(ScriptDefaults.NEEDS_DESCRIPTION)
        document.add_tag(ScriptDefaults.DESCRIBED)

        logger.debug(f"Successfully described document {document.id}")
        return document
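
    # Expected response shape: process_response assumes the model replies with a
    # JSON object along these lines (all keys optional, values illustrative):
    #
    #   {"title": "...", "description": "...", "summary": "...",
    #    "content": "...", "tags": ["..."], "date": "circa 1950"}
    #
    # A JSON value that is not an object is appended to document.content raw;
    # unparseable JSON is logged and the document is returned unchanged.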

    def describe_documents(self, documents: list[Document] | None = None) -> list[Document]:
        """
        Describe a list of documents using the configured OpenAI model.

        Args:
            documents (list[Document]): The documents to describe. Defaults to all
                documents tagged with `self.paperless_tag`.

        Returns:
            list[Document]: The documents that were successfully described.

        """
        logger.info("Fetching documents to describe...")
        if documents is None:
            documents = list(self.client.documents().filter(tag_name=self.paperless_tag))

        total = len(documents)
        logger.info(f"Found {total} documents to describe")

        results = []
        with alive_bar(total=total, title="Describing documents", bar="classic") as self._progress_bar:
            for document in documents:
                if self.describe_document(document):
                    results.append(document)
                self.progress_bar()
        return results


class ArgNamespace(argparse.Namespace):
    """
    A custom namespace class for argparse.
    """

    url: str
    key: str
    model: str | None = None
    openai_url: str | None = None
    tag: str
    prompt: str | None = None
    verbose: bool = False


def main():
    """
    Run the script.
    """
    logger = setup_logging()
    try:
        load_dotenv()

        parser = argparse.ArgumentParser(description="Describe documents with AI in Paperless-ngx")
        parser.add_argument("--url", type=str, default=None, help="The base URL of the Paperless NGX instance")
        parser.add_argument("--key", type=str, default=None, help="The API token for the Paperless NGX instance")
        parser.add_argument("--model", type=str, default=None, help="The OpenAI model to use")
        parser.add_argument("--openai-url", type=str, default=None, help="The base URL for the OpenAI API")
        parser.add_argument("--tag", type=str, default=ScriptDefaults.NEEDS_DESCRIPTION, help="Tag to filter documents")
        parser.add_argument("--prompt", type=str, default=None, help="Prompt to use for OpenAI")
        parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")

        args = parser.parse_args(namespace=ArgNamespace())

        if args.verbose:
            logger.setLevel(logging.DEBUG)

        if not args.url:
            logger.error("PAPERLESS_URL environment variable is not set.")
            sys.exit(1)

        if not args.key:
            logger.error("PAPERLESS_KEY environment variable is not set.")
            sys.exit(1)

        # Exclude None, so pydantic settings loads from defaults for an unset param
        config = {
            k: v
            for k, v in {
                "base_url": args.url,
                "token": args.key,
                "openai_url": args.openai_url,
                "openai_model": args.model,
            }.items()
            if v is not None
        }
        # Cast to Any to avoid type checking issues with **kwargs
        settings = Settings(**cast(Any, config))
        client = PaperlessClient(settings)

        # Pass the --tag filter through to the describer
        paperless = DescribePhotos(client=client, prompt=args.prompt, paperless_tag=args.tag)

        logger.info(f"Starting document description process with model: {paperless.openai_model}")
        results = paperless.describe_documents()

        if results:
            logger.info(f"Successfully described {len(results)} documents")
        else:
            logger.info("No documents described.")

    except KeyboardInterrupt:
        logger.info("Script cancelled by user.")
        sys.exit(0)
    except Exception as e:
        logger.error(f"An error occurred: {e}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    main()
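
# Illustrative CLI invocation (module path inferred from the file location shown
# above; URL and token are placeholders, and the flags match the parser in main()):
#
#   python -m paperap.scripts.describe --url https://paperless.example.com \
#       --key <api-token> --tag needs-description --verbose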