mrblack
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# File: __init__.py
# Author: Wadih Khairallah
# Description:
# Created: 2025-05-12 16:47:22
# Modified: 2025-05-15 16:30:26

from .pii import (
    extract_pii_text,
    extract_pii_file,
    extract_pii_url,
    extract_pii_image,
    extract_pii_screenshot
)
from .textextract import (
    extract_text,
    extract_text_with_password,
    extract_exif,
    extract_metadata,
    text_from_screenshot,
    text_from_url,
    text_from_html,
    text_from_audio,
    text_from_pdf,
    text_from_doc,
    text_from_docx,
    text_from_excel,
    text_from_image,
    text_from_any,
    text_from_odt,
    text_from_pptx,
    text_from_epub,
    analyze_text,
    summarize_text,
    translate_text,
    list_available_languages,
    detect_language,
    scrape_website,
    normalize_text,
)

__all__ = [
    "extract_pii_text",
    "extract_pii_file",
    "extract_pii_url",
    "extract_pii_image",
    "extract_pii_screenshot",
    "extract_text_with_password",
    "extract_text",
    "extract_exif",
    "extract_metadata",
    "text_from_screenshot",
    "text_from_url",
    "text_from_html",
    "text_from_audio",
    "text_from_pdf",
    "text_from_doc",
    "text_from_docx",
    "text_from_excel",
    "text_from_image",
    "text_from_any",
    "text_from_odt",
    "text_from_pptx",
    "text_from_epub",
    "analyze_text",
    "summarize_text",
    "translate_text",
    "list_available_languages",
    "detect_language",
    "scrape_website",
    "normalize_text"
]
```
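Since `__init__.py` re-exports everything from the `pii` and `textextract` submodules, callers can import directly from the package root. A minimal sketch (the input file name is a placeholder):

```python
from mrblack import extract_text, extract_pii_text

# Pull the text out of any supported source, then scan it for PII.
text = extract_text("report.pdf")  # hypothetical input file
if text:
    print(extract_pii_text(text))
```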
```python
def extract_pii_text(
    text: str,
    labels: Optional[Union[List[str], str]] = None
) -> Dict[str, List[str]]:
    """
    Extract PII matches from provided text.

    Args:
        text (str): The input text to scan for patterns.
        labels (Optional[Union[List[str], str]]): Specific labels to filter on.

    Returns:
        Dict[str, List[str]]: Mapping of each label to a sorted list of
        matched and cleaned strings.
    """
    if isinstance(labels, str):
        labels = [labels]
    patterns = PATTERNS
    if labels:
        patterns = [
            p for p in PATTERNS
            if any(re.search(rf"\(\?P<{lbl}>", p) for lbl in labels)
        ]
    results: Dict[str, set] = defaultdict(set)
    for pattern in patterns:
        try:
            rx = re.compile(pattern)
            for m in rx.finditer(text):
                for lbl, val in m.groupdict().items():
                    if not val:
                        continue
                    cleaned = _clean_value(lbl, val)
                    # Guard against _clean_value returning None before any post-processing.
                    if cleaned is None:
                        continue
                    if lbl == "url":
                        cleaned = cleaned.rstrip("),.**")
                    results[lbl].add(cleaned)
        except re.error as e:
            print(
                f"Invalid regex skipped: {pattern} → {e}",
                file=sys.stderr
            )
    return {lbl: sorted(vals) for lbl, vals in results.items()}
```
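A short usage sketch for `extract_pii_text`. The label names shown (`email`, `phone_number`) are illustrative; the actual labels depend on the named capture groups defined in `PATTERNS`:

```python
from mrblack import extract_pii_text

sample = "Contact Jane at jane.doe@example.com or +1 555-0100."

# Scan with the full pattern set...
all_matches = extract_pii_text(sample)

# ...or restrict the scan to specific capture-group labels.
emails_only = extract_pii_text(sample, labels="email")

print(all_matches)
print(emails_only)
```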
```python
def extract_pii_file(
    file_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from a single file's text content.

    Args:
        file_path (str): Path to the file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    text = extract_text(file_path)
    if not text:
        return None
    data = extract_pii_text(text, labels)
    return data or None
```
```python
def extract_pii_url(
    path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from the text at a URL.

    Args:
        path (str): The URL to fetch.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    text = text_from_url(path)
    if not text:
        return None
    data = extract_pii_text(text, labels)
    return data or None
```
```python
def extract_pii_image(
    image_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from an image using OCR.

    Args:
        image_path (str): Path to the image file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    path = clean_path(image_path)
    if not path or not os.path.isfile(path):
        print(f"[red]Invalid image path:[/] {image_path}")
        return None
    text = extract_text(path)
    if not text:
        return None
    data = extract_pii_text(text, labels)
    return data or None
```
```python
def extract_pii_screenshot(
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Capture a screenshot and extract PII from its OCR text.

    Args:
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    text = text_from_screenshot()
    if not text:
        return None
    data = extract_pii_text(text, labels)
    return data or None
```
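The remaining PII helpers are thin wrappers that pair a text source with `extract_pii_text`. A hedged sketch of combining them (the file name and URL are placeholders):

```python
from mrblack import extract_pii_file, extract_pii_url, extract_pii_screenshot

findings = {}
findings["invoice"] = extract_pii_file("invoice.docx")         # any file extract_text supports
findings["homepage"] = extract_pii_url("https://example.com")  # fetched via text_from_url
findings["screen"] = extract_pii_screenshot()                   # OCR of the current display

# Each entry is either a {label: [values]} dict or None when nothing was found.
for source, data in findings.items():
    print(source, data or "no PII detected")
```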
```python
def extract_text_with_password(file_path: str, password: str) -> Optional[str]:
    """
    Extract text from password-protected files.

    Args:
        file_path (str): Path to the file.
        password (str): Password to unlock the file.

    Returns:
        Optional[str]: Extracted text.
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        return text_from_pdf_protected(file_path, password)
    elif file_ext in ['.docx', '.xlsx', '.pptx']:
        return text_from_office_protected(file_path, password)
    else:
        logger.warning(f"Password protection not supported for {file_ext} files")
        return None
```
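A minimal sketch of calling `extract_text_with_password`; only PDFs and modern Office formats are routed to the protected-file handlers, and the file name and password below are placeholders:

```python
from mrblack import extract_text_with_password

text = extract_text_with_password("statement.pdf", password="s3cret")
if text is None:
    print("Unsupported format or unable to unlock the file")
else:
    print(text[:500])
```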
```python
def extract_text(
    file_path: str
) -> Optional[str]:
    """
    Extract text content from a local file or URL.

    Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

    Args:
        file_path (str): Path to the input file or URL.

    Returns:
        Optional[str]: Extracted text, or None if unsupported or error.
    """
    if is_url(file_path):
        return text_from_url(file_path)

    TEXT_MIME_TYPES = {
        "application/json", "application/xml", "application/x-yaml",
        "application/x-toml", "application/x-csv", "application/x-markdown",
    }

    path = clean_path(file_path)
    if not path:
        logger.error(f"No such file: {file_path}")
        return None

    mime_type = magic.from_file(path, mime=True)
    try:
        if mime_type.startswith("text/html"):
            return text_from_html(path)

        elif mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            return normalize_text(content)

        elif mime_type in [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]:
            return text_from_excel(path)

        elif mime_type == \
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            return text_from_docx(path)

        elif mime_type == "application/msword":
            return text_from_doc(path)

        elif mime_type == "application/pdf":
            return text_from_pdf(path)

        elif mime_type.startswith("image/"):
            return text_from_image(path)

        elif mime_type.startswith("audio/"):
            return text_from_audio(path)

        elif mime_type == "application/epub+zip":
            return text_from_epub(path)

        elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
            return text_from_pptx(path)

        elif mime_type == "application/vnd.oasis.opendocument.text":
            return text_from_odt(path)

        else:
            return text_from_any(path)
    except Exception as e:
        logger.error(f"Error reading {path}: {e}")
        return None
```
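Because dispatch is driven by `python-magic` MIME detection rather than file extension, the same call works for local files and URLs alike. A small sketch with placeholder inputs:

```python
from mrblack import extract_text

for source in ("notes.txt", "slides.pptx", "https://example.com/article"):
    content = extract_text(source)
    print(source, "->", (content or "")[:80])
```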
```python
def extract_exif(
    file_path: str
) -> Optional[Dict[str, Any]]:
    """
    Extract EXIF metadata from a file using exiftool.

    Args:
        file_path (str): Path to the target file.

    Returns:
        Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
    """
    exif_data: Optional[Dict[str, Any]] = None
    try:
        result = subprocess.run(
            ['exiftool', '-j', file_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if result.returncode == 0:
            exif_data = json.loads(result.stdout.decode())[0]
    except Exception as e:
        logger.error(f"Exiftool failed: {e}")
    return exif_data
```
```python
def extract_metadata(
    file_path: str
) -> Dict[str, Any]:
    """
    Extract comprehensive metadata from any file type.

    Args:
        file_path (str): Path to target file.

    Returns:
        Dict[str, Any]: Nested metadata structure.
    """
    path = clean_path(file_path)
    if not path:
        return {"error": "File not found"}

    meta: Dict[str, Any] = {}
    try:
        stats = os.stat(path)
        meta["size_bytes"] = stats.st_size
        meta["created"] = datetime.fromtimestamp(stats.st_ctime).isoformat()
        meta["modified"] = datetime.fromtimestamp(stats.st_mtime).isoformat()
        meta["mime"] = magic.from_file(path, mime=True)

        # Calculate multiple hash types
        with open(path, 'rb') as f:
            content = f.read()
            meta["hashes"] = {
                "md5": hashlib.md5(content).hexdigest(),
                "sha1": hashlib.sha1(content).hexdigest(),
                "sha256": hashlib.sha256(content).hexdigest()
            }

        # Get extended file attributes where supported
        if hasattr(os, 'listxattr'):
            try:
                xattrs = os.listxattr(path)
                if xattrs:
                    meta["xattrs"] = {}
                    for attr in xattrs:
                        meta["xattrs"][attr] = os.getxattr(path, attr)
            except (OSError, AttributeError):
                pass

        # Get EXIF data if available and relevant
        exif = extract_exif(path)
        if exif:
            meta["exif"] = exif

        # Get file owner and permissions
        import pwd
        try:
            meta["owner"] = pwd.getpwuid(stats.st_uid).pw_name
        except KeyError:
            meta["owner"] = str(stats.st_uid)
        meta["permissions"] = oct(stats.st_mode)[-3:]

    except Exception as e:
        meta["error"] = str(e)

    return meta
```
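A usage sketch for `extract_metadata`. Hashing reads the whole file into memory, so very large inputs may be slow; the path below is a placeholder:

```python
from mrblack import extract_metadata

meta = extract_metadata("archive.zip")
if "error" in meta:
    print("metadata failed:", meta["error"])
else:
    print(meta["mime"], meta["size_bytes"], "bytes")
    print("sha256:", meta["hashes"]["sha256"])
```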
```python
def text_from_screenshot() -> str:
    """
    Capture a full-screen screenshot, perform OCR, and clean up temp file.

    Returns:
        str: Normalized OCR-extracted text from the screenshot.
    """
    tmp_filename = f"screenshot_{uuid4().hex}.png"
    tmp_path = os.path.join(tempfile.gettempdir(), tmp_filename)

    try:
        with mss() as sct:
            monitor = {"top": 0, "left": 0, "width": 0, "height": 0}
            for mon in sct.monitors:
                monitor["left"] = min(mon["left"], monitor["left"])
                monitor["top"] = min(mon["top"], monitor["top"])
                monitor["width"] = max(mon["width"] + mon["left"] - monitor["left"], monitor["width"])
                monitor["height"] = max(mon["height"] + mon["top"] - monitor["top"], monitor["height"])
            screenshot = sct.grab(monitor)

        img = Image.frombytes("RGB", screenshot.size, screenshot.bgra, "raw", "BGRX")
        img_gray = img.convert("L")
        img_gray.save(tmp_path)

        content = text_from_image(tmp_path)
        return normalize_text(content)
    finally:
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except Exception as e:
                logger.error(f"Failed to delete temp screenshot: {e}")
```
```python
def text_from_url(
    url: str,
    render_js: bool = True
) -> Optional[str]:
    """
    Fetch and extract all visible text from a web page, including JS-rendered content.

    Args:
        url (str): Target webpage URL.
        render_js (bool): Whether to render JavaScript content.

    Returns:
        Optional[str]: Cleaned full-page text, or None on failure.
    """
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": url,
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1"
    }

    # Try with requests-html first (with JS rendering)
    if render_js:
        try:
            session = HTMLSession()
            try:
                r = session.get(url, headers=headers, timeout=5)

                # Set shorter timeout for rendering to avoid hanging
                try:
                    r.html.render(timeout=5, sleep=1, keep_page=True)
                except Exception as e:
                    logger.warning(f"JS rendering failed, falling back to static HTML: {e}")

                html = r.html.html
                session.close()
                content = text_from_html(html)
                return content
            except Exception as e:
                logger.error(f"[Error with HTMLSession] {url} - {e}")
                session.close()
                # Fall through to regular requests
            finally:
                session.close()
        except Exception as e:
            logger.error(f"[Error creating HTMLSession] {e}")
            # Fall through to regular requests

    # Fall back to regular requests (no JS rendering)
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html = response.text
        content = text_from_html(html)
        return content
    except Exception as e:
        logger.error(f"[Error with requests] {url} - {e}")
        return None
```
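`text_from_url` tries requests-html with JavaScript rendering first and degrades to a plain `requests` fetch on failure. Passing `render_js=False` skips the headless render entirely, which is faster for static pages. A sketch with placeholder URLs:

```python
from mrblack import text_from_url

# Full fetch, including JS-rendered content (needs requests-html and its Chromium backend).
dynamic = text_from_url("https://example.com/app")

# Static-only fetch: no browser rendering, just the raw HTML response.
static = text_from_url("https://example.com/about", render_js=False)

print(len(dynamic or ""), len(static or ""))
```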
```python
def text_from_html(html: str) -> str:
    """
    Extract readable text from raw HTML content.

    Args:
        html (str): HTML source as a string, or a path to an HTML file.

    Returns:
        str: Cleaned and normalized visible text.
    """
    # Check if the input is a file path or HTML content
    if os.path.isfile(html):
        with open(html, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()

    soup = BeautifulSoup(html, "html.parser")

    # Remove non-visible or structural elements
    for tag in soup([
        "script", "style",
        "noscript", "iframe",
        "meta", "link",
        "header", "footer",
        "form", "nav",
        "aside"
    ]):
        tag.decompose()

    text = soup.get_text(separator=" ")

    return normalize_text(text)
```
```python
def text_from_audio(
    audio_file: str
) -> Optional[str]:
    """
    Transcribe audio to text using Google Speech Recognition.

    Args:
        audio_file (str): Path to the input audio file.

    Returns:
        Optional[str]: Transcription, or None on failure.
    """
    def convert_to_wav(file_path: str) -> str:
        _, ext = os.path.splitext(file_path)
        ext = ext.lstrip('.')
        audio = AudioSegment.from_file(file_path, format=ext)
        tmp_filename = f"audio_{uuid4().hex}.wav"
        wav_path = os.path.join(tempfile.gettempdir(), tmp_filename)
        audio.export(wav_path, format='wav')
        return wav_path

    recognizer = sr.Recognizer()
    temp_wav_path = None
    cleanup_needed = False

    try:
        _, ext = os.path.splitext(audio_file)
        if ext.lower() not in ['.wav', '.wave']:
            temp_wav_path = convert_to_wav(audio_file)
            cleanup_needed = True
        else:
            temp_wav_path = clean_path(audio_file)

        if not temp_wav_path:
            logger.error("Invalid audio path.")
            return None

        with sr.AudioFile(temp_wav_path) as source:
            audio = recognizer.record(source)
            return recognizer.recognize_google(audio)

    except sr.UnknownValueError:
        logger.error("Could not understand audio.")
    except sr.RequestError as e:
        logger.error(f"Speech recognition error: {e}")
    except Exception as e:
        logger.error(f"Failed to process audio: {e}")
    finally:
        if cleanup_needed and temp_wav_path and os.path.exists(temp_wav_path):
            try:
                os.remove(temp_wav_path)
            except Exception as e:
                logger.error(f"Failed to delete temp WAV file {temp_wav_path}: {e}")

    return None
```
```python
def text_from_pdf(
    pdf_path: str
) -> Optional[str]:
    """
    Extract text and OCR results from a PDF using PyMuPDF.

    Args:
        pdf_path (str): Path to PDF file.

    Returns:
        Optional[str]: Combined normalized text and image OCR results.
    """
    plain_text = ""
    temp_image_paths: List[str] = []

    try:
        doc = pymupdf.open(pdf_path)
        for k, v in doc.metadata.items():
            plain_text += f"{k}: {v}\n"

        for i in range(len(doc)):
            page = doc.load_page(i)
            plain_text += f"\n--- Page {i + 1} ---\n"
            text = page.get_text()
            plain_text += text or "[No text]\n"

            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base = doc.extract_image(xref)
                img_bytes = base["image"]

                img_filename = f"pdf_page{i+1}_img{img_index}_{uuid4().hex}.png"
                img_path = os.path.join(tempfile.gettempdir(), img_filename)
                temp_image_paths.append(img_path)

                with open(img_path, "wb") as f:
                    f.write(img_bytes)

                ocr = text_from_image(img_path) or ""
                plain_text += f"\n[Image {img_index} OCR]\n{ocr}\n"

        # Extract tables from PDF (currently disabled)
        """
        try:
            tables = extract_tables_from_pdf(pdf_path)
            if tables:
                plain_text += "\n--- Tables ---\n"
                for i, table in enumerate(tables, 1):
                    plain_text += f"\n[Table {i}]\n"
                    if isinstance(table, dict) and "data" in table:
                        for row in table["data"]:
                            plain_text += str(row) + "\n"
                    else:
                        plain_text += str(table) + "\n"
        except Exception as e:
            logger.warning(f"Could not extract tables from PDF: {e}")
        """

        return normalize_text(plain_text)
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        return None
    finally:
        for path in temp_image_paths:
            if os.path.exists(path):
                try:
                    os.remove(path)
                except Exception as e:
                    logger.error(f"Failed to delete temp image {path}: {e}")
        if 'doc' in locals():
            doc.close()
```
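A short sketch for `text_from_pdf`. The output interleaves document metadata, per-page text, and OCR of embedded images, so the `--- Page N ---` markers can be used to split it; the path is a placeholder:

```python
from mrblack import text_from_pdf

pdf_text = text_from_pdf("scanned-contract.pdf")
if pdf_text:
    pages = pdf_text.split("--- Page ")
    print(f"{len(pages) - 1} page sections extracted")
```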
```python
def text_from_doc(
    filepath: str,
    min_length: int = 4
) -> str:
    """
    Extract readable strings and metadata from binary Word (.doc) files.

    Args:
        filepath (str): Path to .doc file.
        min_length (int): Minimum string length to extract.

    Returns:
        str: Metadata and text content.
    """
    def extract_printable_strings(
        data: bytes
    ) -> List[str]:
        pattern = re.compile(
            b'[' + re.escape(bytes(string.printable, 'ascii')) +
            b']{%d,}' % min_length
        )
        found = pattern.findall(data)
        return list(dict.fromkeys(m.decode(errors='ignore').strip()
                                  for m in found))

    def clean_strings(
        strs: List[str]
    ) -> List[str]:
        cleaned: List[str] = []
        skip = ["HYPERLINK", "OLE2", "Normal.dotm"]
        for line in strs:
            if any(line.startswith(pref) for pref in skip):
                continue
            cleaned.append(re.sub(r'\s+', ' ', line).strip())
        return cleaned

    with open(filepath, 'rb') as f:
        data = f.read()
    strings = extract_printable_strings(data)
    strings = clean_strings(strings)
    content = "\n".join(strings)
    return normalize_text(content)
```
```python
def text_from_docx(
    file_path: str
) -> Optional[str]:
    """
    Extract text, tables, and OCR from embedded images in a DOCX file.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        Optional[str]: Normalized full text content.
    """
    path = clean_path(file_path)
    if not path:
        return None

    temp_image_paths: List[str] = []
    plain_text = ""

    try:
        doc = Document(path)

        for p in doc.paragraphs:
            if p.text.strip():
                plain_text += p.text.strip() + "\n"

        for tbl in doc.tables:
            plain_text += "\n[Table]\n"
            for row in tbl.rows:
                row_text = "\t".join(c.text.strip() for c in row.cells)
                plain_text += row_text + "\n"

        for rel_id, rel in doc.part.rels.items():
            if "image" in rel.target_ref:
                blob = rel.target_part.blob

                img_filename = f"docx_img_{rel_id}_{uuid4().hex}.png"
                img_path = os.path.join(tempfile.gettempdir(), img_filename)
                temp_image_paths.append(img_path)

                with open(img_path, "wb") as img_file:
                    img_file.write(blob)

                ocr = text_from_image(img_path) or ""
                plain_text += f"\n[Image OCR]\n{ocr}\n"

        return normalize_text(plain_text)

    except Exception as e:
        logger.error(f"Error processing DOCX: {e}")
        return None
    finally:
        for path in temp_image_paths:
            if os.path.exists(path):
                try:
                    os.remove(path)
                except Exception as e:
                    logger.error(f"Failed to delete temp DOCX image {path}: {e}")
```
```python
def text_from_excel(
    file_path: str
) -> str:
    """
    Convert an Excel workbook to CSV text.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        str: CSV-formatted string.
    """
    path = clean_path(file_path)
    if not path:
        return ""
    try:
        # Get all sheets
        result = ""
        excel_file = pd.ExcelFile(path)
        for sheet_name in excel_file.sheet_names:
            df = pd.read_excel(path, sheet_name=sheet_name)
            out = StringIO()
            df.to_csv(out, index=False)
            result += f"\n--- Sheet: {sheet_name} ---\n"
            result += out.getvalue()
            result += "\n"
        return result
    except Exception as e:
        logger.error(f"Failed Excel -> CSV: {e}")
        return ""
```
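Each sheet is rendered as its own CSV block behind a `--- Sheet: name ---` header, so the result can be re-parsed per sheet. A sketch with a placeholder workbook name:

```python
from mrblack import text_from_excel

csv_text = text_from_excel("budget.xlsx")
for block in csv_text.split("--- Sheet: "):
    if block.strip():
        sheet_name, _, rows = block.partition(" ---\n")
        print(sheet_name, "->", len(rows.splitlines()), "lines of CSV")
```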
```python
def text_from_image(
    file_path: str
) -> Optional[str]:
    """
    Perform OCR on an image file.

    Args:
        file_path (str): Path to the image.

    Returns:
        Optional[str]: Extracted text, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        with Image.open(path) as img:
            # Improve OCR with preprocessing
            # 1. Convert to grayscale if it's not already
            if img.mode != 'L':
                img = img.convert('L')

            # 2. Optional: Apply some contrast enhancement
            # (Disabled by default, enable if needed for specific cases)
            # from PIL import ImageEnhance
            # enhancer = ImageEnhance.Contrast(img)
            # img = enhancer.enhance(1.5)  # Increase contrast

            # Perform OCR with custom configuration
            custom_config = r'--oem 3 --psm 6'  # Default OCR Engine Mode and Page Segmentation Mode
            txt = pytesseract.image_to_string(img, config=custom_config).strip()
            return normalize_text(txt) or ""
    except Exception as e:
        logger.error(f"Failed image OCR: {e}")
        return None
```
```python
def text_from_any(
    file_path: str
) -> Optional[str]:
    """
    Handle unknown file types by reporting stats and metadata.

    Args:
        file_path (str): Path to the file.

    Returns:
        Optional[str]: Plain-text report, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        stats = os.stat(path)
        info = {
            "path": path,
            "size": stats.st_size,
            "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
            "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
        }

        # Try to extract EXIF if available
        exif = extract_exif(path)
        if exif:
            info["exif"] = exif

        # Get file hash
        md5_hash = hashlib.md5(open(path, 'rb').read()).hexdigest()
        info["md5"] = md5_hash

        content = "\n".join(f"{k}: {v}" for k, v in info.items() if k != "exif")

        # Add formatted EXIF data if available
        if exif:
            content += "\n\nEXIF Data:\n"
            for k, v in exif.items():
                if isinstance(v, dict):
                    content += f"\n{k}:\n"
                    for sub_k, sub_v in v.items():
                        content += f"  {sub_k}: {sub_v}\n"
                else:
                    content += f"{k}: {v}\n"

        return normalize_text(content)
    except Exception as e:
        logger.error(f"Error on other file: {e}")
        return None
```
```python
def text_from_odt(odt_path: str) -> Optional[str]:
    """
    Extract text from OpenDocument Text files.

    Args:
        odt_path (str): Path to the ODT file.

    Returns:
        Optional[str]: Extracted text.
    """
    try:
        from odf import text, teletype
        from odf.opendocument import load

        textdoc = load(odt_path)

        # Extract metadata
        meta = []
        meta_elem = textdoc.meta
        if meta_elem:
            for prop in meta_elem.childNodes:
                if hasattr(prop, 'tagName') and hasattr(prop, 'childNodes') and prop.childNodes:
                    meta.append(f"{prop.tagName}: {teletype.extractText(prop)}")

        # Extract content
        allparas = textdoc.getElementsByType(text.P)
        content = "\n".join(teletype.extractText(p) for p in allparas)

        # Combine metadata and content
        if meta:
            final_text = "\n".join(meta) + "\n---\n" + content
        else:
            final_text = content

        return normalize_text(final_text)
    except ImportError:
        logger.error("odfpy not installed")
        return "odfpy package is required for ODT processing"
    except Exception as e:
        logger.error(f"Error processing ODT: {e}")
        return None
```
```python
def text_from_pptx(pptx_path: str) -> Optional[str]:
    """
    Extract text from PowerPoint presentations.

    Args:
        pptx_path (str): Path to the PowerPoint file.

    Returns:
        Optional[str]: Extracted text.
    """
    try:
        from pptx import Presentation

        prs = Presentation(pptx_path)
        text = ["--- PowerPoint Presentation ---"]

        for i, slide in enumerate(prs.slides, 1):
            slide_text = [f"Slide {i}:"]

            # Get slide title if it exists
            if slide.shapes.title and slide.shapes.title.text:
                slide_text.append(f"Title: {slide.shapes.title.text}")

            # Extract text from all shapes
            shape_text = []
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text:
                    shape_text.append(shape.text)

            if shape_text:
                slide_text.append("\n".join(shape_text))

            text.append("\n".join(slide_text))

        return normalize_text("\n\n".join(text))
    except ImportError:
        logger.error("python-pptx not installed")
        return "python-pptx package is required for PowerPoint processing"
    except Exception as e:
        logger.error(f"Error processing PowerPoint: {e}")
        return None
```
```python
def text_from_epub(epub_path: str) -> Optional[str]:
    """
    Extract text from EPUB ebooks.

    Args:
        epub_path (str): Path to the EPUB file.

    Returns:
        Optional[str]: Extracted text.
    """
    try:
        import ebooklib
        from ebooklib import epub
        import html2text

        book = epub.read_epub(epub_path)
        h = html2text.HTML2Text()
        h.ignore_links = False

        content = []

        # Get book metadata
        metadata = []
        if book.get_metadata('DC', 'title'):
            metadata.append(f"Title: {book.get_metadata('DC', 'title')[0][0]}")
        if book.get_metadata('DC', 'creator'):
            metadata.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}")
        if book.get_metadata('DC', 'description'):
            metadata.append(f"Description: {book.get_metadata('DC', 'description')[0][0]}")

        if metadata:
            content.append("\n".join(metadata))
            content.append("---")

        # Get book content (ITEM_DOCUMENT is defined on the ebooklib package, not the epub module)
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                content.append(h.handle(item.get_content().decode('utf-8')))

        return normalize_text("\n".join(content))
    except ImportError:
        logger.error("ebooklib and/or html2text not installed")
        return "ebooklib and/or html2text packages are required for EPUB processing"
    except Exception as e:
        logger.error(f"Error processing EPUB: {e}")
        return None
```
```python
def analyze_text(text: str) -> Dict[str, Any]:
    """
    Perform basic text analytics.

    Args:
        text (str): Input text.

    Returns:
        Dict[str, Any]: Analysis results.
    """
    try:
        # Tokenize text
        words = nltk.word_tokenize(text.lower())
        sentences = nltk.sent_tokenize(text)

        # Filter out punctuation
        words = [word for word in words if word.isalpha()]

        # Count word frequencies
        word_freq = Counter(words)

        # Calculate readability metrics
        avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
        avg_sent_length = len(words) / len(sentences) if sentences else 0

        # Detect language
        language = detect_language(text)

        return {
            "word_count": len(words),
            "sentence_count": len(sentences),
            "unique_words": len(set(words)),
            "avg_word_length": avg_word_length,
            "avg_sentence_length": avg_sent_length,
            "language": language,
            "most_common_words": word_freq.most_common(20)
        }
    except Exception as e:
        logger.error(f"Text analysis error: {e}")
        return {"error": str(e)}
```
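A usage sketch for `analyze_text`; NLTK's `punkt` tokenizer data must already be available for the word and sentence tokenizers to run:

```python
from mrblack import analyze_text

sample = (
    "mrblack extracts text from many formats. "
    "It can also analyze that text. "
    "This example keeps the input short."
)
report = analyze_text(sample)
print(report["word_count"], "words in", report["sentence_count"], "sentences")
print("language:", report["language"])
print("top words:", report["most_common_words"][:5])
```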
```python
def summarize_text(text: str, sentences: int = 5) -> str:
    """
    Create a simple extractive summary from the text.

    Args:
        text (str): Input text to summarize.
        sentences (int): Number of sentences to include.

    Returns:
        str: Summarized text.
    """
    try:
        from nltk.corpus import stopwords
        from nltk.tokenize import sent_tokenize

        # Download required NLTK data if not already present
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', quiet=True)
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords', quiet=True)

        # Tokenize and calculate word frequencies
        stop_words = set(stopwords.words('english'))
        sentences_list = sent_tokenize(text)

        # If there are fewer sentences than requested, return all
        if len(sentences_list) <= sentences:
            return text

        word_frequencies = {}
        for sentence in sentences_list:
            for word in nltk.word_tokenize(sentence):
                word = word.lower()
                if word not in stop_words:
                    if word not in word_frequencies:
                        word_frequencies[word] = 1
                    else:
                        word_frequencies[word] += 1

        # Normalize frequencies
        maximum_frequency = max(word_frequencies.values()) if word_frequencies else 1
        for word in word_frequencies:
            word_frequencies[word] = word_frequencies[word] / maximum_frequency

        # Score sentences
        sentence_scores = {}
        for i, sentence in enumerate(sentences_list):
            for word in nltk.word_tokenize(sentence.lower()):
                if word in word_frequencies:
                    if i not in sentence_scores:
                        sentence_scores[i] = word_frequencies[word]
                    else:
                        sentence_scores[i] += word_frequencies[word]

        # Get top N sentences
        summary_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:sentences]
        summary_sentences = [sentences_list[i] for i, _ in sorted(summary_sentences)]

        return ' '.join(summary_sentences)
    except Exception as e:
        logger.error(f"Summarization error: {e}")
        return text
```
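A quick sketch of `summarize_text`. Because the score is a plain frequency sum, longer sentences tend to rank higher, which is the usual trade-off of this kind of extractive approach; the source document name is a placeholder:

```python
from mrblack import extract_text, summarize_text

long_text = extract_text("whitepaper.pdf") or ""  # placeholder source document
print(summarize_text(long_text, sentences=3))
```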
```python
def translate_text(text: str, target_lang: str = "en") -> Optional[str]:
    """
    Translate text to a target language.

    Args:
        text (str): Input text to translate.
        target_lang (str): Target language code (e.g., 'en', 'es', 'fr', 'ja').

    Returns:
        Optional[str]: Translated text, or None on failure.
    """
    try:
        # googletrans 4.0.0-rc1 exposes async methods that would need to be awaited,
        # so use the more stable deep-translator library instead.
        from deep_translator import GoogleTranslator

        # Handle long texts by splitting into chunks (Google Translate has a limit around 5000 chars)
        max_chunk_size = 4500
        chunks = []

        # Split text into chunks of appropriate size (at sentence boundaries if possible)
        text_remaining = text
        while len(text_remaining) > 0:
            if len(text_remaining) <= max_chunk_size:
                chunks.append(text_remaining)
                break

            # Try to find a sentence boundary near the max chunk size
            chunk_end = max_chunk_size
            while chunk_end > 0 and text_remaining[chunk_end] not in ['.', '!', '?', '\n']:
                chunk_end -= 1

            # If no good sentence boundary found, just use max size
            if chunk_end == 0:
                chunk_end = max_chunk_size
            else:
                chunk_end += 1  # Include the period or boundary character

            chunks.append(text_remaining[:chunk_end])
            text_remaining = text_remaining[chunk_end:]

        # Translate each chunk and combine
        translated_chunks = []
        for chunk in chunks:
            translated_chunk = GoogleTranslator(source='auto', target=target_lang).translate(chunk)
            translated_chunks.append(translated_chunk)

        return ' '.join(translated_chunks)
    except Exception as e:
        logger.error(f"Translation error: {e}")
        return None
```
```python
def list_available_languages() -> Dict[str, str]:
    """
    Get a dictionary of available languages for translation.

    Returns:
        Dict[str, str]: Dictionary mapping language codes to language names.
    """
    try:
        from deep_translator import GoogleTranslator
        # Get available languages from the translator
        languages = GoogleTranslator().get_supported_languages(as_dict=True)
        return languages
    except Exception as e:
        logger.error(f"Error getting language list: {e}")
        # Return a small subset as fallback
        return {
            "en": "English",
            "es": "Spanish",
            "fr": "French",
            "de": "German",
            "it": "Italian",
            "ja": "Japanese",
            "ko": "Korean",
            "zh-cn": "Chinese (Simplified)",
            "ru": "Russian",
            "ar": "Arabic"
        }
```
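`translate_text` and `list_available_languages` both go through `deep-translator`'s `GoogleTranslator`. A sketch that checks the target code against the returned mapping before translating (checking both keys and values hedges against either orientation of the code/name mapping):

```python
from mrblack import translate_text, list_available_languages

langs = list_available_languages()
target = "es"
if target in langs or target in langs.values():
    print(translate_text("Extraction finished without errors.", target_lang=target))
```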
```python
def detect_language(text: str) -> str:
    """
    Detect the language of the extracted text.

    Args:
        text (str): Input text.

    Returns:
        str: Detected language code, or 'unknown'.
    """
    try:
        import langdetect
        return langdetect.detect(text)
    except Exception:
        logger.warning("Language detection failed or langdetect not installed")
        return "unknown"
```
```python
def scrape_website(url: str, max_pages: int = 1, stay_on_domain: bool = True) -> Dict[str, str]:
    """
    Scrape multiple pages of a website.

    Args:
        url (str): Starting URL.
        max_pages (int): Maximum pages to scrape.
        stay_on_domain (bool): Whether to stay on the same domain.

    Returns:
        Dict[str, str]: Dictionary mapping URLs to extracted text.
    """
    results = {}
    visited = set()
    to_visit = [url]
    base_domain = urlparse(url).netloc

    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue

        # Extract text from current page
        text = text_from_url(current_url)
        if text:
            results[current_url] = text

        visited.add(current_url)

        # Find links on the page
        session = HTMLSession()
        try:
            r = session.get(current_url)
            r.html.render(timeout=20, sleep=1)

            links = r.html.absolute_links
            for link in links:
                link_domain = urlparse(link).netloc
                if link not in visited and link not in to_visit:
                    # Check if we should follow this link
                    if stay_on_domain and link_domain != base_domain:
                        continue
                    to_visit.append(link)
        except Exception as e:
            logger.error(f"Error scraping {current_url}: {e}")
        finally:
            session.close()

    return results
```
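A sketch of a small bounded crawl with `scrape_website`. Each page is rendered with a 20-second JavaScript timeout, so even a handful of pages can take a while; the URL is a placeholder:

```python
from mrblack import scrape_website

pages = scrape_website("https://example.com", max_pages=5, stay_on_domain=True)
for page_url, page_text in pages.items():
    print(page_url, len(page_text), "chars")
```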
```python
def normalize_text(
    text: str
) -> str:
    """
    Collapse runs of spaces, tabs, newlines, and carriage returns into single
    separators and strip leading spaces from each line, producing compact,
    consistently spaced text.

    Args:
        text (str): Raw input text.

    Returns:
        str: Normalized text.
    """
    if not text:
        return ""
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(?m)(^ \n)+', '\n', text)
    text = re.sub(r'\t+', '\t', text)
    text = re.sub(r'\r+', '\n', text)
    text = re.sub(r"^ ", "", text, flags=re.MULTILINE)
    return text
```
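A tiny demonstration of the normalization that the extractors apply before returning text:

```python
from mrblack import normalize_text

raw = "Header\n\n\n   body   text\t\twith\ttabs"
print(repr(normalize_text(raw)))
# Blank lines, repeated spaces, and repeated tabs collapse to single separators,
# and leading spaces on each line are stripped.
```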