mrblack
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# File: __init__.py
# Author: Wadih Khairallah
# Description:
# Created: 2025-05-12 16:47:22
# Modified: 2025-05-16 16:38:12

from .pii import (
    extract_pii_text,
    extract_pii_file,
    extract_pii_url,
    extract_pii_image,
    extract_pii_screenshot
)
from .textextract import (
    extract_text,
    extract_text_with_password,
    extract_exif,
    extract_strings,
    extract_metadata,
    text_from_screenshot,
    text_from_url,
    text_from_html,
    text_from_audio,
    text_from_pdf,
    text_from_doc,
    text_from_docx,
    text_from_excel,
    text_from_image,
    text_from_any,
    text_from_odt,
    text_from_pptx,
    text_from_epub,
    analyze_text,
    summarize_text,
    translate_text,
    list_available_languages,
    detect_language,
    scrape_website,
    normalize_text,
)

__all__ = [
    "extract_pii_text",
    "extract_pii_file",
    "extract_pii_url",
    "extract_pii_image",
    "extract_pii_screenshot",
    "extract_text_with_password",
    "extract_text",
    "extract_exif",
    "extract_metadata",
    "extract_strings",
    "text_from_screenshot",
    "text_from_url",
    "text_from_html",
    "text_from_audio",
    "text_from_pdf",
    "text_from_doc",
    "text_from_docx",
    "text_from_excel",
    "text_from_image",
    "text_from_any",
    "text_from_odt",
    "text_from_pptx",
    "text_from_epub",
    "analyze_text",
    "summarize_text",
    "translate_text",
    "list_available_languages",
    "detect_language",
    "scrape_website",
    "normalize_text"
]
```
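Everything listed in `__all__` is re-exported at the package root, so callers never need to import from the `pii` or `textextract` submodules directly. A minimal usage sketch, assuming the package is installed and importable as `mrblack` (the file name below is a placeholder):

```python
# Minimal sketch: top-level imports, assuming the package is installed as `mrblack`.
from mrblack import extract_text, extract_pii_text

text = extract_text("invoice.pdf")   # placeholder path; returns None on failure
if text:
    pii = extract_pii_text(text)     # {label: [sorted, cleaned matches], ...}
    for label, values in pii.items():
        print(label, values)
```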
```python
def extract_pii_text(
    text: str,
    labels: Optional[Union[List[str], str]] = None
) -> Dict[str, List[str]]:
    """
    Extract PII matches from provided text.

    Args:
        text (str): The input text to scan for patterns.
        labels (Optional[Union[List[str], str]]): Specific labels to filter on.

    Returns:
        Dict[str, List[str]]: Mapping of each label to a sorted list of
            matched and cleaned strings.
    """
    if isinstance(labels, str):
        labels = [labels]
    patterns = PATTERNS
    if labels:
        patterns = [
            p for p in PATTERNS
            if any(re.search(rf"\(\?P<{lbl}>", p) for lbl in labels)
        ]
    results: Dict[str, set] = defaultdict(set)
    for pattern in patterns:
        try:
            rx = re.compile(pattern)
            for m in rx.finditer(text):
                for lbl, val in m.groupdict().items():
                    if not val:
                        continue
                    cleaned = _clean_value(lbl, val)
                    if lbl == "url":
                        cleaned = cleaned.rstrip("),.**")
                    if cleaned is not None:
                        results[lbl].add(cleaned)
        except re.error as e:
            print(
                f"Invalid regex skipped: {pattern} → {e}",
                file=sys.stderr
            )
    return {lbl: sorted(vals) for lbl, vals in results.items()}
```
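Because `labels` accepts either a single string or a list, narrowing the scan to specific pattern groups is a one-liner. A hedged sketch; the `email` label name is illustrative and only matches if `PATTERNS` defines a named group by that name:

```python
from mrblack import extract_pii_text

sample = "Contact jane.doe@example.com or +1 555-010-9999 for details."

# No filter: every compiled pattern in PATTERNS is applied.
print(extract_pii_text(sample))

# Filter by label; a bare string is wrapped into a list internally.
# "email" is an assumed label name -- it only matches a named group in PATTERNS.
print(extract_pii_text(sample, labels="email"))
```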
```python
def extract_pii_file(
    file_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from a single file's text content.

    Args:
        file_path (str): Path to the file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    text = extract_text(file_path)
    if not text:
        return None
    data = extract_pii_text(text, labels)
    return data or None
```
```python
def extract_pii_url(
    path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from the text at a URL.

    Args:
        path (str): The URL to fetch.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    text = text_from_url(path)
    if not text:
        return None
    data = extract_pii_text(text, labels)
    return data or None
```
```python
def extract_pii_image(
    image_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from an image using OCR.

    Args:
        image_path (str): Path to the image file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    path = clean_path(image_path)
    if not path or not os.path.isfile(path):
        print(f"[red]Invalid image path:[/] {image_path}")
        return None
    text = extract_text(path)
    if not text:
        return None
    data = extract_pii_text(text, labels)
    return data or None
```
```python
def extract_pii_screenshot(
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Capture a screenshot and extract PII from its OCR text.

    Args:
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    text = text_from_screenshot()
    if not text:
        return None
    data = extract_pii_text(text, labels)
    return data or None
```
```python
def extract_text_with_password(file_path: str, password: str) -> Optional[str]:
    """
    Extract text from password-protected files.

    Args:
        file_path (str): Path to the file
        password (str): Password to unlock the file

    Returns:
        Optional[str]: Extracted text
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        return text_from_pdf_protected(file_path, password)
    elif file_ext in ['.docx', '.xlsx', '.pptx']:
        return text_from_office_protected(file_path, password)
    else:
        logger.warning(f"Password protection not supported for {file_ext} files")
        return None
```
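Dispatch here is by file extension alone, so a mislabeled file will be routed to the wrong handler. A small sketch (the path and password are placeholders):

```python
from mrblack import extract_text_with_password

# .pdf is routed to the PDF handler, .docx/.xlsx/.pptx to the Office handler,
# and any other extension logs a warning and returns None.
content = extract_text_with_password("report.pdf", "s3cret")   # placeholder values
print(content if content else "no text extracted")
```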
```python
def extract_text(
    file_path: str
) -> Optional[str]:
    """
    Extract text content from a local file or URL.

    Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

    Args:
        file_path (str): Path to the input file or URL.

    Returns:
        Optional[str]: Extracted text, or None if unsupported or error.
    """
    if is_url(file_path):
        return text_from_url(file_path)

    TEXT_MIME_TYPES = {
        "application/json", "application/xml", "application/x-yaml",
        "application/x-toml", "application/x-csv", "application/x-markdown",
    }

    path = clean_path(file_path)
    if not path:
        logger.error(f"No such file: {file_path}")
        return None

    mime_type = magic.from_file(path, mime=True)
    try:
        if mime_type.startswith("text/html"):
            content = text_from_html(path)
            return content

        elif mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            return normalize_text(content)

        elif mime_type in [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]:
            content = text_from_excel(path)
            return content

        elif mime_type == "application/pdf":
            content = text_from_pdf(path)
            return content

        elif mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            content = text_from_docx(path)
            return content

        elif mime_type == "application/msword":
            content = text_from_doc(path)
            return content

        elif mime_type.startswith("image/"):
            content = text_from_image(path)
            return content

        elif mime_type.startswith("audio/"):
            content = text_from_audio(path)
            return content

        elif mime_type == "application/epub+zip":
            content = text_from_epub(path)
            return content

        elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
            content = text_from_pptx(path)
            return content

        elif mime_type == "application/vnd.oasis.opendocument.text":
            content = text_from_odt(path)
            return content

        else:
            content = text_from_any(path)
            return content
    except Exception as e:
        logger.error(f"Error reading {path}: {e}")
        return None
```
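In practice this is the single entry point most callers need: URLs are detected first, and local files are dispatched on their libmagic MIME type. A short sketch, with placeholder sources:

```python
from mrblack import extract_text

# URLs go through text_from_url; local paths are dispatched on MIME type
# (PDF, DOCX, spreadsheets, EPUB, images via OCR, audio via transcription, ...).
for source in ("https://example.com/article", "notes.docx", "scan.png"):
    text = extract_text(source)
    print(source, "->", (text[:80] + "...") if text else "no text extracted")
```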
```python
def extract_exif(
    file_path: str
) -> Optional[Dict[str, Any]]:
    """
    Extract EXIF metadata from a file using exiftool.

    Args:
        file_path (str): Path to the target file.

    Returns:
        Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
    """
    exif_data: Optional[Dict[str, Any]] = None
    try:
        result = subprocess.run(
            ['exiftool', '-j', file_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if result.returncode == 0:
            exif_data = json.loads(result.stdout.decode())[0]
    except Exception as e:
        logger.error(f"Exiftool failed: {e}")
    return exif_data
```
```python
def extract_metadata(
    file_path: str
) -> Dict[str, Any]:
    """
    Extract comprehensive metadata from any file type.

    Args:
        file_path (str): Path to target file.

    Returns:
        Dict[str, Any]: Nested metadata structure.
    """
    path = clean_path(file_path)
    if not path:
        return {"error": "File not found"}

    meta: Dict[str, Any] = {}
    try:
        stats = os.stat(path)
        meta["size_bytes"] = stats.st_size
        meta["created"] = datetime.fromtimestamp(stats.st_ctime).isoformat()
        meta["modified"] = datetime.fromtimestamp(stats.st_mtime).isoformat()
        meta["mime"] = magic.from_file(path, mime=True)

        # Calculate multiple hash types
        with open(path, 'rb') as f:
            content = f.read()
            meta["hashes"] = {
                "md5": hashlib.md5(content).hexdigest(),
                "sha1": hashlib.sha1(content).hexdigest(),
                "sha256": hashlib.sha256(content).hexdigest()
            }

        # Get extended file attributes where supported
        if hasattr(os, 'listxattr'):
            try:
                xattrs = os.listxattr(path)
                if xattrs:
                    meta["xattrs"] = {}
                    for attr in xattrs:
                        meta["xattrs"][attr] = os.getxattr(path, attr)
            except (OSError, AttributeError):
                pass

        # Get EXIF data if available and relevant
        exif = extract_exif(path)
        if exif:
            meta["exif"] = exif

        # Get file owner and permissions
        try:
            meta["owner"] = pwd.getpwuid(stats.st_uid).pw_name
        except KeyError:
            meta["owner"] = str(stats.st_uid)
        meta["permissions"] = oct(stats.st_mode)[-3:]

    except Exception as e:
        meta["error"] = str(e)

    return meta
```
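The returned dictionary mixes filesystem stats, hashes, ownership, and optional EXIF, so it serializes cleanly to JSON. A sketch with a placeholder path:

```python
import json
from mrblack import extract_metadata

# On success the dict includes size, timestamps, MIME type, MD5/SHA-1/SHA-256
# hashes, owner/permissions, plus "exif" when exiftool reports anything.
meta = extract_metadata("sample.jpg")            # placeholder path
print(json.dumps(meta, indent=2, default=str))   # default=str handles bytes values (xattrs)
```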
```python
def extract_strings(file_path, min_length=4):
    """
    Extract printable strings from a file, similar to the Unix 'strings' command.

    Args:
        file_path (str): Path to the file to extract strings from
        min_length (int, optional): Minimum length of strings to extract. Defaults to 4.

    Returns:
        list: List of printable strings found in the file
    """
    import string
    file_path = clean_path(file_path)

    # Define printable characters (excluding tabs and newlines)
    printable_chars = set(string.printable) - set('\t\n\r\v\f')

    result = []
    current_string = ""

    # Read the file in binary mode
    try:
        with open(file_path, 'rb') as file:
            # Read the file byte by byte
            for byte in file.read():
                # Convert byte to character
                char = chr(byte)

                # If character is printable, add to current string
                if char in printable_chars:
                    current_string += char
                # If not printable and we have a string of minimum length, add to results
                elif len(current_string) >= min_length:
                    if current_string == "Sj[d":
                        pass
                    else:
                        result.append(current_string)
                    current_string = ""
                # If not printable and current string is too short, reset current string
                else:
                    current_string = ""

        # Don't forget to add the last string if it meets the minimum length
        if len(current_string) >= min_length:
            result.append(current_string)

        return result
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.", file=sys.stderr)
        return None
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return None
```
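`min_length` behaves like the `-n` flag of the Unix `strings` tool. A quick sketch against a placeholder binary:

```python
from mrblack import extract_strings

found = extract_strings("/bin/ls", min_length=8)   # placeholder binary path
if found is not None:
    print(f"{len(found)} strings of 8+ printable characters")
    print(found[:10])
```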
```python
def text_from_screenshot() -> str:
    """
    Capture a full-screen screenshot, perform OCR, and clean up temp file.

    Returns:
        str: Normalized OCR-extracted text from the screenshot.
    """
    tmp_filename = f"screenshot_{uuid4().hex}.png"
    tmp_path = os.path.join(tempfile.gettempdir(), tmp_filename)

    try:
        with mss() as sct:
            monitor = {"top": 0, "left": 0, "width": 0, "height": 0}
            for mon in sct.monitors:
                monitor["left"] = min(mon["left"], monitor["left"])
                monitor["top"] = min(mon["top"], monitor["top"])
                monitor["width"] = max(mon["width"] + mon["left"] - monitor["left"], monitor["width"])
                monitor["height"] = max(mon["height"] + mon["top"] - monitor["top"], monitor["height"])
            screenshot = sct.grab(monitor)

        img = Image.frombytes("RGB", screenshot.size, screenshot.bgra, "raw", "BGRX")
        img_gray = img.convert("L")
        img_gray.save(tmp_path)

        content = text_from_image(tmp_path)
        return normalize_text(content)
    finally:
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except Exception as e:
                logger.error(f"Failed to delete temp screenshot: {e}")
```
```python
def text_from_url(
    url: str,
    render_js: bool = True
) -> Optional[str]:
    """
    Fetch and extract all visible text from a web page, including JS-rendered content.

    Args:
        url (str): Target webpage URL.
        render_js (bool): Whether to render JavaScript content.

    Returns:
        Optional[str]: Cleaned full-page text, or None on failure.
    """
    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": url,
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1"
    }

    # Try with requests-html first (with JS rendering)
    if render_js:
        try:
            session = HTMLSession()
            try:
                r = session.get(url, headers=headers, timeout=5)

                # Set shorter timeout for rendering to avoid hanging
                try:
                    r.html.render(timeout=5, sleep=1, keep_page=True)
                except Exception as e:
                    logger.warning(f"JS rendering failed, falling back to static HTML: {e}")

                html = r.html.html
                session.close()
                content = text_from_html(html)
                return content
            except Exception as e:
                logger.error(f"[Error with HTMLSession] {url} - {e}")
                session.close()
                # Fall through to regular requests
            finally:
                session.close()
        except Exception as e:
            logger.error(f"[Error creating HTMLSession] {e}")
            # Fall through to regular requests

    # Fall back to regular requests (no JS rendering)
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        html = response.text
        content = text_from_html(html)
        return content
    except Exception as e:
        logger.error(f"[Error with requests] {url} - {e}")
        return None
```
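Rendering is attempted only when `render_js=True` (the default); if requests-html fails to render, the function falls back to the static HTML it already fetched, and a plain `requests.get` is the final fallback. A sketch with a placeholder URL:

```python
from mrblack import text_from_url

static = text_from_url("https://example.com", render_js=False)  # skip headless rendering
rendered = text_from_url("https://example.com")                 # JS rendering attempted first
```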
```python
def text_from_html(html: str) -> str:
    """
    Extract readable text from raw HTML content.

    Args:
        html (str): HTML source as a string.

    Returns:
        str: Cleaned and normalized visible text.
    """
    # Check if the input is a file path or HTML content
    if os.path.isfile(html):
        with open(html, 'r', encoding='utf-8', errors='ignore') as f:
            html = f.read()

    soup = BeautifulSoup(html, "html.parser")

    # Remove non-visible or structural elements
    for tag in soup([
        "script", "style",
        "noscript", "iframe",
        "meta", "link",
        "header", "footer",
        "form", "nav",
        "aside"
    ]):
        tag.decompose()

    text = soup.get_text(separator=" ")

    return normalize_text(text)
```
```python
def text_from_audio(
    audio_file: str
) -> Optional[str]:
    """
    Transcribe audio to text using Google Speech Recognition.

    Args:
        audio_file (str): Path to the input audio file.

    Returns:
        Optional[str]: Transcription, or None on failure.
    """
    def convert_to_wav(file_path: str) -> str:
        _, ext = os.path.splitext(file_path)
        ext = ext.lstrip('.')
        audio = AudioSegment.from_file(file_path, format=ext)
        tmp_filename = f"audio_{uuid4().hex}.wav"
        wav_path = os.path.join(tempfile.gettempdir(), tmp_filename)
        audio.export(wav_path, format='wav')
        return wav_path

    recognizer = sr.Recognizer()
    temp_wav_path = None
    cleanup_needed = False

    try:
        _, ext = os.path.splitext(audio_file)
        if ext.lower() not in ['.wav', '.wave']:
            temp_wav_path = convert_to_wav(audio_file)
            cleanup_needed = True
        else:
            temp_wav_path = clean_path(audio_file)

        if not temp_wav_path:
            logger.error("Invalid audio path.")
            return None

        with sr.AudioFile(temp_wav_path) as source:
            audio = recognizer.record(source)
            return recognizer.recognize_google(audio)

    except sr.UnknownValueError:
        logger.error("Could not understand audio.")
    except sr.RequestError as e:
        logger.error(f"Speech recognition error: {e}")
    except Exception as e:
        logger.error(f"Failed to process audio: {e}")
    finally:
        if cleanup_needed and temp_wav_path and os.path.exists(temp_wav_path):
            try:
                os.remove(temp_wav_path)
            except Exception as e:
                logger.error(f"Failed to delete temp WAV file {temp_wav_path}: {e}")

    return None
```
```python
def text_from_pdf(
    pdf_path: str
) -> Optional[str]:
    """
    Extract text and OCR results from a PDF using PyMuPDF.

    Args:
        pdf_path (str): Path to PDF file.

    Returns:
        Optional[str]: Combined normalized text and image OCR results.
    """
    plain_text = ""
    temp_image_paths: List[str] = []

    try:
        doc = pymupdf.open(pdf_path)
        for k, v in doc.metadata.items():
            plain_text += f"{k}: {v}\n"

        for i in range(len(doc)):
            page = doc.load_page(i)
            plain_text += f"\n--- Page {i + 1} ---\n"
            text = page.get_text()
            plain_text += text or "[No text]\n"

            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base = doc.extract_image(xref)
                img_bytes = base["image"]

                img_filename = f"pdf_page{i+1}_img{img_index}_{uuid4().hex}.png"
                img_path = os.path.join(tempfile.gettempdir(), img_filename)
                temp_image_paths.append(img_path)

                with open(img_path, "wb") as f:
                    f.write(img_bytes)

                ocr = text_from_image(img_path) or ""
                plain_text += f"\n[Image {img_index} OCR]\n{ocr}\n"

        # Extract tables from PDF
        """
        try:
            tables = extract_tables_from_pdf(pdf_path)
            if tables:
                plain_text += "\n--- Tables ---\n"
                for i, table in enumerate(tables, 1):
                    plain_text += f"\n[Table {i}]\n"
                    if isinstance(table, dict) and "data" in table:
                        for row in table["data"]:
                            plain_text += str(row) + "\n"
                    else:
                        plain_text += str(table) + "\n"
        except Exception as e:
            logger.warning(f"Could not extract tables from PDF: {e}")
        """

        return normalize_text(plain_text)
    except Exception as e:
        logger.error(f"Error processing PDF: {e}")
        return None
    finally:
        for path in temp_image_paths:
            if os.path.exists(path):
                try:
                    os.remove(path)
                except Exception as e:
                    logger.error(f"Failed to delete temp image {path}: {e}")
        if 'doc' in locals():
            doc.close()
```
```python
def text_from_doc(
    filepath: str,
    min_length: int = 4
) -> str:
    """
    Extract readable strings and metadata from binary Word (.doc) files.

    Args:
        filepath (str): Path to .doc file.
        min_length (int): Minimum string length to extract.

    Returns:
        str: Metadata and text content.
    """
    def extract_printable_strings(
        data: bytes
    ) -> List[str]:
        pattern = re.compile(
            b'[' + re.escape(bytes(string.printable, 'ascii')) +
            b']{%d,}' % min_length
        )
        found = pattern.findall(data)

        results = []
        for m in found:
            value = m.decode(errors='ignore').strip()
            results.append(value)

        return results

    def clean_strings(
        strs: List[str]
    ) -> List[str]:
        cleaned: List[str] = []
        skip = ["HYPERLINK", "OLE2", "Normal.dotm"]
        for line in strs:
            if any(line.startswith(pref) for pref in skip):
                continue
            cleaned.append(re.sub(r'\s+', ' ', line).strip())
        return cleaned

    with open(filepath, 'rb') as f:
        data = f.read()

    strings = extract_printable_strings(data)
    strings = clean_strings(strings)
    content = "\n".join(strings)

    return normalize_text(content)
```
```python
def text_from_docx(
    file_path: str
) -> Optional[str]:
    """
    Extract text, tables, and OCR from embedded images in a DOCX file.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        Optional[str]: Normalized full text content.
    """
    path = clean_path(file_path)
    if not path:
        return None

    temp_image_paths: List[str] = []
    plain_text = ""

    try:
        doc = Document(path)

        for p in doc.paragraphs:
            if p.text.strip():
                plain_text += p.text.strip() + "\n"

        for tbl in doc.tables:
            plain_text += "\n[Table]\n"
            for row in tbl.rows:
                row_text = "\t".join(c.text.strip() for c in row.cells)
                plain_text += row_text + "\n"

        for rel_id, rel in doc.part.rels.items():
            if "image" in rel.target_ref:
                blob = rel.target_part.blob

                img_filename = f"docx_img_{rel_id}_{uuid4().hex}.png"
                img_path = os.path.join(tempfile.gettempdir(), img_filename)
                temp_image_paths.append(img_path)

                with open(img_path, "wb") as img_file:
                    img_file.write(blob)

                ocr = text_from_image(img_path) or ""
                plain_text += f"\n[Image OCR]\n{ocr}\n"

        return normalize_text(plain_text)

    except Exception as e:
        logger.error(f"Error processing DOCX: {e}")
        return None
    finally:
        for path in temp_image_paths:
            if os.path.exists(path):
                try:
                    os.remove(path)
                except Exception as e:
                    logger.error(f"Failed to delete temp DOCX image {path}: {e}")
```
```python
def text_from_excel(
    file_path: str
) -> str:
    """
    Convert an Excel workbook to CSV text.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        str: CSV-formatted string.
    """
    path = clean_path(file_path)
    if not path:
        return ""
    try:
        # Get all sheets
        result = ""
        excel_file = pd.ExcelFile(path)
        for sheet_name in excel_file.sheet_names:
            df = pd.read_excel(path, sheet_name=sheet_name)
            out = StringIO()
            df.to_csv(out, index=False)
            result += f"\n--- Sheet: {sheet_name} ---\n"
            result += out.getvalue()
            result += "\n"
        return result
    except Exception as e:
        logger.error(f"Failed Excel -> CSV: {e}")
        return ""
```
```python
def text_from_image(
    file_path: str
) -> Optional[str]:
    """
    Perform OCR on an image file.

    Args:
        file_path (str): Path to the image.

    Returns:
        Optional[str]: Extracted text, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        with Image.open(path) as img:
            # Improve OCR with preprocessing
            # 1. Convert to grayscale if it's not already
            if img.mode != 'L':
                img = img.convert('L')

            # 2. Optional: Apply some contrast enhancement
            # (Disabled by default, enable if needed for specific cases)
            # from PIL import ImageEnhance
            # enhancer = ImageEnhance.Contrast(img)
            # img = enhancer.enhance(1.5)  # Increase contrast

            # Perform OCR with custom configuration
            custom_config = r'--oem 3 --psm 6'  # Default OCR Engine Mode and Page Segmentation Mode
            txt = pytesseract.image_to_string(img, config=custom_config).strip()
            return normalize_text(txt) or ""
    except Exception as e:
        logger.error(f"Failed image OCR: {e}")
        return None
```
```python
def text_from_any(
    file_path: str
) -> Optional[str]:
    """
    Handle unknown file types by reporting stats and metadata.

    Args:
        file_path (str): Path to the file.

    Returns:
        Optional[str]: Plain-text report, or None on error.
    """
    content = ""
    path = clean_path(file_path)
    if not path:
        return None
    try:
        stats = os.stat(path)
        info = {
            "path": path,
            "size": stats.st_size,
            "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
            "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
        }

        # Write the header once, then one line per filesystem field
        content += "File System Data:\n"
        for k, v in info.items():
            content += f"{k}: {v}\n"

        # Try to extract EXIF if available
        exif = extract_exif(path)
        if exif:
            info["exif"] = exif
            content += "\n\nEXIF Data:\n"
            for k, v in exif.items():
                if isinstance(v, dict):
                    content += f"\n{k}:\n"
                    for sub_k, sub_v in v.items():
                        content += f"  {sub_k}: {sub_v}\n"
                else:
                    content += f"{k}: {v}\n"

        # Get file hash
        md5_hash = hashlib.md5(open(path, 'rb').read()).hexdigest()
        info["md5"] = md5_hash

        # Get strings
        strings = extract_strings(path)
        if strings:
            info["strings"] = strings
            content += "\n\nStrings Data:\n"
            clean_strings = "\n".join(strings)
            content += clean_strings

        return text_from_object(info)
    except Exception as e:
        logger.error(f"Error on other file: {e}")
        return None
```
```python
def text_from_odt(odt_path: str) -> Optional[str]:
    """
    Extract text from OpenDocument Text files.

    Args:
        odt_path (str): Path to the ODT file

    Returns:
        Optional[str]: Extracted text
    """
    try:
        from odf import text, teletype
        from odf.opendocument import load

        textdoc = load(odt_path)

        # Extract metadata
        meta = []
        meta_elem = textdoc.meta
        if meta_elem:
            for prop in meta_elem.childNodes:
                if hasattr(prop, 'tagName') and hasattr(prop, 'childNodes') and prop.childNodes:
                    meta.append(f"{prop.tagName}: {teletype.extractText(prop)}")

        # Extract content
        allparas = textdoc.getElementsByType(text.P)
        content = "\n".join(teletype.extractText(p) for p in allparas)

        # Combine metadata and content
        if meta:
            final_text = "\n".join(meta) + "\n---\n" + content
        else:
            final_text = content

        return normalize_text(final_text)
    except ImportError:
        logger.error("odfpy not installed")
        return "odfpy package is required for ODT processing"
    except Exception as e:
        logger.error(f"Error processing ODT: {e}")
        return None
```
```python
def text_from_pptx(pptx_path: str) -> Optional[str]:
    """
    Extract text from PowerPoint presentations.

    Args:
        pptx_path (str): Path to the PowerPoint file

    Returns:
        Optional[str]: Extracted text
    """
    try:
        from pptx import Presentation

        prs = Presentation(pptx_path)
        text = ["--- PowerPoint Presentation ---"]

        for i, slide in enumerate(prs.slides, 1):
            slide_text = [f"Slide {i}:"]

            # Get slide title if it exists
            if slide.shapes.title and slide.shapes.title.text:
                slide_text.append(f"Title: {slide.shapes.title.text}")

            # Extract text from all shapes
            shape_text = []
            for shape in slide.shapes:
                if hasattr(shape, "text") and shape.text:
                    shape_text.append(shape.text)

            if shape_text:
                slide_text.append("\n".join(shape_text))

            text.append("\n".join(slide_text))

        return normalize_text("\n\n".join(text))
    except ImportError:
        logger.error("python-pptx not installed")
        return "python-pptx package is required for PowerPoint processing"
    except Exception as e:
        logger.error(f"Error processing PowerPoint: {e}")
        return None
```
```python
def text_from_epub(epub_path: str) -> Optional[str]:
    """
    Extract text from EPUB ebooks.

    Args:
        epub_path (str): Path to the EPUB file

    Returns:
        Optional[str]: Extracted text
    """
    try:
        import ebooklib
        from ebooklib import epub
        import html2text

        book = epub.read_epub(epub_path)
        h = html2text.HTML2Text()
        h.ignore_links = False

        content = []

        # Get book metadata
        metadata = []
        if book.get_metadata('DC', 'title'):
            metadata.append(f"Title: {book.get_metadata('DC', 'title')[0][0]}")
        if book.get_metadata('DC', 'creator'):
            metadata.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}")
        if book.get_metadata('DC', 'description'):
            metadata.append(f"Description: {book.get_metadata('DC', 'description')[0][0]}")

        if metadata:
            content.append("\n".join(metadata))
            content.append("---")

        # Get book content (ITEM_DOCUMENT is defined on the ebooklib package, not the epub module)
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                content.append(h.handle(item.get_content().decode('utf-8')))

        return normalize_text("\n".join(content))
    except ImportError:
        logger.error("ebooklib and/or html2text not installed")
        return "ebooklib and/or html2text packages are required for EPUB processing"
    except Exception as e:
        logger.error(f"Error processing EPUB: {e}")
        return None
```
```python
def analyze_text(text: str) -> Dict[str, Any]:
    """
    Perform basic text analytics.

    Args:
        text (str): Input text

    Returns:
        Dict: Analysis results
    """
    try:
        # Tokenize text
        words = nltk.word_tokenize(text.lower())
        sentences = nltk.sent_tokenize(text)

        # Filter out punctuation
        words = [word for word in words if word.isalpha()]

        # Count word frequencies
        word_freq = Counter(words)

        # Calculate readability metrics
        avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
        avg_sent_length = len(words) / len(sentences) if sentences else 0

        # Detect language
        language = detect_language(text)

        return {
            "word_count": len(words),
            "sentence_count": len(sentences),
            "unique_words": len(set(words)),
            "avg_word_length": avg_word_length,
            "avg_sentence_length": avg_sent_length,
            "language": language,
            "most_common_words": word_freq.most_common(20)
        }
    except Exception as e:
        logger.error(f"Text analysis error: {e}")
        return {"error": str(e)}
```
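A sketch of the returned metrics; this assumes NLTK's `punkt` tokenizer data is already available, since `analyze_text` itself does not download it:

```python
from mrblack import analyze_text

stats = analyze_text("NLTK splits this into words and sentences. Short example text.")
# On tokenizer failure the dict contains only an "error" key, so use .get().
print(stats.get("word_count"), stats.get("sentence_count"), stats.get("language"))
print(stats.get("most_common_words", [])[:5])
```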
```python
def summarize_text(text: str, sentences: int = 5) -> str:
    """
    Create a simple extractive summary from the text.

    Args:
        text (str): Input text to summarize
        sentences (int): Number of sentences to include

    Returns:
        str: Summarized text
    """
    try:
        from nltk.corpus import stopwords
        from nltk.tokenize import sent_tokenize

        # Download required NLTK data if not already present
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt', quiet=True)
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords', quiet=True)

        # Tokenize and calculate word frequencies
        stop_words = set(stopwords.words('english'))
        sentences_list = sent_tokenize(text)

        # If there are fewer sentences than requested, return all
        if len(sentences_list) <= sentences:
            return text

        word_frequencies = {}
        for sentence in sentences_list:
            for word in nltk.word_tokenize(sentence):
                word = word.lower()
                if word not in stop_words:
                    if word not in word_frequencies:
                        word_frequencies[word] = 1
                    else:
                        word_frequencies[word] += 1

        # Normalize frequencies
        maximum_frequency = max(word_frequencies.values()) if word_frequencies else 1
        for word in word_frequencies:
            word_frequencies[word] = word_frequencies[word] / maximum_frequency

        # Score sentences
        sentence_scores = {}
        for i, sentence in enumerate(sentences_list):
            for word in nltk.word_tokenize(sentence.lower()):
                if word in word_frequencies:
                    if i not in sentence_scores:
                        sentence_scores[i] = word_frequencies[word]
                    else:
                        sentence_scores[i] += word_frequencies[word]

        # Get top N sentences
        summary_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:sentences]
        summary_sentences = [sentences_list[i] for i, _ in sorted(summary_sentences)]

        return ' '.join(summary_sentences)
    except Exception as e:
        logger.error(f"Summarization error: {e}")
        return text
```
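The summarizer is frequency-based and extractive: sentences are scored by normalized word frequencies and the top `sentences` are returned in their original order, while inputs with fewer sentences come back untouched. A self-contained sketch (the function downloads NLTK data on demand):

```python
from mrblack import summarize_text

article = (
    "Python is widely used for text processing. "
    "Libraries such as NLTK provide tokenizers and stopword lists. "
    "Extractive summarizers score sentences by word frequency. "
    "The highest-scoring sentences are returned in their original order. "
    "Short inputs are returned unchanged. "
    "This example only demonstrates the call signature."
)
print(summarize_text(article, sentences=2))
```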
```python
def translate_text(text: str, target_lang: str = "en") -> Optional[str]:
    """
    Translate text to target language.

    Args:
        text (str): Input text to translate
        target_lang (str): Target language code (e.g., 'en', 'es', 'fr', 'ja' for Japanese)

    Returns:
        Optional[str]: Translated text or None on failure
    """
    try:
        # Use a more stable translation library.
        # Note: googletrans 4.0.0-rc1 uses async methods which need to be awaited,
        # so use the deep-translator library instead, which is more stable.
        from deep_translator import GoogleTranslator

        # Handle long texts by splitting into chunks (Google has a limit)
        max_chunk_size = 4500  # Google Translate has a limit around 5000 chars
        chunks = []

        # Split text into chunks of appropriate size (at sentence boundaries if possible)
        text_remaining = text
        while len(text_remaining) > 0:
            if len(text_remaining) <= max_chunk_size:
                chunks.append(text_remaining)
                break

            # Try to find a sentence boundary near the max chunk size
            chunk_end = max_chunk_size
            while chunk_end > 0 and text_remaining[chunk_end] not in ['.', '!', '?', '\n']:
                chunk_end -= 1

            # If no good sentence boundary found, just use max size
            if chunk_end == 0:
                chunk_end = max_chunk_size
            else:
                chunk_end += 1  # Include the period or boundary character

            chunks.append(text_remaining[:chunk_end])
            text_remaining = text_remaining[chunk_end:]

        # Translate each chunk and combine
        translated_chunks = []
        for chunk in chunks:
            translated_chunk = GoogleTranslator(source='auto', target=target_lang).translate(chunk)
            translated_chunks.append(translated_chunk)

        return ' '.join(translated_chunks)
    except Exception as e:
        logger.error(f"Translation error: {e}")
        return None
```
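Long inputs are split into roughly 4,500-character chunks at sentence boundaries before being sent to the translator. A sketch; this assumes deep-translator is installed and network access is available, and the sentence is a placeholder:

```python
from mrblack import translate_text, list_available_languages

print(len(list_available_languages()), "languages reported")
print(translate_text("Extraction finished without errors.", target_lang="es"))
```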
```python
def list_available_languages() -> Dict[str, str]:
    """
    Get a dictionary of available languages for translation.

    Returns:
        Dict[str, str]: Dictionary mapping language codes to language names
    """
    try:
        from deep_translator import GoogleTranslator
        # Get available languages from the translator
        languages = GoogleTranslator().get_supported_languages(as_dict=True)
        return languages
    except Exception as e:
        logger.error(f"Error getting language list: {e}")
        # Return a small subset as fallback
        return {
            "en": "English",
            "es": "Spanish",
            "fr": "French",
            "de": "German",
            "it": "Italian",
            "ja": "Japanese",
            "ko": "Korean",
            "zh-cn": "Chinese (Simplified)",
            "ru": "Russian",
            "ar": "Arabic"
        }
```
```python
def detect_language(text: str) -> str:
    """
    Detect the language of the extracted text.

    Args:
        text (str): Input text

    Returns:
        str: Detected language code or 'unknown'
    """
    try:
        import langdetect
        return langdetect.detect(text)
    except Exception:
        logger.warning("Language detection failed or langdetect not installed")
        return "unknown"
```
```python
def scrape_website(url: str, max_pages: int = 1, stay_on_domain: bool = True) -> Dict[str, str]:
    """
    Scrape multiple pages of a website.

    Args:
        url (str): Starting URL
        max_pages (int): Maximum pages to scrape
        stay_on_domain (bool): Whether to stay on the same domain

    Returns:
        Dict[str, str]: Dictionary mapping URLs to extracted text
    """
    results = {}
    visited = set()
    to_visit = [url]
    base_domain = urlparse(url).netloc

    while to_visit and len(visited) < max_pages:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue

        # Extract text from current page
        text = text_from_url(current_url)
        if text:
            results[current_url] = text

        visited.add(current_url)

        # Find links on the page
        session = HTMLSession()
        try:
            r = session.get(current_url)
            r.html.render(timeout=20, sleep=1)

            links = r.html.absolute_links
            for link in links:
                link_domain = urlparse(link).netloc
                if link not in visited and link not in to_visit:
                    # Check if we should follow this link
                    if stay_on_domain and link_domain != base_domain:
                        continue
                    to_visit.append(link)
        except Exception as e:
            logger.error(f"Error scraping {current_url}: {e}")
        finally:
            session.close()

    return results
```
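The crawl is breadth-first from the starting URL, capped at `max_pages`, and by default restricted to the starting domain. A sketch against a placeholder site:

```python
from mrblack import scrape_website

pages = scrape_website("https://example.com", max_pages=5, stay_on_domain=True)
for page_url, page_text in pages.items():
    print(page_url, len(page_text), "chars")
```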
```python
def normalize_text(
    text: str
) -> str:
    """
    Normalize whitespace in extracted text: apply Unicode NFKC normalization,
    collapse runs of spaces, tabs, and newlines to a single occurrence, convert
    carriage returns to newlines, and strip leading spaces from each line.

    Args:
        text (str): Raw input text.

    Returns:
        str: Normalized, compact text.
    """
    if not text:
        return ""
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'(?m)(^ \n)+', '\n', text)
    text = re.sub(r'\t+', '\t', text)
    text = re.sub(r'\r+', '\n', text)
    text = re.sub(r"^ ", "", text, flags=re.MULTILINE)
    return text
```
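A tiny demonstration of what the normalization does to whitespace (expected output shown as a comment, assuming the input contains no carriage returns):

```python
from mrblack import normalize_text

messy = "  Title\n\n\n   Body   text\t\twith   extra   spaces\n"
print(repr(normalize_text(messy)))
# -> 'Title\nBody text\twith extra spaces\n'
```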