mrblack
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# File: __init__.py
# Author: Wadih Khairallah
# Description:
# Created: 2025-05-12 16:47:22
# Modified: 2025-05-15 03:16:18

"""Public API for the mrblack package: PII and text-extraction helpers."""

from .pii import (
    extract_pii_text,
    extract_pii_file,
    extract_pii_url,
    extract_pii_image,
    extract_pii_screenshot
)
from .textextract import (
    extract_text,
    extract_exif,
    extract_metadata,
    text_from_screenshot,
    text_from_url,
    text_from_html,
    text_from_audio,
    text_from_pdf,
    text_from_doc,
    text_from_docx,
    text_from_excel,
    text_from_image,
    text_from_any
)

__all__ = [
    "extract_pii_text",
    "extract_pii_file",
    "extract_pii_url",
    "extract_pii_image",
    "extract_pii_screenshot",
    "extract_text",
    "extract_exif",
    "extract_metadata",
    # Fix: text_from_screenshot was imported above but omitted from __all__,
    # so `from mrblack import *` silently dropped it.
    "text_from_screenshot",
    "text_from_url",
    "text_from_html",
    "text_from_audio",
    "text_from_pdf",
    "text_from_doc",
    "text_from_docx",
    "text_from_excel",
    "text_from_image",
    "text_from_any"
]
def extract_pii_text(
    text: str,
    labels: Optional[Union[List[str], str]] = None
) -> Dict[str, List[str]]:
    """
    Scan *text* with the PII regex catalogue and collect matches.

    Args:
        text (str): Input text to scan for patterns.
        labels (Optional[Union[List[str], str]]): Restrict extraction to
            these named groups; a single string is treated as one label.

    Returns:
        Dict[str, List[str]]: Each matched label mapped to a sorted list
        of cleaned, de-duplicated values.
    """
    wanted = [labels] if isinstance(labels, str) else labels
    selected = PATTERNS
    if wanted:
        # Keep only patterns that define at least one requested named group.
        selected = [
            pat for pat in PATTERNS
            if any(re.search(rf"\(\?P<{name}>", pat) for name in wanted)
        ]

    found: Dict[str, set] = defaultdict(set)
    for pat in selected:
        try:
            for match in re.compile(pat).finditer(text):
                for name, raw in match.groupdict().items():
                    if not raw:
                        continue
                    value = _clean_value(name, raw)
                    if name == "url":
                        value = value.rstrip("),.**")
                    if value is not None:
                        found[name].add(value)
        except re.error as err:
            print(
                f"Invalid regex skipped: {pat} → {err}",
                file=sys.stderr
            )
    return {name: sorted(values) for name, values in found.items()}
Extract PII matches from provided text.
Args: text (str): The input text to scan for patterns. labels (Optional[Union[List[str], str]]): Specific labels to filter on.
Returns: Dict[str, List[str]]: Mapping of each label to a sorted list of matched and cleaned strings.
def extract_pii_file(
    file_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Run PII extraction over the text content of a single file.

    Args:
        file_path (str): Path to the file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Non-empty extraction results,
        or None when no text or no matches were found.
    """
    content = extract_text(file_path)
    if not content:
        return None
    return extract_pii_text(content, labels) or None
Extract PII from a single file's text content.
Args: file_path (str): Path to the file. labels (Optional[Union[List[str], str]]): Labels to filter.
Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
def extract_pii_url(
    path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Fetch a URL and run PII extraction over its visible text.

    Args:
        path (str): The URL to fetch.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Non-empty extraction results,
        or None when the page yielded no text or no matches.
    """
    content = text_from_url(path)
    if not content:
        return None
    return extract_pii_text(content, labels) or None
Extract PII from the text at a URL.
Args: path (str): The URL to fetch. labels (Optional[Union[List[str], str]]): Labels to filter.
Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
def extract_pii_image(
    image_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    OCR an image file and run PII extraction over the recognized text.

    Args:
        image_path (str): Path to the image file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Non-empty extraction results,
        or None on a bad path, empty OCR text, or no matches.
    """
    resolved = clean_path(image_path)
    if not resolved or not os.path.isfile(resolved):
        print(f"[red]Invalid image path:[/] {image_path}")
        return None
    content = extract_text(resolved)
    if not content:
        return None
    return extract_pii_text(content, labels) or None
Extract PII from an image using OCR.
Args: image_path (str): Path to the image file. labels (Optional[Union[List[str], str]]): Labels to filter.
Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
def extract_pii_screenshot(
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Capture a screenshot, OCR it, and extract PII from the text.

    Args:
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Non-empty extraction results,
        or None when OCR produced no text or nothing matched.
    """
    content = text_from_screenshot()
    if not content:
        return None
    return extract_pii_text(content, labels) or None
Capture a screenshot and extract PII from its OCR text.
Args: labels (Optional[Union[List[str], str]]): Labels to filter.
Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
def extract_text(
    file_path: str
) -> Optional[str]:
    """
    Extract text content from a local file or URL.

    Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.
    Dispatches on the libmagic-detected MIME type, not the file extension.

    Args:
        file_path (str): Path to the input file or URL.

    Returns:
        Optional[str]: Extracted text, or None if unsupported or error.
    """
    if is_url(file_path):
        return text_from_url(file_path)

    TEXT_MIME_TYPES = {
        "application/json", "application/xml", "application/x-yaml",
        "application/x-toml", "application/x-csv", "application/x-markdown",
    }

    path = clean_path(file_path)
    if not path:
        print(f"No such file: {file_path}")
        return None

    mime_type = magic.from_file(path, mime=True)
    try:
        if mime_type.startswith("text/html"):
            # Fix: text_from_html() takes an HTML *string*, not a path, and
            # the original used a bare `if` here so this result was then
            # overwritten by the generic text/ branch below.
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = text_from_html(f.read())

        elif mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

        elif mime_type in [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]:
            content = text_from_excel(path)

        elif mime_type == "application/pdf":
            content = text_from_pdf(path)

        elif mime_type == \
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            content = text_from_docx(path)

        elif mime_type == "application/msword":
            content = text_from_doc(path)

        elif mime_type.startswith("image/"):
            content = text_from_image(path)

        elif mime_type.startswith("audio/"):
            content = text_from_audio(path)

        else:
            # Unknown MIME type: fall back to a stats/metadata report.
            content = text_from_any(path)

        if content:
            return content
        print(f"No content found for file: {path}")
        return None
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None
Extract text content from a local file or URL.
Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.
Args: file_path (str): Path to the input file or URL.
Returns: Optional[str]: Extracted text, or None if unsupported or error.
def extract_exif(
    file_path: str
) -> Optional[Dict[str, Any]]:
    """
    Extract EXIF metadata from a file by shelling out to exiftool.

    Args:
        file_path (str): Path to the target file.

    Returns:
        Optional[Dict[str, Any]]: Parsed EXIF data from exiftool's JSON
        output, or None when exiftool is missing, fails, or emits no data.
    """
    parsed: Optional[Dict[str, Any]] = None
    try:
        # -j makes exiftool emit a JSON array with one object per file.
        proc = subprocess.run(
            ['exiftool', '-j', file_path],
            capture_output=True
        )
        if proc.returncode == 0:
            parsed = json.loads(proc.stdout.decode())[0]
    except Exception as err:
        print(f"Exiftool failed: {err}")
    return parsed
Extract EXIF metadata from a file using exiftool.
Args: file_path (str): Path to the target file.
Returns: Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
def extract_metadata(
    file_path: str
) -> Dict[str, Any]:
    """
    Extract basic metadata (size, MIME type, MD5) from any file type.

    Args:
        file_path (str): Path to target file.

    Returns:
        Dict[str, Any]: Nested metadata structure; contains an "error"
        key on failure.
    """
    path = clean_path(file_path)
    if not path:
        return {"error": "File not found"}
    meta: Dict[str, Any] = {}
    try:
        stats = os.stat(path)
        meta["size_bytes"] = stats.st_size
        meta["mime"] = magic.from_file(path, mime=True)
        # Fix: the original `open(path,'rb').read()` leaked the file handle
        # and loaded the whole file into memory; hash in chunks instead
        # (identical digest).
        digest = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(65536), b''):
                digest.update(chunk)
        meta["hashes"] = {"md5": digest.hexdigest()}
    except Exception as e:
        meta["error"] = str(e)
    return meta
Extract comprehensive metadata from any file type.
Args: file_path (str): Path to target file.
Returns: Dict[str, Any]: Nested metadata structure.
def text_from_url(
    url: str,
    render_js: bool = True
) -> Optional[str]:
    """
    Download a web page and return its visible text.

    Args:
        url (str): Target webpage URL.
        render_js (bool): Render JavaScript content before extracting.

    Returns:
        Optional[str]: Cleaned full-page text, or None on failure.
    """
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": url,
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1"
    }

    session = HTMLSession()
    try:
        response = session.get(url, headers=request_headers, timeout=20)
        if render_js:
            # Execute the page's JavaScript so dynamic content is present.
            response.html.render(timeout=20, sleep=1)
        return text_from_html(response.html.html)
    except Exception as e:
        print(f"[Error] {url} - {e}")
        return None
    finally:
        session.close()
Fetch and extract all visible text from a web page, including JS-rendered content.
Args: url (str): Target webpage URL. render_js (bool): Whether to render JavaScript content.
Returns: Optional[str]: Cleaned full-page text, or None on failure.
def text_from_html(html: str) -> str:
    """
    Extract readable text from raw HTML content.

    Args:
        html (str): HTML source as a string.

    Returns:
        str: Cleaned and normalized visible text.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop elements that never contribute visible page text.
    invisible = [
        "script", "style", "noscript", "iframe", "meta", "link",
        "header", "footer", "form", "nav", "aside"
    ]
    for element in soup(invisible):
        element.decompose()

    return normalize(soup.get_text(separator=" "))
Extract readable text from raw HTML content.
Args: html (str): HTML source as a string.
Returns: str: Cleaned and normalized visible text.
def text_from_audio(
    audio_file: str
) -> Optional[str]:
    """
    Transcribe an audio file via Google Speech Recognition.

    Non-WAV inputs are first converted to a temporary WAV file, which is
    removed again afterwards.

    Args:
        audio_file (str): Path to the input audio file.

    Returns:
        Optional[str]: The transcription, or None on any failure.
    """
    def _as_wav(src: str) -> str:
        # pydub picks its decoder from the original file extension.
        ext = os.path.splitext(src)[1].lstrip('.')
        segment = AudioSegment.from_file(src, format=ext)
        out_path = os.path.join(
            tempfile.gettempdir(), f"audio_{uuid4().hex}.wav"
        )
        segment.export(out_path, format='wav')
        return out_path

    recognizer = sr.Recognizer()
    wav_path = None
    is_temporary = False

    try:
        suffix = os.path.splitext(audio_file)[1]
        if suffix.lower() in ['.wav', '.wave']:
            wav_path = clean_path(audio_file)
        else:
            wav_path = _as_wav(audio_file)
            is_temporary = True

        if not wav_path:
            print("Invalid audio path.")
            return None

        with sr.AudioFile(wav_path) as source:
            recorded = recognizer.record(source)
        return recognizer.recognize_google(recorded)

    except sr.UnknownValueError:
        print("Could not understand audio.")
    except sr.RequestError as e:
        print(f"Speech recognition error: {e}")
    except Exception as e:
        print(f"Failed to process audio: {e}")
    finally:
        if is_temporary and wav_path and os.path.exists(wav_path):
            try:
                os.remove(wav_path)
            except Exception as e:
                print(f"Failed to delete temp WAV file {wav_path}: {e}")

    return None
Transcribe audio to text using Google Speech Recognition.
Args: audio_file (str): Path to the input audio file.
Returns: Optional[str]: Transcription, or None on failure.
def text_from_pdf(
    pdf_path: str
) -> Optional[str]:
    """
    Extract text and OCR results from a PDF using PyMuPDF.

    Document metadata, per-page text, and OCR output for every embedded
    image are concatenated into one normalized string.

    Args:
        pdf_path (str): Path to PDF file.

    Returns:
        Optional[str]: Combined normalized text and image OCR results,
        or None on error.
    """
    plain_text = ""
    temp_image_paths: List[str] = []
    # Fix: predefine doc so the finally block cannot hit a NameError when
    # pymupdf.open() itself raises.
    doc = None

    try:
        doc = pymupdf.open(pdf_path)
        for k, v in doc.metadata.items():
            plain_text += f"{k}: {v}\n"

        for i in range(len(doc)):
            page = doc.load_page(i)
            plain_text += f"\n--- Page {i + 1} ---\n"
            text = page.get_text()
            plain_text += text or "[No text]\n"

            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base = doc.extract_image(xref)
                img_bytes = base["image"]

                # Write the embedded image to a temp file for OCR.
                img_filename = f"pdf_page{i+1}_img{img_index}_{uuid4().hex}.png"
                img_path = os.path.join(tempfile.gettempdir(), img_filename)
                temp_image_paths.append(img_path)

                with open(img_path, "wb") as f:
                    f.write(img_bytes)

                ocr = text_from_image(img_path) or ""
                plain_text += f"\n[Image {img_index} OCR]\n{ocr}\n"

        return normalize(plain_text)
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None
    finally:
        for path in temp_image_paths:
            if os.path.exists(path):
                try:
                    os.remove(path)
                except Exception as e:
                    print(f"Failed to delete temp image {path}: {e}")
        # Fix: only close when the document actually opened; the original
        # unconditionally called doc.close() and crashed on open failure.
        if doc is not None:
            doc.close()
Extract text and OCR results from a PDF using PyMuPDF.
Args: pdf_path (str): Path to PDF file.
Returns: Optional[str]: Combined normalized text and image OCR results.
def text_from_doc(
    filepath: str,
    min_length: int = 4
) -> str:
    """
    Pull readable strings out of a legacy binary Word (.doc) file.

    Args:
        filepath (str): Path to .doc file.
        min_length (int): Minimum run of printable characters to keep.

    Returns:
        str: Normalized text content.
    """
    # Runs of >= min_length printable ASCII bytes.
    matcher = re.compile(
        b'[' + re.escape(bytes(string.printable, 'ascii')) +
        b']{%d,}' % min_length
    )

    with open(filepath, 'rb') as handle:
        raw = handle.read()

    # Decode, strip, and de-duplicate while preserving first-seen order.
    unique_lines = dict.fromkeys(
        chunk.decode(errors='ignore').strip()
        for chunk in matcher.findall(raw)
    )

    # Drop Word-internal artifacts, then collapse runs of whitespace.
    kept = [
        re.sub(r'\s+', ' ', line).strip()
        for line in unique_lines
        if not line.startswith(("HYPERLINK", "OLE2", "Normal.dotm"))
    ]
    return normalize("\n".join(kept))
Extract readable strings and metadata from binary Word (.doc) files.
Args: filepath (str): Path to .doc file. min_length (int): Minimum string length to extract.
Returns: str: Metadata and text content.
def text_from_docx(
    file_path: str
) -> Optional[str]:
    """
    Extract paragraphs, tables, and OCR of embedded images from a DOCX file.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        Optional[str]: Normalized full text content, or None on error.
    """
    resolved = clean_path(file_path)
    if not resolved:
        return None

    scratch_images: List[str] = []
    collected = ""

    try:
        document = Document(resolved)

        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                collected += stripped + "\n"

        for table in document.tables:
            collected += "\n[Table]\n"
            for row in table.rows:
                collected += "\t".join(c.text.strip() for c in row.cells) + "\n"

        # Embedded images live in the part relationships; OCR each one.
        for rel_id, rel in document.part.rels.items():
            if "image" not in rel.target_ref:
                continue
            image_path = os.path.join(
                tempfile.gettempdir(),
                f"docx_img_{rel_id}_{uuid4().hex}.png"
            )
            scratch_images.append(image_path)
            with open(image_path, "wb") as img_file:
                img_file.write(rel.target_part.blob)
            ocr = text_from_image(image_path) or ""
            collected += f"\n[Image OCR]\n{ocr}\n"

        return normalize(collected)

    except Exception as e:
        print(f"Error processing DOCX: {e}")
        return None
    finally:
        for image_path in scratch_images:
            if os.path.exists(image_path):
                try:
                    os.remove(image_path)
                except Exception as e:
                    print(f"Failed to delete temp DOCX image {image_path}: {e}")
Extract text, tables, and OCR from embedded images in a DOCX file.
Args: file_path (str): Path to the .docx file.
Returns: Optional[str]: Normalized full text content.
def text_from_excel(
    file_path: str
) -> str:
    """
    Convert an Excel workbook to CSV text.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        str: CSV-formatted string ("" on failure).
    """
    resolved = clean_path(file_path)
    if not resolved:
        return ""
    try:
        buffer = StringIO()
        pd.read_excel(resolved).to_csv(buffer, index=False)
        return buffer.getvalue()
    except Exception as err:
        print(f"Failed Excel -> CSV: {err}")
        return ""
Convert an Excel workbook to CSV text.
Args: file_path (str): Path to the Excel file.
Returns: str: CSV-formatted string.
def text_from_image(
    file_path: str
) -> Optional[str]:
    """
    Run Tesseract OCR over an image file.

    Args:
        file_path (str): Path to the image.

    Returns:
        Optional[str]: Normalized OCR text ("" when nothing was
        recognized), or None on error.
    """
    resolved = clean_path(file_path)
    if not resolved:
        return None
    try:
        with Image.open(resolved) as img:
            recognized = pytesseract.image_to_string(img).strip()
        return normalize(recognized) or ""
    except Exception as e:
        print(f"Failed image OCR: {e}")
        return None
Perform OCR on an image file.
Args: file_path (str): Path to the image.
Returns: Optional[str]: Extracted text, or None on error.
def text_from_any(
    file_path: str
) -> Optional[str]:
    """
    Fallback handler for unknown file types: report filesystem stats.

    Args:
        file_path (str): Path to the file.

    Returns:
        Optional[str]: Plain-text stats report, or None on error.
    """
    resolved = clean_path(file_path)
    if not resolved:
        return None
    try:
        stats = os.stat(resolved)
        # NOTE(review): st_ctime is creation time on Windows but
        # inode-change time on POSIX — "created" is approximate there.
        report = "\n".join([
            f"path: {resolved}",
            f"size: {stats.st_size}",
            f"created: {datetime.fromtimestamp(stats.st_ctime).isoformat()}",
            f"modified: {datetime.fromtimestamp(stats.st_mtime).isoformat()}",
        ])
        return normalize(report)
    except Exception as e:
        print(f"Error on other file: {e}")
        return None
Handle unknown file types by reporting stats and metadata.
Args: file_path (str): Path to the file.
Returns: Optional[str]: Plain-text report, or None on error.