mrblack
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# File: __init__.py
# Author: Wadih Khairallah
# Description:
# Created: 2025-05-12 16:47:22
# Modified: 2025-05-14 18:49:23

from .pii import (
    extract_pii_text,
    extract_pii_file,
    extract_pii_url,
    extract_pii_image,
    extract_pii_screenshot
)
from .textextract import (
    extract_text,
    extract_exif,
    extract_metadata,
    text_from_screenshot,
    text_from_url,
    text_from_audio,
    text_from_pdf,
    text_from_doc,
    text_from_docx,
    text_from_excel,
    text_from_image,
    text_from_any
)

# Public API: one entry per name imported above.
__all__ = [
    "extract_pii_text",
    "extract_pii_file",
    "extract_pii_url",
    "extract_pii_image",
    "extract_pii_screenshot",
    "extract_text",
    "extract_exif",
    "extract_metadata",
    "text_from_screenshot",  # was imported but omitted from __all__
    "text_from_url",
    "text_from_audio",
    "text_from_pdf",
    "text_from_doc",
    "text_from_docx",
    "text_from_excel",
    "text_from_image",
    "text_from_any"
]
def extract_pii_text(
    text: str,
    labels: Optional[Union[List[str], str]] = None
) -> Dict[str, List[str]]:
    """
    Extract PII matches from provided text.

    Args:
        text (str): The input text to scan for patterns.
        labels (Optional[Union[List[str], str]]): Specific labels to filter on.

    Returns:
        Dict[str, List[str]]: Mapping of each label to a sorted list of
        matched and cleaned strings.
    """
    wanted = [labels] if isinstance(labels, str) else labels
    # Restrict the pattern set to those declaring a requested named group.
    selected = PATTERNS
    if wanted:
        selected = [
            pat for pat in PATTERNS
            if any(re.search(rf"\(\?P<{name}>", pat) for name in wanted)
        ]

    found: Dict[str, set] = defaultdict(set)
    for pat in selected:
        try:
            compiled = re.compile(pat)
            for match in compiled.finditer(text):
                for name, raw in match.groupdict().items():
                    if not raw:
                        continue
                    value = _clean_value(name, raw)
                    if name == "url":
                        # Strip trailing punctuation picked up around links.
                        value = value.rstrip("),.**")
                    if value is not None:
                        found[name].add(value)
        except re.error as e:
            print(
                f"Invalid regex skipped: {pat} → {e}",
                file=sys.stderr
            )
    return {name: sorted(values) for name, values in found.items()}
Extract PII matches from provided text.
Args: text (str): The input text to scan for patterns. labels (Optional[Union[List[str], str]]): Specific labels to filter on.
Returns: Dict[str, List[str]]: Mapping of each label to a sorted list of matched and cleaned strings.
def extract_pii_file(
    file_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from a single file's text content.

    Args:
        file_path (str): Path to the file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    content = extract_text(file_path)
    if not content:
        return None
    matches = extract_pii_text(content, labels)
    return matches if matches else None
Extract PII from a single file's text content.
Args: file_path (str): Path to the file. labels (Optional[Union[List[str], str]]): Labels to filter.
Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
def extract_pii_url(
    path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from the text at a URL.

    Args:
        path (str): The URL to fetch.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    fetched = text_from_url(path)
    if not fetched:
        return None
    matches = extract_pii_text(fetched, labels)
    return matches if matches else None
Extract PII from the text at a URL.
Args: path (str): The URL to fetch. labels (Optional[Union[List[str], str]]): Labels to filter.
Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
def extract_pii_image(
    image_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from an image using OCR.

    Args:
        image_path (str): Path to the image file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    resolved = clean_path(image_path)
    # Bail out early when the path cannot be resolved to a real file.
    if not resolved or not os.path.isfile(resolved):
        print(f"[red]Invalid image path:[/] {image_path}")
        return None
    ocr_text = extract_text(resolved)
    if not ocr_text:
        return None
    matches = extract_pii_text(ocr_text, labels)
    return matches if matches else None
Extract PII from an image using OCR.
Args: image_path (str): Path to the image file. labels (Optional[Union[List[str], str]]): Labels to filter.
Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
def extract_pii_screenshot(
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Capture a screenshot and extract PII from its OCR text.

    Args:
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    captured = text_from_screenshot()
    if not captured:
        return None
    matches = extract_pii_text(captured, labels)
    return matches if matches else None
Capture a screenshot and extract PII from its OCR text.
Args: labels (Optional[Union[List[str], str]]): Labels to filter.
Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
def extract_text(
    file_path: str
) -> Optional[str]:
    """
    Extract text content from a local file or URL.

    Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

    Args:
        file_path (str): Path to the input file or URL.

    Returns:
        Optional[str]: Extracted text, or None if unsupported or error.
    """
    # URLs are delegated straight to the web-page extractor.
    if is_url(file_path):
        return text_from_url(file_path)

    # MIME types treated as plain text beyond the text/* family.
    TEXT_MIME_TYPES = {
        "application/json", "application/xml", "application/x-yaml",
        "application/x-toml", "application/x-csv", "application/x-markdown",
    }

    path = clean_path(file_path)
    if not path:
        print(f"No such file: {file_path}")
        return None

    mime_type = magic.from_file(path, mime=True)
    try:
        # Dispatch on the detected MIME type, most specific first.
        if mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
            with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
                content = handle.read()
        elif mime_type in (
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ):
            content = text_from_excel(path)
        elif mime_type == "application/pdf":
            content = text_from_pdf(path)
        elif mime_type == (
            "application/vnd.openxmlformats-officedocument"
            ".wordprocessingml.document"
        ):
            content = text_from_docx(path)
        elif mime_type == "application/msword":
            content = text_from_doc(path)
        elif mime_type.startswith("image/"):
            content = text_from_image(path)
        elif mime_type.startswith("audio/"):
            content = text_from_audio(path)
        else:
            # Unknown types fall back to the generic stats report.
            content = text_from_any(path)

        if not content:
            print(f"No content found for file: {path}")
            return None
        return content
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None
Extract text content from a local file or URL.
Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.
Args: file_path (str): Path to the input file or URL.
Returns: Optional[str]: Extracted text, or None if unsupported or error.
def extract_exif(
    file_path: str
) -> Optional[Dict[str, Any]]:
    """
    Extract EXIF metadata from a file using exiftool.

    Args:
        file_path (str): Path to the target file.

    Returns:
        Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
    """
    parsed: Optional[Dict[str, Any]] = None
    try:
        # exiftool -j emits a JSON array with one object per input file.
        proc = subprocess.run(
            ['exiftool', '-j', file_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if proc.returncode == 0:
            parsed = json.loads(proc.stdout.decode())[0]
    except Exception as e:
        # Covers exiftool being absent as well as malformed JSON output.
        print(f"Exiftool failed: {e}")
    return parsed
Extract EXIF metadata from a file using exiftool.
Args: file_path (str): Path to the target file.
Returns: Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
def extract_metadata(
    file_path: str
) -> Dict[str, Any]:
    """
    Extract comprehensive metadata from any file type.

    Args:
        file_path (str): Path to target file.

    Returns:
        Dict[str, Any]: Nested metadata structure with ``size_bytes``,
        ``mime`` and ``hashes`` keys, or an ``error`` key on failure.
    """
    path = clean_path(file_path)
    if not path:
        return {"error": "File not found"}
    meta: Dict[str, Any] = {}
    try:
        stats = os.stat(path)
        meta["size_bytes"] = stats.st_size
        meta["mime"] = magic.from_file(path, mime=True)
        # Hash in chunks inside a context manager: the original
        # open(path).read() leaked the file handle and loaded the
        # whole file into memory at once.
        digest = hashlib.md5()
        with open(path, 'rb') as handle:
            for chunk in iter(lambda: handle.read(65536), b''):
                digest.update(chunk)
        meta["hashes"] = {"md5": digest.hexdigest()}
    except Exception as e:
        meta["error"] = str(e)
    return meta
Extract comprehensive metadata from any file type.
Args: file_path (str): Path to target file.
Returns: Dict[str, Any]: Nested metadata structure.
def text_from_url(
    url: str
) -> Optional[str]:
    """
    Fetch and extract visible text from a web page.

    Args:
        url (str): The target webpage URL.

    Returns:
        Optional[str]: Extracted text, or None on failure.
    """
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        page = BeautifulSoup(resp.text, "html.parser")
        # Remove non-visible / boilerplate elements before text extraction.
        removable = [
            "script", "style", "noscript", "iframe",
            "header", "footer", "meta", "link"
        ]
        for element in page(removable):
            element.decompose()
        visible = page.get_text(separator=" ").strip()
        return normalize(visible)
    except requests.RequestException as e:
        print(f"Error fetching URL: {url} - {e}")
        return None
Fetch and extract visible text from a web page.
Args: url (str): The target webpage URL.
Returns: Optional[str]: Extracted text, or None on failure.
def text_from_audio(
    audio_file: str
) -> Optional[str]:
    """
    Transcribe audio to text via Google Speech Recognition.

    Args:
        audio_file (str): Path to the audio file.

    Returns:
        Optional[str]: Transcription, or None on error.
    """
    def _convert_to_wav(src: str) -> str:
        # Re-encode any pydub-supported format as WAV next to the source.
        _, extension = os.path.splitext(src)
        extension = extension.lstrip('.')
        segment = AudioSegment.from_file(src, format=extension)
        wav_target = src.replace(f".{extension}", ".wav")
        segment.export(wav_target, format='wav')
        return wav_target

    _, suffix = os.path.splitext(audio_file)
    if suffix.lower() not in ('.wav', '.wave'):
        audio_file = _convert_to_wav(audio_file)

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            recorded = recognizer.record(source)
            return recognizer.recognize_google(recorded)
    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print(f"Speech service error: {e}")
    return None
Transcribe audio to text via Google Speech Recognition.
Args: audio_file (str): Path to the audio file.
Returns: Optional[str]: Transcription, or None on error.
def text_from_pdf(
    pdf_path: str
) -> Optional[str]:
    """
    Extract text and image OCR from a PDF using PyMuPDF.

    Args:
        pdf_path (str): Path to PDF file.

    Returns:
        Optional[str]: Combined text and OCR results, or None on error.
    """
    pieces: List[str] = []
    try:
        doc = pymupdf.open(pdf_path)
        # Document metadata first, one "key: value" line each.
        for key, value in doc.metadata.items():
            pieces.append(f"{key}: {value}\n")
        for idx in range(len(doc)):
            page = doc.load_page(idx)
            pieces.append(f"\n--- Page {idx+1} ---\n")
            pieces.append(page.get_text() or "[No text]\n")
            # OCR each embedded image via a temp file on disk.
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                extracted = doc.extract_image(xref)
                img_path = f"/tmp/page{idx+1}-img{img_index}.png"
                with open(img_path, "wb") as img_file:
                    img_file.write(extracted["image"])
                ocr = text_from_image(img_path) or ""
                pieces.append(f"\n[Image {img_index} OCR]\n{ocr}\n")
        doc.close()
        return normalize("".join(pieces))
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None
Extract text and image OCR from a PDF using PyMuPDF.
Args: pdf_path (str): Path to PDF file.
Returns: Optional[str]: Combined text and OCR results, or None on error.
def text_from_doc(
    filepath: str,
    min_length: int = 4
) -> str:
    """
    Extract readable strings and metadata from binary Word (.doc) files.

    Args:
        filepath (str): Path to .doc file.
        min_length (int): Minimum string length to extract.

    Returns:
        str: Metadata and text content.
    """
    # Runs of >= min_length printable ASCII bytes.
    printable_run = re.compile(
        b'[' + re.escape(bytes(string.printable, 'ascii')) +
        b']{%d,}' % min_length
    )

    with open(filepath, 'rb') as handle:
        raw = handle.read()

    # dict.fromkeys dedupes while preserving first-seen order.
    unique = dict.fromkeys(
        match.decode(errors='ignore').strip()
        for match in printable_run.findall(raw)
    )

    skip_prefixes = ("HYPERLINK", "OLE2", "Normal.dotm")
    cleaned = [
        re.sub(r'\s+', ' ', line).strip()
        for line in unique
        if not line.startswith(skip_prefixes)
    ]
    return normalize("\n".join(cleaned))
Extract readable strings and metadata from binary Word (.doc) files.
Args: filepath (str): Path to .doc file. min_length (int): Minimum string length to extract.
Returns: str: Metadata and text content.
def text_from_docx(
    file_path: str
) -> Optional[str]:
    """
    Extract text, tables, and OCR images from a DOCX file.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        Optional[str]: Combined document text, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    doc = Document(path)
    parts: List[str] = []
    try:
        # Paragraph text, skipping empty paragraphs.
        for para in doc.paragraphs:
            stripped = para.text.strip()
            if stripped:
                parts.append(stripped + "\n")
        # Tables rendered as tab-separated rows.
        for table in doc.tables:
            parts.append("\n[Table]\n")
            for row in table.rows:
                parts.append(
                    "\t".join(cell.text.strip() for cell in row.cells) + "\n"
                )
        # OCR embedded images found via the part relationships.
        for rel in doc.part.rels:
            if "image" in doc.part.rels[rel].target_ref:
                blob = doc.part.rels[rel].target_part.blob
                img_path = f"/tmp/docx_img_{rel}.png"
                with open(img_path, "wb") as img_f:
                    img_f.write(blob)
                ocr = text_from_image(img_path) or ""
                parts.append(f"\n[Image OCR]\n{ocr}\n")
        return normalize("".join(parts))
    except Exception as e:
        print(f"Error processing DOCX: {e}")
        return None
Extract text, tables, and OCR images from a DOCX file.
Args: file_path (str): Path to the .docx file.
Returns: Optional[str]: Combined document text, or None on error.
def text_from_excel(
    file_path: str
) -> str:
    """
    Convert an Excel workbook to CSV text.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        str: CSV-formatted string.
    """
    path = clean_path(file_path)
    if not path:
        return ""
    try:
        buffer = StringIO()
        pd.read_excel(path).to_csv(buffer, index=False)
        return buffer.getvalue()
    except Exception as e:
        print(f"Failed Excel -> CSV: {e}")
        return ""
Convert an Excel workbook to CSV text.
Args: file_path (str): Path to the Excel file.
Returns: str: CSV-formatted string.
def text_from_image(
    file_path: str
) -> Optional[str]:
    """
    Perform OCR on an image file.

    Args:
        file_path (str): Path to the image.

    Returns:
        Optional[str]: Extracted text, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        with Image.open(path) as img:
            extracted = pytesseract.image_to_string(img).strip()
            # normalize() may return a falsy value; coerce to "".
            return normalize(extracted) or ""
    except Exception as e:
        print(f"Failed image OCR: {e}")
        return None
Perform OCR on an image file.
Args: file_path (str): Path to the image.
Returns: Optional[str]: Extracted text, or None on error.
def text_from_any(
    file_path: str
) -> Optional[str]:
    """
    Handle unknown file types by reporting stats and metadata.

    Args:
        file_path (str): Path to the file.

    Returns:
        Optional[str]: Plain-text report, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        st = os.stat(path)
        report = {
            "path": path,
            "size": st.st_size,
            "created": datetime.fromtimestamp(st.st_ctime).isoformat(),
            "modified": datetime.fromtimestamp(st.st_mtime).isoformat(),
        }
        lines = [f"{key}: {value}" for key, value in report.items()]
        return normalize("\n".join(lines))
    except Exception as e:
        print(f"Error on other file: {e}")
        return None
Handle unknown file types by reporting stats and metadata.
Args: file_path (str): Path to the file.
Returns: Optional[str]: Plain-text report, or None on error.