mrblack
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# File: __init__.py
# Author: Wadih Khairallah
# Description:
# Created: 2025-05-12 16:47:22
# Modified: 2025-05-12 18:07:52

from .pii import (
    extract as pii_text,
    file as pii_file,
    url as pii_url
)
from .textextract import (
    extract_text,
    extract_exif,
    extract_metadata,
    text_from_url,
    text_from_audio,
    text_from_pdf,
    text_from_doc,
    text_from_docx,
    text_from_excel,
    text_from_image,
    text_from_any
)

__all__ = [
    "pii_text",
    "pii_file",
    "pii_url",
    "extract_text",
    "extract_exif",
    "extract_metadata",
    "text_from_url",
    "text_from_audio",
    "text_from_pdf",
    "text_from_doc",
    "text_from_docx",
    "text_from_excel",
    "text_from_image",
    "text_from_any"
]
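A minimal import sketch of the public surface exported above; the names are the aliases defined in __all__, and the file path and sample contact details are placeholders.

from mrblack import pii_text, pii_file, extract_text, extract_metadata

# PII scan of an in-memory string.
matches = pii_text("Reach me at 555-867-5309 or jane@example.com")

# PII scan of a file on disk (returns None if nothing is found).
file_matches = pii_file("statement.pdf")

# Generic text extraction and basic metadata for the same file.
text = extract_text("statement.pdf")
meta = extract_metadata("statement.pdf")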
def extract(
    text: str,
    labels: Optional[Union[List[str], str]] = None
) -> Dict[str, List[str]]:
    """
    Extract PII matches from provided text.

    Args:
        text (str): The input text to scan for patterns.
        labels (Optional[Union[List[str], str]]): Specific labels to filter on.

    Returns:
        Dict[str, List[str]]: Mapping of each label to a sorted list of
            matched and cleaned strings.
    """
    if isinstance(labels, str):
        labels = [labels]
    patterns = PATTERNS
    if labels:
        patterns = [
            p for p in PATTERNS
            if any(re.search(rf"\(\?P<{lbl}>", p) for lbl in labels)
        ]
    results: Dict[str, set] = defaultdict(set)
    for pattern in patterns:
        try:
            rx = re.compile(pattern)
            for m in rx.finditer(text):
                for lbl, val in m.groupdict().items():
                    if not val:
                        continue
                    cleaned = _clean_value(lbl, val)
                    if cleaned is None:
                        continue
                    # Strip trailing punctuation that URL regexes commonly
                    # capture along with the link itself.
                    if lbl == "url":
                        cleaned = cleaned.rstrip("),.*")
                    results[lbl].add(cleaned)
        except re.error as e:
            print(
                f"Invalid regex skipped: {pattern} → {e}",
                file=sys.stderr
            )
    return {lbl: sorted(vals) for lbl, vals in results.items()}
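A hedged example of the label filter above; the "email" label is an assumption about what PATTERNS defines, so substitute whichever group names the pattern list actually uses.

from mrblack.pii import extract

sample = "Write to jane.doe@example.com or see https://example.com/contact."

# Scan with every compiled pattern.
everything = extract(sample)

# A bare string is accepted for labels and wrapped into a list internally,
# so these two calls are equivalent.
emails = extract(sample, labels="email")
emails_too = extract(sample, labels=["email"])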
def file(
    file_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from a single file's text content.

    Args:
        file_path (str): Path to the file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    text = extract_text(file_path)
    if not text:
        return None
    data = extract(text, labels)
    return data or None
def url(
    path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from the text at a URL.

    Args:
        path (str): The URL to fetch.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None.
    """
    text = text_from_url(path)
    if not text:
        return None
    data = extract(text, labels)
    return data or None
def extract_text(
    file_path: str
) -> Optional[str]:
    """
    Extract text content from a file based on MIME type.

    Supports text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

    Args:
        file_path (str): Path to the input file.

    Returns:
        Optional[str]: Extracted text, or None if unsupported or error.
    """
    TEXT_MIME_TYPES = {
        # programming, config, data types...
        "application/json", "application/xml", "application/x-yaml",
        "application/x-toml", "application/x-csv", "application/x-markdown",
        # add others as needed
    }

    path = clean_path(file_path)
    if not path:
        print(f"No such file: {file_path}")
        return None

    mime_type = magic.from_file(path, mime=True)
    try:
        if mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        elif mime_type in [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]:
            content = text_from_excel(path)
        elif mime_type == "application/pdf":
            content = text_from_pdf(path)
        elif mime_type == \
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            content = text_from_docx(path)
        elif mime_type == "application/msword":
            content = text_from_doc(path)  # legacy .doc
        elif mime_type.startswith("image/"):
            content = text_from_image(path)
        elif mime_type.startswith("audio/"):
            content = text_from_audio(path)
        else:
            content = text_from_any(path)

        if content:
            return content
        else:
            print(f"No content found for file: {path}")
            return None
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None
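A short usage sketch of the dispatcher, assuming the optional system tools behind some branches (Tesseract for images, ffmpeg for audio) are installed; the paths are placeholders.

from mrblack import extract_text

# The dispatcher picks an extractor based on the detected MIME type,
# so the same call works for plain text, PDFs, spreadsheets, images, etc.
for path in ["report.pdf", "notes.txt", "scan.png"]:
    content = extract_text(path)
    if content is None:
        print(f"Could not extract anything from {path}")
    else:
        print(f"{path}: {len(content)} characters extracted")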
def extract_exif(
    file_path: str
) -> Optional[Dict[str, Any]]:
    """
    Extract EXIF metadata from a file using exiftool.

    Args:
        file_path (str): Path to the target file.

    Returns:
        Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
    """
    exif_data: Optional[Dict[str, Any]] = None
    try:
        result = subprocess.run(
            ['exiftool', '-j', file_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if result.returncode == 0:
            exif_data = json.loads(result.stdout.decode())[0]
    except Exception as e:
        print(f"Exiftool failed: {e}")
    return exif_data
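A small sketch of the EXIF helper; it assumes the exiftool binary is on PATH (the function shells out to it), and the tag names shown are common exiftool keys that may be absent for a given file.

from mrblack import extract_exif

exif = extract_exif("photo.jpg")  # placeholder path
if exif:
    # exiftool emits one flat JSON object per file; available keys vary by format.
    print(exif.get("Model"), exif.get("CreateDate"))
else:
    print("No EXIF data (missing file, unsupported format, or exiftool not installed)")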
def extract_metadata(
    file_path: str
) -> Dict[str, Any]:
    """
    Extract comprehensive metadata from any file type.

    Args:
        file_path (str): Path to target file.

    Returns:
        Dict[str, Any]: Nested metadata structure.
    """
    path = clean_path(file_path)
    if not path:
        return {"error": "File not found"}
    meta: Dict[str, Any] = {}
    try:
        stats = os.stat(path)
        meta["size_bytes"] = stats.st_size
        meta["mime"] = magic.from_file(path, mime=True)
        # Hash the file contents without leaking an open file handle.
        with open(path, 'rb') as f:
            meta["hashes"] = {"md5": hashlib.md5(f.read()).hexdigest()}
    except Exception as e:
        meta["error"] = str(e)
    return meta
def text_from_url(
    url: str
) -> Optional[str]:
    """
    Fetch and extract visible text from a web page.

    Args:
        url (str): The target webpage URL.

    Returns:
        Optional[str]: Extracted text, or None on failure.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        for tag in soup(
            ["script", "style", "noscript", "iframe",
             "header", "footer", "meta", "link"]
        ):
            tag.decompose()
        return soup.get_text(separator=" ").strip()
    except requests.RequestException as e:
        print(f"Error fetching URL: {url} - {e}")
        return None
def text_from_audio(
    audio_file: str
) -> Optional[str]:
    """
    Transcribe audio to text via Google Speech Recognition.

    Args:
        audio_file (str): Path to the audio file.

    Returns:
        Optional[str]: Transcription, or None on error.
    """
    def audio_to_wav(
        file_path: str
    ) -> str:
        _, ext = os.path.splitext(file_path)
        ext = ext.lstrip('.')
        audio = AudioSegment.from_file(file_path, format=ext)
        wav_path = file_path.replace(f".{ext}", ".wav")
        audio.export(wav_path, format='wav')
        return wav_path

    _, ext = os.path.splitext(audio_file)
    if ext.lower() not in ['.wav', '.wave']:
        audio_file = audio_to_wav(audio_file)
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            audio = recognizer.record(source)
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print(f"Speech service error: {e}")
    return None
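A usage sketch under two assumptions: ffmpeg is available to pydub for the on-the-fly WAV conversion, and the machine has network access, since recognize_google calls Google's hosted speech API. The file name is a placeholder.

from mrblack import text_from_audio

transcript = text_from_audio("meeting.mp3")  # converted to WAV before recognition
if transcript:
    print(transcript)
else:
    print("No transcript (unintelligible audio or speech service unavailable)")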
def text_from_pdf(
    pdf_path: str
) -> Optional[str]:
    """
    Extract text and image OCR from a PDF using PyMuPDF.

    Args:
        pdf_path (str): Path to PDF file.

    Returns:
        Optional[str]: Combined text and OCR results, or None on error.
    """
    plain_text = ""
    try:
        doc = fitz.open(pdf_path)
        # metadata
        for k, v in doc.metadata.items():
            plain_text += f"{k}: {v}\n"
        for i in range(len(doc)):
            page = doc.load_page(i)
            plain_text += f"\n--- Page {i+1} ---\n"
            txt = page.get_text()
            plain_text += txt or "[No text]\n"
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base = doc.extract_image(xref)
                img_bytes = base["image"]
                img_path = f"/tmp/page{i+1}-img{img_index}.png"
                with open(img_path, "wb") as img_file:
                    img_file.write(img_bytes)
                ocr = text_from_image(img_path) or ""
                plain_text += f"\n[Image {img_index} OCR]\n{ocr}\n"
        doc.close()
        return plain_text
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None
def text_from_doc(
    filepath: str,
    min_length: int = 4
) -> str:
    """
    Extract readable strings and metadata from binary Word (.doc) files.

    Args:
        filepath (str): Path to .doc file.
        min_length (int): Minimum string length to extract.

    Returns:
        str: Metadata and text content.
    """
    def extract_printable_strings(
        data: bytes
    ) -> List[str]:
        pattern = re.compile(
            b'[' + re.escape(bytes(string.printable, 'ascii')) +
            b']{%d,}' % min_length
        )
        found = pattern.findall(data)
        return list(dict.fromkeys(m.decode(errors='ignore').strip()
                                  for m in found))

    def clean_strings(
        strs: List[str]
    ) -> List[str]:
        cleaned: List[str] = []
        skip = ["HYPERLINK", "OLE2", "Normal.dotm"]
        for line in strs:
            if any(line.startswith(pref) for pref in skip):
                continue
            cleaned.append(re.sub(r'\s+', ' ', line).strip())
        return cleaned

    with open(filepath, 'rb') as f:
        data = f.read()
    strings = extract_printable_strings(data)
    strings = clean_strings(strings)
    return "\n".join(strings)
def text_from_docx(
    file_path: str
) -> Optional[str]:
    """
    Extract text, tables, and OCR images from a DOCX file.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        Optional[str]: Combined document text, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    plain_text = ""
    try:
        # Open the document inside the try block so corrupt or non-DOCX
        # input returns None as documented instead of raising.
        doc = Document(path)
        for p in doc.paragraphs:
            if p.text.strip():
                plain_text += p.text.strip() + "\n"
        for tbl in doc.tables:
            plain_text += "\n[Table]\n"
            for row in tbl.rows:
                plain_text += "\t".join(c.text.strip()
                                        for c in row.cells) + "\n"
        for rel in doc.part.rels:
            if "image" in doc.part.rels[rel].target_ref:
                blob = doc.part.rels[rel].target_part.blob
                img_path = f"/tmp/docx_img_{rel}.png"
                with open(img_path, "wb") as img_f:
                    img_f.write(blob)
                ocr = text_from_image(img_path) or ""
                plain_text += f"\n[Image OCR]\n{ocr}\n"
        return plain_text
    except Exception as e:
        print(f"Error processing DOCX: {e}")
        return None
def text_from_excel(
    file_path: str
) -> str:
    """
    Convert an Excel workbook to CSV text.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        str: CSV-formatted string.
    """
    path = clean_path(file_path)
    if not path:
        return ""
    try:
        df = pd.read_excel(path)
        out = StringIO()
        df.to_csv(out, index=False)
        return out.getvalue()
    except Exception as e:
        print(f"Failed Excel -> CSV: {e}")
        return ""
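A brief sketch of round-tripping the CSV output; it assumes an Excel engine such as openpyxl is installed for pandas, and note that read_excel with default arguments converts only the workbook's first sheet. The path is a placeholder.

from io import StringIO

import pandas as pd

from mrblack import text_from_excel

csv_text = text_from_excel("inventory.xlsx")
if csv_text:
    # The return value is plain CSV, so it can be re-parsed or piped elsewhere.
    df = pd.read_csv(StringIO(csv_text))
    print(df.head())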
def text_from_image(
    file_path: str
) -> Optional[str]:
    """
    Perform OCR on an image file.

    Args:
        file_path (str): Path to the image.

    Returns:
        Optional[str]: Extracted text, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        with Image.open(path) as img:
            txt = pytesseract.image_to_string(img).strip()
            return txt or ""
    except Exception as e:
        print(f"Failed image OCR: {e}")
        return None
def text_from_any(
    file_path: str
) -> Optional[str]:
    """
    Handle unknown file types by reporting stats and metadata.

    Args:
        file_path (str): Path to the file.

    Returns:
        Optional[str]: Plain-text report, or None on error.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        stats = os.stat(path)
        info = {
            "path": path,
            "size": stats.st_size,
            "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
            "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
        }
        return "\n".join(f"{k}: {v}" for k, v in info.items())
    except Exception as e:
        print(f"Error on other file: {e}")
        return None