mrblack

 1#!/usr/bin/env python3
 2# -*- coding: utf-8 -*-
 3#
 4# File: __init__.py
 5# Author: Wadih Khairallah
 6# Description: 
 7# Created: 2025-05-12 16:47:22
 8# Modified: 2025-05-15 03:16:18
 9
from .pii import (
    extract_pii_text,
    extract_pii_file,
    extract_pii_url,
    extract_pii_image,
    extract_pii_screenshot,
)
from .textextract import (
    extract_text,
    extract_exif,
    extract_metadata,
    text_from_screenshot,
    text_from_url,
    text_from_html,
    text_from_audio,
    text_from_pdf,
    text_from_doc,
    text_from_docx,
    text_from_excel,
    text_from_image,
    text_from_any,
)

# Public API of the package.
# Bug fix: "text_from_screenshot" was imported above but omitted from
# __all__, so `from mrblack import *` silently failed to export it.
__all__ = [
    "extract_pii_text",
    "extract_pii_file",
    "extract_pii_url",
    "extract_pii_image",
    "extract_pii_screenshot",
    "extract_text",
    "extract_exif",
    "extract_metadata",
    "text_from_screenshot",
    "text_from_url",
    "text_from_html",
    "text_from_audio",
    "text_from_pdf",
    "text_from_doc",
    "text_from_docx",
    "text_from_excel",
    "text_from_image",
    "text_from_any",
]
def extract_pii_text(
    text: str,
    labels: Optional[Union[List[str], str]] = None
) -> Dict[str, List[str]]:
    """
    Extract PII matches from provided text.

    Args:
        text (str): The input text to scan for patterns.
        labels (Optional[Union[List[str], str]]): Specific label(s) to
            filter on; a bare string is treated as a one-element list.

    Returns:
        Dict[str, List[str]]: Mapping of each label to a sorted list of
        matched and cleaned strings.
    """
    if isinstance(labels, str):
        labels = [labels]
    patterns = PATTERNS
    if labels:
        # Keep only patterns that declare a named group for a requested label.
        patterns = [
            p for p in PATTERNS
            if any(re.search(rf"\(\?P<{lbl}>", p) for lbl in labels)
        ]
    results: Dict[str, set] = defaultdict(set)
    for pattern in patterns:
        try:
            rx = re.compile(pattern)
            for m in rx.finditer(text):
                for lbl, val in m.groupdict().items():
                    if not val:
                        continue
                    cleaned = _clean_value(lbl, val)
                    # Bug fix: test for None BEFORE post-processing; the
                    # original called .rstrip() on a possibly-None value
                    # for the "url" label, raising AttributeError.
                    if cleaned is None:
                        continue
                    if lbl == "url":
                        # Trim trailing punctuation/markdown artifacts.
                        cleaned = cleaned.rstrip("),.**")
                    results[lbl].add(cleaned)
        except re.error as e:
            # Bug fix: the message previously ran the pattern and the
            # error together with no separator ("{pattern}{e}").
            print(
                f"Invalid regex skipped: {pattern}: {e}",
                file=sys.stderr
            )
    return {lbl: sorted(vals) for lbl, vals in results.items()}

Extract PII matches from provided text.

Args: text (str): The input text to scan for patterns. labels (Optional[Union[List[str], str]]): Specific labels to filter on.

Returns: Dict[str, List[str]]: Mapping of each label to a sorted list of matched and cleaned strings.

def extract_pii_file(
    file_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from a single file's text content.

    Args:
        file_path (str): Path to the file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None when
        the file yields no text or no matches.
    """
    content = extract_text(file_path)
    if not content:
        return None
    findings = extract_pii_text(content, labels)
    return findings if findings else None

Extract PII from a single file's text content.

Args: file_path (str): Path to the file. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_url(
    path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from the text at a URL.

    Args:
        path (str): The URL to fetch.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None when
        the page yields no text or no matches.
    """
    page_text = text_from_url(path)
    if not page_text:
        return None
    findings = extract_pii_text(page_text, labels)
    return findings if findings else None

Extract PII from the text at a URL.

Args: path (str): The URL to fetch. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_image(
    image_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Extract PII from an image using OCR.

    Args:
        image_path (str): Path to the image file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None on an
        invalid path, empty OCR text, or no matches.
    """
    resolved = clean_path(image_path)
    if not (resolved and os.path.isfile(resolved)):
        print(f"[red]Invalid image path:[/] {image_path}")
        return None
    ocr_text = extract_text(resolved)
    if not ocr_text:
        return None
    findings = extract_pii_text(ocr_text, labels)
    return findings if findings else None

Extract PII from an image using OCR.

Args: image_path (str): Path to the image file. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_screenshot(
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Capture a screenshot and extract PII from its OCR text.

    Args:
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: Extraction results, or None when
        the capture yields no text or no matches.
    """
    screen_text = text_from_screenshot()
    if not screen_text:
        return None
    findings = extract_pii_text(screen_text, labels)
    return findings if findings else None

Capture a screenshot and extract PII from its OCR text.

Args: labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_text(
    file_path: str
) -> Optional[str]:
    """
    Extract text content from a local file or URL.

    Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

    Args:
        file_path (str): Path to the input file or URL.

    Returns:
        Optional[str]: Extracted text, or None if unsupported or error.
    """
    if is_url(file_path):
        return text_from_url(file_path)

    TEXT_MIME_TYPES = {
        "application/json", "application/xml", "application/x-yaml",
        "application/x-toml", "application/x-csv", "application/x-markdown",
    }

    path = clean_path(file_path)
    if not path:
        print(f"No such file: {file_path}")
        return None

    mime_type = magic.from_file(path, mime=True)
    try:
        if mime_type.startswith("text/html"):
            # Bug fix: this branch used a bare `if`, so the generic
            # "text/" branch below immediately overwrote `content` with
            # raw HTML source. It also passed a file *path* to
            # text_from_html(), which expects an HTML string — read the
            # file and hand over its contents instead.
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = text_from_html(f.read())

        elif mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()

        elif mime_type in [
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]:
            content = text_from_excel(path)

        elif mime_type == "application/pdf":
            content = text_from_pdf(path)

        elif mime_type == \
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            content = text_from_docx(path)

        elif mime_type == "application/msword":
            content = text_from_doc(path)

        elif mime_type.startswith("image/"):
            content = text_from_image(path)

        elif mime_type.startswith("audio/"):
            content = text_from_audio(path)

        else:
            # Unknown type: fall back to a stats/metadata report.
            content = text_from_any(path)

        if content:
            return content
        else:
            print(f"No content found for file: {path}")
            return None
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None

Extract text content from a local file or URL.

Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

Args: file_path (str): Path to the input file or URL.

Returns: Optional[str]: Extracted text, or None if unsupported or error.

def extract_exif(file_path: str) -> Optional[Dict[str, Any]]:
    """
    Extract EXIF metadata from a file using exiftool.

    Args:
        file_path (str): Path to the target file.

    Returns:
        Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure
        (exiftool missing, non-zero exit, or unparseable output).
    """
    try:
        proc = subprocess.run(
            ['exiftool', '-j', file_path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if proc.returncode != 0:
            return None
        # exiftool -j emits a JSON array with one object per input file.
        return json.loads(proc.stdout.decode())[0]
    except Exception as e:
        print(f"Exiftool failed: {e}")
        return None

Extract EXIF metadata from a file using exiftool.

Args: file_path (str): Path to the target file.

Returns: Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.

def extract_metadata(
    file_path: str
) -> Dict[str, Any]:
    """
    Extract comprehensive metadata from any file type.

    Args:
        file_path (str): Path to target file.

    Returns:
        Dict[str, Any]: Nested metadata structure; contains an "error"
        key when the file is missing or unreadable.
    """
    path = clean_path(file_path)
    if not path:
        return {"error": "File not found"}
    meta: Dict[str, Any] = {}
    try:
        stats = os.stat(path)
        meta["size_bytes"] = stats.st_size
        meta["mime"] = magic.from_file(path, mime=True)
        # Bug fix: the original `open(path,'rb').read()` leaked the file
        # handle and slurped the whole file into memory; hash in chunks
        # inside a context manager instead.
        digest = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(65536), b''):
                digest.update(chunk)
        meta["hashes"] = {"md5": digest.hexdigest()}
    except Exception as e:
        meta["error"] = str(e)
    return meta

Extract comprehensive metadata from any file type.

Args: file_path (str): Path to target file.

Returns: Dict[str, Any]: Nested metadata structure.

def text_from_url(
    url: str,
    render_js: bool = True
) -> Optional[str]:
    """
    Fetch and extract all visible text from a web page, including JS-rendered content.

    Args:
        url (str): Target webpage URL.
        render_js (bool): Whether to render JavaScript content.

    Returns:
        Optional[str]: Cleaned full-page text, or None on failure.
    """
    # Browser-like headers to reduce the chance of bot blocking.
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": url,
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1"
    }

    session = HTMLSession()
    try:
        response = session.get(url, headers=request_headers, timeout=20)
        if render_js:
            # Executes page JavaScript in a headless browser.
            response.html.render(timeout=20, sleep=1)
        return text_from_html(response.html.html)
    except Exception as e:
        print(f"[Error] {url} - {e}")
        return None
    finally:
        session.close()

Fetch and extract all visible text from a web page, including JS-rendered content.

Args: url (str): Target webpage URL. render_js (bool): Whether to render JavaScript content.

Returns: Optional[str]: Cleaned full-page text, or None on failure.

def text_from_html(html: str) -> str:
    """
    Extract readable text from raw HTML content.

    Args:
        html (str): HTML source as a string.

    Returns:
        str: Cleaned and normalized visible text.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Drop elements that never contribute visible body text.
    invisible_tags = [
        "script", "style", "noscript", "iframe", "meta", "link",
        "header", "footer", "form", "nav", "aside",
    ]
    for element in soup(invisible_tags):
        element.decompose()

    return normalize(soup.get_text(separator=" "))

Extract readable text from raw HTML content.

Args: html (str): HTML source as a string.

Returns: str: Cleaned and normalized visible text.

def text_from_audio(
    audio_file: str
) -> Optional[str]:
    """
    Transcribe audio to text using Google Speech Recognition.

    Args:
        audio_file (str): Path to the input audio file.

    Returns:
        Optional[str]: Transcription, or None on failure.
    """
    def _to_temp_wav(src: str) -> str:
        # Convert any pydub-supported format to a temporary WAV file.
        ext = os.path.splitext(src)[1].lstrip('.')
        segment = AudioSegment.from_file(src, format=ext)
        out_path = os.path.join(
            tempfile.gettempdir(), f"audio_{uuid4().hex}.wav"
        )
        segment.export(out_path, format='wav')
        return out_path

    recognizer = sr.Recognizer()
    wav_path = None
    is_temp = False

    try:
        extension = os.path.splitext(audio_file)[1].lower()
        if extension in ('.wav', '.wave'):
            wav_path = clean_path(audio_file)
        else:
            wav_path = _to_temp_wav(audio_file)
            is_temp = True

        if not wav_path:
            print("Invalid audio path.")
            return None

        with sr.AudioFile(wav_path) as source:
            captured = recognizer.record(source)
        return recognizer.recognize_google(captured)

    except sr.UnknownValueError:
        print("Could not understand audio.")
    except sr.RequestError as e:
        print(f"Speech recognition error: {e}")
    except Exception as e:
        print(f"Failed to process audio: {e}")
    finally:
        # Only delete files we created ourselves, never the caller's input.
        if is_temp and wav_path and os.path.exists(wav_path):
            try:
                os.remove(wav_path)
            except Exception as e:
                print(f"Failed to delete temp WAV file {wav_path}: {e}")

    return None

Transcribe audio to text using Google Speech Recognition.

Args: audio_file (str): Path to the input audio file.

Returns: Optional[str]: Transcription, or None on failure.

def text_from_pdf(
    pdf_path: str
) -> Optional[str]:
    """
    Extract text and OCR results from a PDF using PyMuPDF.

    Args:
        pdf_path (str): Path to PDF file.

    Returns:
        Optional[str]: Combined normalized text and image OCR results.
    """
    plain_text = ""
    temp_image_paths: List[str] = []
    # Bug fix: `doc` must exist before the try block — if pymupdf.open()
    # raised, the original `finally` hit a NameError on doc.close(),
    # masking the real exception.
    doc = None

    try:
        doc = pymupdf.open(pdf_path)
        for k, v in doc.metadata.items():
            plain_text += f"{k}: {v}\n"

        for i in range(len(doc)):
            page = doc.load_page(i)
            plain_text += f"\n--- Page {i + 1} ---\n"
            text = page.get_text()
            plain_text += text or "[No text]\n"

            # OCR every embedded image via a temp PNG on disk.
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                base = doc.extract_image(xref)
                img_bytes = base["image"]

                img_filename = f"pdf_page{i+1}_img{img_index}_{uuid4().hex}.png"
                img_path = os.path.join(tempfile.gettempdir(), img_filename)
                temp_image_paths.append(img_path)

                with open(img_path, "wb") as f:
                    f.write(img_bytes)

                ocr = text_from_image(img_path) or ""
                plain_text += f"\n[Image {img_index} OCR]\n{ocr}\n"

        return normalize(plain_text)
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None
    finally:
        for path in temp_image_paths:
            if os.path.exists(path):
                try:
                    os.remove(path)
                except Exception as e:
                    print(f"Failed to delete temp image {path}: {e}")
        if doc is not None:
            doc.close()

Extract text and OCR results from a PDF using PyMuPDF.

Args: pdf_path (str): Path to PDF file.

Returns: Optional[str]: Combined normalized text and image OCR results.

def text_from_doc(
    filepath: str,
    min_length: int = 4
) -> str:
    """
    Extract readable strings and metadata from binary Word (.doc) files.

    Args:
        filepath (str): Path to .doc file.
        min_length (int): Minimum string length to extract.

    Returns:
        str: Metadata and text content.
    """
    # Runs of printable ASCII at least `min_length` bytes long.
    matcher = re.compile(
        b'[' + re.escape(bytes(string.printable, 'ascii')) +
        b']{%d,}' % min_length
    )

    with open(filepath, 'rb') as handle:
        raw = handle.read()

    # Decode, strip, and de-duplicate while preserving first-seen order.
    unique_strings = dict.fromkeys(
        m.decode(errors='ignore').strip() for m in matcher.findall(raw)
    )

    # Drop Word-internal boilerplate and collapse runs of whitespace.
    skip_prefixes = ("HYPERLINK", "OLE2", "Normal.dotm")
    cleaned_lines = [
        re.sub(r'\s+', ' ', entry).strip()
        for entry in unique_strings
        if not entry.startswith(skip_prefixes)
    ]

    return normalize("\n".join(cleaned_lines))

Extract readable strings and metadata from binary Word (.doc) files.

Args: filepath (str): Path to .doc file. min_length (int): Minimum string length to extract.

Returns: str: Metadata and text content.

def text_from_docx(
    file_path: str
) -> Optional[str]:
    """
    Extract text, tables, and OCR from embedded images in a DOCX file.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        Optional[str]: Normalized full text content.
    """
    docx_path = clean_path(file_path)
    if not docx_path:
        return None

    extracted_images: List[str] = []
    collected = ""

    try:
        document = Document(docx_path)

        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                collected += stripped + "\n"

        for table in document.tables:
            collected += "\n[Table]\n"
            for row in table.rows:
                collected += "\t".join(c.text.strip() for c in row.cells) + "\n"

        # Embedded images live behind document relationships.
        for rel_id, rel in document.part.rels.items():
            if "image" not in rel.target_ref:
                continue

            image_path = os.path.join(
                tempfile.gettempdir(),
                f"docx_img_{rel_id}_{uuid4().hex}.png"
            )
            extracted_images.append(image_path)

            with open(image_path, "wb") as img_file:
                img_file.write(rel.target_part.blob)

            ocr = text_from_image(image_path) or ""
            collected += f"\n[Image OCR]\n{ocr}\n"

        return normalize(collected)

    except Exception as e:
        print(f"Error processing DOCX: {e}")
        return None
    finally:
        for image_path in extracted_images:
            if os.path.exists(image_path):
                try:
                    os.remove(image_path)
                except Exception as e:
                    print(f"Failed to delete temp DOCX image {image_path}: {e}")

Extract text, tables, and OCR from embedded images in a DOCX file.

Args: file_path (str): Path to the .docx file.

Returns: Optional[str]: Normalized full text content.

def text_from_excel(
    file_path: str
) -> str:
    """
    Convert an Excel workbook to CSV text.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        str: CSV-formatted string, or "" on failure.
    """
    workbook_path = clean_path(file_path)
    if not workbook_path:
        return ""
    try:
        buffer = StringIO()
        pd.read_excel(workbook_path).to_csv(buffer, index=False)
        return buffer.getvalue()
    except Exception as e:
        print(f"Failed Excel -> CSV: {e}")
        return ""

Convert an Excel workbook to CSV text.

Args: file_path (str): Path to the Excel file.

Returns: str: CSV-formatted string.

def text_from_image(
    file_path: str
) -> Optional[str]:
    """
    Perform OCR on an image file.

    Args:
        file_path (str): Path to the image.

    Returns:
        Optional[str]: Extracted text ("" when OCR finds nothing), or
        None on error.
    """
    image_path = clean_path(file_path)
    if not image_path:
        return None
    try:
        with Image.open(image_path) as img:
            raw = pytesseract.image_to_string(img)
        return normalize(raw.strip()) or ""
    except Exception as e:
        print(f"Failed image OCR: {e}")
        return None

Perform OCR on an image file.

Args: file_path (str): Path to the image.

Returns: Optional[str]: Extracted text, or None on error.

def text_from_any(
    file_path: str
) -> Optional[str]:
    """
    Handle unknown file types by reporting stats and metadata.

    Args:
        file_path (str): Path to the file.

    Returns:
        Optional[str]: Plain-text report, or None on error.
    """
    target = clean_path(file_path)
    if not target:
        return None
    try:
        st = os.stat(target)
        report = [
            f"path: {target}",
            f"size: {st.st_size}",
            f"created: {datetime.fromtimestamp(st.st_ctime).isoformat()}",
            f"modified: {datetime.fromtimestamp(st.st_mtime).isoformat()}",
        ]
        return normalize("\n".join(report))
    except Exception as e:
        print(f"Error on other file: {e}")
        return None

Handle unknown file types by reporting stats and metadata.

Args: file_path (str): Path to the file.

Returns: Optional[str]: Plain-text report, or None on error.