mrblack

 1#!/usr/bin/env python3
 2# -*- coding: utf-8 -*-
 3#
 4# File: __init__.py
 5# Author: Wadih Khairallah
 6# Description: 
 7# Created: 2025-05-12 16:47:22
 8# Modified: 2025-05-14 18:49:23
 9
10from .pii import (
11    extract_pii_text,
12    extract_pii_file,
13    extract_pii_url,
14    extract_pii_image,
15    extract_pii_screenshot
16)
17from .textextract import (
18    extract_text,
19    extract_exif,
20    extract_metadata,
21    text_from_screenshot,
22    text_from_url,
23    text_from_audio,
24    text_from_pdf,
25    text_from_doc,
26    text_from_docx,
27    text_from_excel,
28    text_from_image,
29    text_from_any
30)
31
# Public API of the package. Every name imported from .pii and
# .textextract above is listed here so that `from <package> import *`
# exposes the same helpers as explicit imports do.
__all__ = [
    "extract_pii_text",
    "extract_pii_file",
    "extract_pii_url",
    "extract_pii_image",
    "extract_pii_screenshot",
    "extract_text",
    "extract_exif",
    "extract_metadata",
    "text_from_screenshot",
    "text_from_url",
    "text_from_audio",
    "text_from_pdf",
    "text_from_doc",
    "text_from_docx",
    "text_from_excel",
    "text_from_image",
    "text_from_any",
]
def extract_pii_text( text: str, labels: Union[List[str], str, NoneType] = None) -> Dict[str, List[str]]:
 71def extract_pii_text(
 72    text: str,
 73    labels: Optional[Union[List[str], str]] = None
 74) -> Dict[str, List[str]]:
 75    """
 76    Extract PII matches from provided text.
 77
 78    Args:
 79        text (str): The input text to scan for patterns.
 80        labels (Optional[Union[List[str], str]]): Specific labels to filter on.
 81
 82    Returns:
 83        Dict[str, List[str]]: Mapping of each label to a sorted list of
 84        matched and cleaned strings.
 85    """
 86    if isinstance(labels, str):
 87        labels = [labels]
 88    patterns = PATTERNS
 89    if labels:
 90        patterns = [
 91            p for p in PATTERNS
 92            if any(re.search(rf"\(\?P<{lbl}>", p) for lbl in labels)
 93        ]
 94    results: Dict[str, set] = defaultdict(set)
 95    for pattern in patterns:
 96        try:
 97            rx = re.compile(pattern)
 98            for m in rx.finditer(text):
 99                for lbl, val in m.groupdict().items():
100                    if not val:
101                        continue
102                    cleaned = _clean_value(lbl, val)
103                    if lbl == "url":
104                        cleaned = cleaned.rstrip("),.**")
105                    if cleaned is not None:
106                        results[lbl].add(cleaned)
107        except re.error as e:
108            print(
109                f"Invalid regex skipped: {pattern}{e}",
110                file=sys.stderr
111            )
112    return {lbl: sorted(vals) for lbl, vals in results.items()}

Extract PII matches from provided text.

Args: text (str): The input text to scan for patterns. labels (Optional[Union[List[str], str]]): Specific labels to filter on.

Returns: Dict[str, List[str]]: Mapping of each label to a sorted list of matched and cleaned strings.

def extract_pii_file( file_path: str, labels: Union[List[str], str, NoneType] = None) -> Optional[Dict[str, List[str]]]:
def extract_pii_file(
    file_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Run PII extraction over the text content of one file.

    The text is obtained with ``extract_text`` and then scanned with
    ``extract_pii_text``.

    Args:
        file_path (str): Path to the file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: The non-empty extraction result,
        or None when no text was obtained or nothing matched.
    """
    content = extract_text(file_path)
    if content:
        matches = extract_pii_text(content, labels)
        if matches:
            return matches
    return None

Extract PII from a single file's text content.

Args: file_path (str): Path to the file. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_url( path: str, labels: Union[List[str], str, NoneType] = None) -> Optional[Dict[str, List[str]]]:
def extract_pii_url(
    path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Run PII extraction over the text fetched from a URL.

    The page text comes from ``text_from_url`` and is scanned with
    ``extract_pii_text``.

    Args:
        path (str): The URL to fetch.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: The non-empty extraction result,
        or None when no text was fetched or nothing matched.
    """
    page_text = text_from_url(path)
    if not page_text:
        return None
    found = extract_pii_text(page_text, labels)
    # An empty mapping is reported as None rather than {}.
    return found if found else None

Extract PII from the text at a URL.

Args: path (str): The URL to fetch. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_image( image_path: str, labels: Union[List[str], str, NoneType] = None) -> Optional[Dict[str, List[str]]]:
def extract_pii_image(
    image_path: str,
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Run PII extraction over the text recovered from an image file.

    The path is resolved with ``clean_path`` and must point to an
    existing regular file; its text is then obtained via ``extract_text``
    and scanned with ``extract_pii_text``.

    Args:
        image_path (str): Path to the image file.
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: The non-empty extraction result,
        or None when the path is invalid, no text was obtained, or
        nothing matched.
    """
    resolved = clean_path(image_path)
    if not (resolved and os.path.isfile(resolved)):
        print(f"[red]Invalid image path:[/] {image_path}")
        return None

    recovered = extract_text(resolved)
    if recovered:
        matches = extract_pii_text(recovered, labels)
        if matches:
            return matches
    return None

Extract PII from an image using OCR.

Args: image_path (str): Path to the image file. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_screenshot( labels: Union[List[str], str, NoneType] = None) -> Optional[Dict[str, List[str]]]:
def extract_pii_screenshot(
    labels: Optional[Union[List[str], str]] = None
) -> Optional[Dict[str, List[str]]]:
    """
    Run PII extraction over the text returned by ``text_from_screenshot``.

    Args:
        labels (Optional[Union[List[str], str]]): Labels to filter.

    Returns:
        Optional[Dict[str, List[str]]]: The non-empty extraction result,
        or None when no text was captured or nothing matched.
    """
    captured = text_from_screenshot()
    if not captured:
        return None
    matches = extract_pii_text(captured, labels)
    if not matches:
        return None
    return matches

Capture a screenshot and extract PII from its OCR text.

Args: labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_text(file_path: str) -> Optional[str]:
def extract_text(
    file_path: str
) -> Optional[str]:
    """
    Extract text content from a local file or URL.

    URLs are delegated to ``text_from_url``. Local files are dispatched
    on the MIME type reported by ``magic.from_file``: text-like types are
    read directly as UTF-8 (undecodable bytes ignored); Excel, PDF, DOCX,
    DOC, image and audio files go to their dedicated ``text_from_*``
    helper; anything else falls back to ``text_from_any``.

    Args:
        file_path (str): Path to the input file or URL.

    Returns:
        Optional[str]: Extracted text, or None if the path could not be
        resolved, no content was produced, or any step (including MIME
        detection) raised an error.
    """
    if is_url(file_path):
        return text_from_url(file_path)

    # Non-"text/*" MIME types that are nevertheless plain text.
    TEXT_MIME_TYPES = {
        "application/json", "application/xml", "application/x-yaml",
        "application/x-toml", "application/x-csv", "application/x-markdown",
    }
    EXCEL_MIME_TYPES = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )
    DOCX_MIME_TYPE = (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document"
    )

    path = clean_path(file_path)
    if not path:
        print(f"No such file: {file_path}")
        return None

    try:
        # MIME detection lives inside the try block so that an unreadable
        # or vanished file yields None, like every other failure here,
        # instead of an unhandled exception.
        mime_type = magic.from_file(path, mime=True)

        if mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        elif mime_type in EXCEL_MIME_TYPES:
            content = text_from_excel(path)
        elif mime_type == "application/pdf":
            content = text_from_pdf(path)
        elif mime_type == DOCX_MIME_TYPE:
            content = text_from_docx(path)
        elif mime_type == "application/msword":
            content = text_from_doc(path)
        elif mime_type.startswith("image/"):
            content = text_from_image(path)
        elif mime_type.startswith("audio/"):
            content = text_from_audio(path)
        else:
            content = text_from_any(path)
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None

    if content:
        return content
    print(f"No content found for file: {path}")
    return None

Extract text content from a local file or URL.

Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

Args: file_path (str): Path to the input file or URL.

Returns: Optional[str]: Extracted text, or None if unsupported or error.

def extract_exif(file_path: str) -> Optional[Dict[str, Any]]:
def extract_exif(
    file_path: str
) -> Optional[Dict[str, Any]]:
    """
    Read a file's metadata by invoking the ``exiftool`` command.

    Runs ``exiftool -j <file_path>`` and, when the command exits with
    status 0, parses its standard output as JSON and returns the first
    element of the decoded value.

    Args:
        file_path (str): Path to the target file.

    Returns:
        Optional[Dict[str, Any]]: Parsed EXIF data, or None when the
        command exits non-zero or anything raises (the error is printed).
    """
    command = ['exiftool', '-j', file_path]
    try:
        proc = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if proc.returncode != 0:
            return None
        records = json.loads(proc.stdout.decode())
        return records[0]
    except Exception as e:
        print(f"Exiftool failed: {e}")
        return None

Extract EXIF metadata from a file using exiftool.

Args: file_path (str): Path to the target file.

Returns: Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.

def extract_metadata(file_path: str) -> Dict[str, Any]:
def extract_metadata(
    file_path: str
) -> Dict[str, Any]:
    """
    Collect basic metadata for a file.

    Args:
        file_path (str): Path to target file.

    Returns:
        Dict[str, Any]: ``{"error": "File not found"}`` when the path
        cannot be resolved by ``clean_path``. Otherwise a dict with
        ``size_bytes``, ``mime`` and ``hashes`` (containing the ``md5``
        hex digest). If a step fails, the keys gathered so far are kept
        and an ``error`` key holds the exception text.
    """
    path = clean_path(file_path)
    if not path:
        return {"error": "File not found"}
    meta: Dict[str, Any] = {}
    try:
        stats = os.stat(path)
        meta["size_bytes"] = stats.st_size
        meta["mime"] = magic.from_file(path, mime=True)
        # Hash in fixed-size chunks inside a context manager: the handle
        # is always closed and large files are never fully loaded into
        # memory. The digest is identical to hashing the whole content.
        digest = hashlib.md5()
        with open(path, 'rb') as fh:
            for chunk in iter(lambda: fh.read(1 << 20), b""):
                digest.update(chunk)
        meta["hashes"] = {"md5": digest.hexdigest()}
    except Exception as e:
        meta["error"] = str(e)
    return meta

Extract comprehensive metadata from any file type.

Args: file_path (str): Path to target file.

Returns: Dict[str, Any]: Nested metadata structure.

def text_from_url(url: str) -> Optional[str]:
def text_from_url(
    url: str
) -> Optional[str]:
    """
    Download a web page and return its text with boilerplate tags removed.

    The page is fetched with a 10 second timeout, parsed with
    BeautifulSoup's "html.parser", stripped of script/style/noscript/
    iframe/header/footer/meta/link elements, and the remaining text is
    passed through ``normalize``.

    Args:
        url (str): The target webpage URL.

    Returns:
        Optional[str]: Extracted text, or None when the request fails
        (``requests.RequestException``); the error is printed.
    """
    unwanted_tags = [
        "script", "style", "noscript", "iframe",
        "header", "footer", "meta", "link",
    ]
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html.parser")
        for element in soup.find_all(unwanted_tags):
            element.decompose()
        visible = soup.get_text(separator=" ")
        return normalize(visible.strip())
    except requests.RequestException as e:
        print(f"Error fetching URL: {url} - {e}")
        return None

Fetch and extract visible text from a web page.

Args: url (str): The target webpage URL.

Returns: Optional[str]: Extracted text, or None on failure.

def text_from_audio(audio_file: str) -> Optional[str]:
def text_from_audio(
    audio_file: str
) -> Optional[str]:
    """
    Transcribe audio to text via Google Speech Recognition.

    Files whose extension is not ``.wav``/``.wave`` are first converted
    to WAV with ``AudioSegment``; the converted file is written next to
    the source file and is left on disk.

    Args:
        audio_file (str): Path to the audio file.

    Returns:
        Optional[str]: Transcription, or None when the audio could not
        be understood or the speech service reported an error.
        NOTE(review): errors raised during the WAV conversion or while
        opening the audio file are not caught here and propagate.
    """
    def audio_to_wav(
        file_path: str
    ) -> str:
        # Split once and rebuild from the root. A textual replace of
        # ".ext" would also rewrite matching directory names, and for a
        # file without an extension it would replace every "." in the
        # path (or silently export over the source file).
        root, ext = os.path.splitext(file_path)
        audio = AudioSegment.from_file(file_path, format=ext.lstrip('.'))
        wav_path = root + ".wav"
        audio.export(wav_path, format='wav')
        return wav_path

    _, ext = os.path.splitext(audio_file)
    if ext.lower() not in ['.wav', '.wave']:
        audio_file = audio_to_wav(audio_file)
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            audio = recognizer.record(source)
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print(f"Speech service error: {e}")
    return None

Transcribe audio to text via Google Speech Recognition.

Args: audio_file (str): Path to the audio file.

Returns: Optional[str]: Transcription, or None on error.

def text_from_pdf(pdf_path: str) -> Optional[str]:
def text_from_pdf(
    pdf_path: str
) -> Optional[str]:
    """
    Extract text and image OCR from a PDF using PyMuPDF.

    The output starts with one ``key: value`` line per document metadata
    entry. Each page then contributes a ``--- Page N ---`` header, its
    text (or ``[No text]``), and one ``[Image K OCR]`` section per
    embedded image, produced by ``text_from_image``. The combined text
    is passed through ``normalize``.

    Args:
        pdf_path (str): Path to PDF file.

    Returns:
        Optional[str]: Combined text and OCR results, or None on error
        (the error is printed).
    """
    # Local imports keep this block self-contained.
    import os
    import tempfile

    parts = []
    try:
        doc = pymupdf.open(pdf_path)
        try:
            # metadata
            for k, v in doc.metadata.items():
                parts.append(f"{k}: {v}\n")
            for i in range(len(doc)):
                page = doc.load_page(i)
                parts.append(f"\n--- Page {i+1} ---\n")
                parts.append(page.get_text() or "[No text]\n")
                for img_index, img in enumerate(
                    page.get_images(full=True), start=1
                ):
                    base = doc.extract_image(img[0])
                    # A unique scratch file avoids the collisions (and
                    # leftover files) of a fixed /tmp/pageN-imgM.png name.
                    suffix = "." + (base.get("ext") or "png")
                    with tempfile.NamedTemporaryFile(
                        suffix=suffix, delete=False
                    ) as img_file:
                        img_file.write(base["image"])
                        img_path = img_file.name
                    try:
                        ocr = text_from_image(img_path) or ""
                    finally:
                        os.remove(img_path)
                    parts.append(f"\n[Image {img_index} OCR]\n{ocr}\n")
        finally:
            # Release the document even when a page fails to process.
            doc.close()
        return normalize("".join(parts))
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None

Extract text and image OCR from a PDF using PyMuPDF.

Args: pdf_path (str): Path to PDF file.

Returns: Optional[str]: Combined text and OCR results, or None on error.

def text_from_doc(filepath: str, min_length: int = 4) -> str:
def text_from_doc(
    filepath: str,
    min_length: int = 4
) -> str:
    """
    Recover readable strings from a binary Word (.doc) file.

    The file is read as raw bytes and scanned for runs of at least
    ``min_length`` printable ASCII characters. Runs are decoded,
    stripped and de-duplicated in first-seen order; entries starting
    with "HYPERLINK", "OLE2" or "Normal.dotm" are dropped and internal
    whitespace is collapsed to single spaces. The remaining lines are
    joined with newlines and passed through ``normalize``.

    Args:
        filepath (str): Path to .doc file.
        min_length (int): Minimum string length to extract.

    Returns:
        str: The normalized, newline-joined strings.
    """
    with open(filepath, 'rb') as handle:
        raw = handle.read()

    printable_run = re.compile(
        b'[' + re.escape(bytes(string.printable, 'ascii')) +
        b']{%d,}' % min_length
    )

    # A dict keeps the first occurrence of each string in file order.
    unique = {}
    for run in printable_run.findall(raw):
        unique.setdefault(run.decode(errors='ignore').strip(), None)

    noise_prefixes = ("HYPERLINK", "OLE2", "Normal.dotm")
    lines = [
        re.sub(r'\s+', ' ', entry).strip()
        for entry in unique
        if not entry.startswith(noise_prefixes)
    ]
    return normalize("\n".join(lines))

Extract readable strings and metadata from binary Word (.doc) files.

Args: filepath (str): Path to .doc file. min_length (int): Minimum string length to extract.

Returns: str: Metadata and text content.

def text_from_docx(file_path: str) -> Optional[str]:
def text_from_docx(
    file_path: str
) -> Optional[str]:
    """
    Extract text, tables, and OCR images from a DOCX file.

    Non-blank paragraphs come first, one per line. Each table follows
    under a ``[Table]`` header with tab-separated cells, then one
    ``[Image OCR]`` section per relationship whose target reference
    contains "image", produced by ``text_from_image``. The combined
    text is passed through ``normalize``.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        Optional[str]: Combined document text, or None when the path
        cannot be resolved or any step, including opening the
        document, fails (the error is printed).
    """
    # Local imports keep this block self-contained.
    import os
    import tempfile

    path = clean_path(file_path)
    if not path:
        return None
    try:
        # Opening the document is inside the try block so a corrupt or
        # non-DOCX file yields None instead of an unhandled exception.
        doc = Document(path)
        parts = []
        for p in doc.paragraphs:
            stripped = p.text.strip()
            if stripped:
                parts.append(stripped + "\n")
        for tbl in doc.tables:
            parts.append("\n[Table]\n")
            for row in tbl.rows:
                parts.append(
                    "\t".join(c.text.strip() for c in row.cells) + "\n"
                )
        for rel in doc.part.rels:
            relation = doc.part.rels[rel]
            if "image" not in relation.target_ref:
                continue
            # A unique scratch file avoids the collisions (and leftover
            # files) of a fixed /tmp/docx_img_<rel>.png name.
            with tempfile.NamedTemporaryFile(
                suffix=".png", delete=False
            ) as img_f:
                img_f.write(relation.target_part.blob)
                img_path = img_f.name
            try:
                ocr = text_from_image(img_path) or ""
            finally:
                os.remove(img_path)
            parts.append(f"\n[Image OCR]\n{ocr}\n")
        return normalize("".join(parts))
    except Exception as e:
        print(f"Error processing DOCX: {e}")
        return None

Extract text, tables, and OCR images from a DOCX file.

Args: file_path (str): Path to the .docx file.

Returns: Optional[str]: Combined document text, or None on error.

def text_from_excel(file_path: str) -> str:
def text_from_excel(
    file_path: str
) -> str:
    """
    Render an Excel file as CSV text.

    The file is loaded with ``pd.read_excel`` using its default
    arguments and written out with ``DataFrame.to_csv(index=False)``.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        str: CSV-formatted string, or "" when the path cannot be
        resolved or the conversion fails (the error is printed).
    """
    workbook = clean_path(file_path)
    if not workbook:
        return ""
    try:
        buffer = StringIO()
        pd.read_excel(workbook).to_csv(buffer, index=False)
    except Exception as e:
        print(f"Failed Excel -> CSV: {e}")
        return ""
    return buffer.getvalue()

Convert an Excel workbook to CSV text.

Args: file_path (str): Path to the Excel file.

Returns: str: CSV-formatted string.

def text_from_image(file_path: str) -> Optional[str]:
def text_from_image(
    file_path: str
) -> Optional[str]:
    """
    Run Tesseract OCR over an image file.

    Args:
        file_path (str): Path to the image.

    Returns:
        Optional[str]: The stripped and normalized OCR text ("" when
        normalization yields nothing), or None when the path cannot be
        resolved or OCR fails (the error is printed).
    """
    resolved = clean_path(file_path)
    if not resolved:
        return None
    try:
        with Image.open(resolved) as picture:
            recognised = pytesseract.image_to_string(picture)
            cleaned = normalize(recognised.strip())
            # Any falsy result from normalize is reported as "".
            return cleaned if cleaned else ""
    except Exception as e:
        print(f"Failed image OCR: {e}")
        return None

Perform OCR on an image file.

Args: file_path (str): Path to the image.

Returns: Optional[str]: Extracted text, or None on error.

def text_from_any(file_path: str) -> Optional[str]:
def text_from_any(
    file_path: str
) -> Optional[str]:
    """
    Describe a file of unknown type by its filesystem stats.

    Builds a four-line ``key: value`` report holding the resolved path,
    the size in bytes, and ISO-formatted timestamps taken from
    ``st_ctime`` (labelled "created") and ``st_mtime`` (labelled
    "modified"), then passes it through ``normalize``.

    Args:
        file_path (str): Path to the file.

    Returns:
        Optional[str]: Plain-text report, or None when the path cannot
        be resolved or an error occurs (the error is printed).
    """
    target = clean_path(file_path)
    if not target:
        return None
    try:
        st = os.stat(target)
        created = datetime.fromtimestamp(st.st_ctime).isoformat()
        modified = datetime.fromtimestamp(st.st_mtime).isoformat()
        report_lines = [
            f"path: {target}",
            f"size: {st.st_size}",
            f"created: {created}",
            f"modified: {modified}",
        ]
        return normalize("\n".join(report_lines))
    except Exception as e:
        print(f"Error on other file: {e}")
        return None

Handle unknown file types by reporting stats and metadata.

Args: file_path (str): Path to the file.

Returns: Optional[str]: Plain-text report, or None on error.