mrblack

 1#!/usr/bin/env python3
 2# -*- coding: utf-8 -*-
 3#
 4# File: __init__.py
 5# Author: Wadih Khairallah
 6# Description: Public API of the mrblack package; re-exports the PII and text-extraction helpers.
 7# Created: 2025-05-12 16:47:22
 8# Modified: 2025-05-12 17:35:28
 9
10from mrblack.pii import extract as extract_pii
11from mrblack.textextract import (
12    extract_text,
13    extract_exif,
14    extract_metadata,
15    text_from_url,
16    text_from_audio,
17    text_from_pdf,
18    text_from_doc,
19    text_from_docx,
20    text_from_excel,
21    text_from_image,
22    text_from_any
23)
24
# Names exported by ``from mrblack import *``; mirrors the imports above.
__all__ = [
    # PII detection (mrblack.pii.extract under its public alias)
    "extract_pii",
    # Generic file-level extractors
    "extract_text",
    "extract_exif",
    "extract_metadata",
    # Source-specific text extractors
    "text_from_url",
    "text_from_audio",
    "text_from_pdf",
    "text_from_doc",
    "text_from_docx",
    "text_from_excel",
    "text_from_image",
    "text_from_any",
]
def extract_pii( text: str, labels: Union[List[str], str, NoneType] = None) -> Dict[str, List[str]]:
 73def extract(
 74    text: str,
 75    labels: Optional[Union[List[str], str]] = None
 76) -> Dict[str, List[str]]:
 77    """
 78    Extract PII matches from provided text.
 79
 80    Args:
 81        text (str): The input text to scan for patterns.
 82        labels (Optional[Union[List[str], str]]): Specific labels to filter on.
 83
 84    Returns:
 85        Dict[str, List[str]]: Mapping of each label to a sorted list of
 86        matched and cleaned strings.
 87    """
 88    if isinstance(labels, str):
 89        labels = [labels]
 90    patterns = PATTERNS
 91    if labels:
 92        patterns = [
 93            p for p in PATTERNS
 94            if any(re.search(rf"\(\?P<{lbl}>", p) for lbl in labels)
 95        ]
 96    results: Dict[str, set] = defaultdict(set)
 97    for pattern in patterns:
 98        try:
 99            rx = re.compile(pattern)
100            for m in rx.finditer(text):
101                for lbl, val in m.groupdict().items():
102                    if not val:
103                        continue
104                    cleaned = _clean_value(lbl, val)
105                    if lbl == "url":
106                        cleaned = cleaned.rstrip("),.**")
107                    if cleaned is not None:
108                        results[lbl].add(cleaned)
109        except re.error as e:
110            print(
111                f"Invalid regex skipped: {pattern}{e}",
112                file=sys.stderr
113            )
114    return {lbl: sorted(vals) for lbl, vals in results.items()}

Extract PII matches from provided text.

Args: text (str): The input text to scan for patterns. labels (Optional[Union[List[str], str]]): Specific labels to filter on.

Returns: Dict[str, List[str]]: Mapping of each label to a sorted list of matched and cleaned strings.

def extract_text(file_path: str) -> Optional[str]:
def extract_text(
    file_path: str
) -> Optional[str]:
    """
    Extract text content from a file based on MIME type.

    Supports text, JSON, XML, CSV, Excel, PDF, DOCX, legacy DOC, images and
    audio. Any other MIME type is handed to ``text_from_any``.

    Args:
        file_path (str): Path to the input file.

    Returns:
        Optional[str]: Extracted text, or None if the path does not
        resolve, no content was produced, or an error occurred.
    """
    TEXT_MIME_TYPES = {
        # programming, config, data types...
        "application/json", "application/xml", "application/x-yaml",
        "application/x-toml", "application/x-csv", "application/x-markdown",
        # add others as needed
    }
    EXCEL_MIME_TYPES = [
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ]
    DOCX_MIME_TYPE = \
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    path = clean_path(file_path)
    if not path:
        print(f"No such file: {file_path}")
        return None

    try:
        # MIME sniffing reads the file and can fail (permissions, a
        # directory, ...). It sits inside the try so such failures return
        # None like every other error instead of propagating.
        mime_type = magic.from_file(path, mime=True)

        if mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
        elif mime_type in EXCEL_MIME_TYPES:
            content = text_from_excel(path)
        elif mime_type == "application/pdf":
            content = text_from_pdf(path)
        elif mime_type == DOCX_MIME_TYPE:
            content = text_from_docx(path)
        elif mime_type == "application/msword":
            content = text_from_doc(path)  # legacy .doc
        elif mime_type.startswith("image/"):
            content = text_from_image(path)
        elif mime_type.startswith("audio/"):
            content = text_from_audio(path)
        else:
            content = text_from_any(path)

        if content:
            return content
        print(f"No content found for file: {path}")
        return None
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return None

Extract text content from a file based on MIME type.

Supports text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

Args: file_path (str): Path to the input file.

Returns: Optional[str]: Extracted text, or None if unsupported or error.

def extract_exif(file_path: str) -> Optional[Dict[str, Any]]:
 99def extract_exif(
100    file_path: str
101) -> Optional[Dict[str, Any]]:
102    """
103    Extract EXIF metadata from a file using exiftool.
104
105    Args:
106        file_path (str): Path to the target file.
107
108    Returns:
109        Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
110    """
111    exif_data: Optional[Dict[str, Any]] = None
112    try:
113        result = subprocess.run(
114            ['exiftool', '-j', file_path],
115            stdout=subprocess.PIPE,
116            stderr=subprocess.PIPE
117        )
118        if result.returncode == 0:
119            exif_data = json.loads(result.stdout.decode())[0]
120    except Exception as e:
121        print(f"Exiftool failed: {e}")
122    return exif_data

Extract EXIF metadata from a file using exiftool.

Args: file_path (str): Path to the target file.

Returns: Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.

def extract_metadata(file_path: str) -> Dict[str, Any]:
def extract_metadata(
    file_path: str
) -> Dict[str, Any]:
    """
    Collect basic metadata (size, MIME type, MD5 digest) for a file.

    Args:
        file_path (str): Path to target file.

    Returns:
        Dict[str, Any]: ``{"error": "File not found"}`` when the path does
        not resolve. Otherwise a dict with ``size_bytes``, ``mime`` and
        ``hashes`` (containing ``md5``). If a step fails, the keys
        gathered so far are kept and an ``error`` key holds the message.
    """
    path = clean_path(file_path)
    if not path:
        return {"error": "File not found"}
    meta: Dict[str, Any] = {}
    try:
        stats = os.stat(path)
        meta["size_bytes"] = stats.st_size
        meta["mime"] = magic.from_file(path, mime=True)
        # Hash in fixed-size chunks inside a context manager: the handle
        # is always closed and large files are not loaded into memory.
        md5 = hashlib.md5()
        with open(path, 'rb') as fh:
            for chunk in iter(lambda: fh.read(1024 * 1024), b""):
                md5.update(chunk)
        meta["hashes"] = {"md5": md5.hexdigest()}
    except Exception as e:
        meta["error"] = str(e)
    return meta

Extract comprehensive metadata from any file type.

Args: file_path (str): Path to target file.

Returns: Dict[str, Any]: Nested metadata structure.

def text_from_url(url: str) -> Optional[str]:
def text_from_url(
    url: str
) -> Optional[str]:
    """
    Download a web page and return its text with non-content tags removed.

    Args:
        url (str): The target webpage URL.

    Returns:
        Optional[str]: The page text joined with single spaces and
        stripped, or None when the request fails.
    """
    # Elements whose contents are dropped before the text is collected.
    hidden_tags = [
        "script", "style", "noscript", "iframe",
        "header", "footer", "meta", "link"
    ]
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        document = BeautifulSoup(reply.text, "html.parser")
        for element in document(hidden_tags):
            element.decompose()
        visible = document.get_text(separator=" ")
        return visible.strip()
    except requests.RequestException as e:
        print(f"Error fetching URL: {url} - {e}")
        return None

Fetch and extract visible text from a web page.

Args: url (str): The target webpage URL.

Returns: Optional[str]: Extracted text, or None on failure.

def text_from_audio(audio_file: str) -> Optional[str]:
def text_from_audio(
    audio_file: str
) -> Optional[str]:
    """
    Transcribe audio to text via Google Speech Recognition.

    Files whose extension is not ``.wav``/``.wave`` are first converted to
    WAV. The converted file is written next to the source file and is not
    removed afterwards. Conversion errors are not caught here and
    propagate to the caller.

    Args:
        audio_file (str): Path to the audio file.

    Returns:
        Optional[str]: Transcription, or None if the audio could not be
        understood or the speech service request failed.
    """
    def audio_to_wav(
        file_path: str
    ) -> str:
        """Convert ``file_path`` to WAV and return the new file's path."""
        stem, ext = os.path.splitext(file_path)
        ext = ext.lstrip('.')
        audio = AudioSegment.from_file(file_path, format=ext)
        # Build the target from the splitext stem so only the final
        # extension changes. str.replace() would also rewrite ".ext"
        # occurring earlier in the path, and every "." when there is no
        # extension at all.
        wav_path = stem + ".wav"
        audio.export(wav_path, format='wav')
        return wav_path

    _, ext = os.path.splitext(audio_file)
    if ext.lower() not in ['.wav', '.wave']:
        audio_file = audio_to_wav(audio_file)
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            audio = recognizer.record(source)
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print(f"Speech service error: {e}")
    return None

Transcribe audio to text via Google Speech Recognition.

Args: audio_file (str): Path to the audio file.

Returns: Optional[str]: Transcription, or None on error.

def text_from_pdf(pdf_path: str) -> Optional[str]:
def text_from_pdf(
    pdf_path: str
) -> Optional[str]:
    """
    Extract text and image OCR from a PDF using PyMuPDF.

    The result starts with the document metadata as ``key: value`` lines.
    Each page follows under a ``--- Page N ---`` header with its text (or
    ``[No text]``) and one ``[Image K OCR]`` section per embedded image.

    Args:
        pdf_path (str): Path to PDF file.

    Returns:
        Optional[str]: Combined text and OCR results, or None on error.
    """
    # Local import keeps this block self-contained.
    import tempfile

    parts = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        # metadata
        for k, v in (doc.metadata or {}).items():
            parts.append(f"{k}: {v}\n")
        # Images go into a private temporary directory instead of fixed
        # /tmp names: no collisions between concurrent runs, no
        # predictable paths, and everything is removed on exit.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for i in range(len(doc)):
                page = doc.load_page(i)
                parts.append(f"\n--- Page {i+1} ---\n")
                parts.append(page.get_text() or "[No text]\n")
                images = page.get_images(full=True)
                for img_index, img in enumerate(images, start=1):
                    base = doc.extract_image(img[0]) or {}
                    img_bytes = base.get("image")
                    if not img_bytes:
                        # Skip an unreadable image entry rather than
                        # failing the whole document.
                        continue
                    ext = base.get("ext") or "png"
                    img_path = os.path.join(
                        tmp_dir, f"page{i+1}-img{img_index}.{ext}"
                    )
                    with open(img_path, "wb") as img_file:
                        img_file.write(img_bytes)
                    ocr = text_from_image(img_path) or ""
                    parts.append(f"\n[Image {img_index} OCR]\n{ocr}\n")
        return "".join(parts)
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None
    finally:
        # Close the document on the error path as well.
        if doc is not None:
            doc.close()

Extract text and image OCR from a PDF using PyMuPDF.

Args: pdf_path (str): Path to PDF file.

Returns: Optional[str]: Combined text and OCR results, or None on error.

def text_from_doc(filepath: str, min_length: int = 4) -> str:
def text_from_doc(
    filepath: str,
    min_length: int = 4
) -> str:
    """
    Extract readable strings from binary Word (.doc) files.

    The file is scanned as raw bytes for runs of printable ASCII
    characters. Each run is whitespace-normalised; empty runs, duplicates
    and lines starting with known Word boilerplate markers are dropped.

    Args:
        filepath (str): Path to .doc file.
        min_length (int): Minimum run length to extract. Values below 1
            are treated as 1.

    Returns:
        str: The extracted strings in order of first appearance, one per
        line.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # A quantifier of {0,} (or a negative bound) would match the empty
    # string everywhere, so clamp to at least one character.
    min_run = max(1, min_length)
    printable_run = re.compile(
        b'[' + re.escape(bytes(string.printable, 'ascii')) +
        b']{%d,}' % min_run
    )
    # Boilerplate prefixes to drop from the output.
    skip = ("HYPERLINK", "OLE2", "Normal.dotm")

    with open(filepath, 'rb') as f:
        data = f.read()

    # A dict keeps first-seen order while de-duplicating. Normalising
    # *before* the duplicate check merges lines that differ only in
    # whitespace.
    lines = {}
    for raw in printable_run.findall(data):
        line = re.sub(r'\s+', ' ', raw.decode(errors='ignore')).strip()
        if not line or line.startswith(skip):
            continue
        lines[line] = None
    return "\n".join(lines)

Extract readable strings from binary Word (.doc) files.

Args: filepath (str): Path to .doc file. min_length (int): Minimum string length to extract.

Returns: str: The extracted strings, one per line.

def text_from_docx(file_path: str) -> Optional[str]:
def text_from_docx(
    file_path: str
) -> Optional[str]:
    """
    Extract text, tables, and OCR images from a DOCX file.

    Output order: non-empty paragraphs (one per line), then each table
    under a ``[Table]`` marker with tab-separated cells, then an
    ``[Image OCR]`` section per embedded image.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        Optional[str]: Combined document text, or None if the path does
        not resolve or the document cannot be processed.
    """
    # Local import keeps this block self-contained.
    import tempfile

    path = clean_path(file_path)
    if not path:
        return None
    parts = []
    try:
        # Opening the document is inside the try so a corrupt or
        # non-DOCX file returns None instead of raising.
        doc = Document(path)
        for p in doc.paragraphs:
            text = p.text.strip()
            if text:
                parts.append(text + "\n")
        for tbl in doc.tables:
            parts.append("\n[Table]\n")
            for row in tbl.rows:
                parts.append(
                    "\t".join(c.text.strip() for c in row.cells) + "\n"
                )
        # A private temporary directory replaces the fixed /tmp file
        # names: no clashes between runs and no files left behind.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for rel_id, rel in doc.part.rels.items():
                # An externally linked image has no embedded part to
                # read; skip it rather than fail the whole document.
                if rel.is_external or "image" not in rel.target_ref:
                    continue
                img_path = os.path.join(tmp_dir, f"docx_img_{rel_id}.png")
                with open(img_path, "wb") as img_f:
                    img_f.write(rel.target_part.blob)
                ocr = text_from_image(img_path) or ""
                parts.append(f"\n[Image OCR]\n{ocr}\n")
        return "".join(parts)
    except Exception as e:
        print(f"Error processing DOCX: {e}")
        return None

Extract text, tables, and OCR images from a DOCX file.

Args: file_path (str): Path to the .docx file.

Returns: Optional[str]: Combined document text, or None on error.

def text_from_excel(file_path: str) -> str:
def text_from_excel(
    file_path: str
) -> str:
    """
    Convert an Excel workbook to CSV text.

    All sheets are read. A single-sheet workbook yields that sheet's CSV
    only. With several sheets, each sheet's CSV is preceded by a
    ``[Sheet: <name>]`` marker line so no sheet is silently dropped.

    Args:
        file_path (str): Path to the Excel file.

    Returns:
        str: CSV-formatted string, or an empty string if the path does
        not resolve or the workbook cannot be read.
    """
    path = clean_path(file_path)
    if not path:
        return ""
    try:
        # sheet_name=None loads every sheet as {name: DataFrame}; the
        # default would read only the first sheet.
        sheets = pd.read_excel(path, sheet_name=None)
        multiple = len(sheets) > 1
        out = StringIO()
        for name, df in sheets.items():
            if multiple:
                out.write(f"\n[Sheet: {name}]\n")
            df.to_csv(out, index=False)
        return out.getvalue()
    except Exception as e:
        print(f"Failed Excel -> CSV: {e}")
        return ""

Convert an Excel workbook to CSV text.

Args: file_path (str): Path to the Excel file.

Returns: str: CSV-formatted string.

def text_from_image(file_path: str) -> Optional[str]:
def text_from_image(
    file_path: str
) -> Optional[str]:
    """
    Run Tesseract OCR over an image and return the recognised text.

    Args:
        file_path (str): Path to the image.

    Returns:
        Optional[str]: The recognised text with surrounding whitespace
        removed (possibly empty), or None if the path does not resolve
        or OCR fails.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        with Image.open(path) as picture:
            recognised = pytesseract.image_to_string(picture)
            text = recognised.strip()
        return text if text else ""
    except Exception as e:
        print(f"Failed image OCR: {e}")
        return None

Perform OCR on an image file.

Args: file_path (str): Path to the image.

Returns: Optional[str]: Extracted text, or None on error.

def text_from_any(file_path: str) -> Optional[str]:
def text_from_any(
    file_path: str
) -> Optional[str]:
    """
    Describe a file of unknown type with a short filesystem report.

    Args:
        file_path (str): Path to the file.

    Returns:
        Optional[str]: ``key: value`` lines for path, size, created and
        modified, or None if the path does not resolve or the file
        cannot be stat'ed.
    """
    path = clean_path(file_path)
    if not path:
        return None
    try:
        st = os.stat(path)
        # "created" is taken from st_ctime, "modified" from st_mtime.
        created = datetime.fromtimestamp(st.st_ctime).isoformat()
        modified = datetime.fromtimestamp(st.st_mtime).isoformat()
        fields = (
            ("path", path),
            ("size", st.st_size),
            ("created", created),
            ("modified", modified),
        )
        report_lines = [f"{k}: {v}" for k, v in fields]
        return "\n".join(report_lines)
    except Exception as e:
        print(f"Error on other file: {e}")
        return None

Handle unknown file types by reporting stats and metadata.

Args: file_path (str): Path to the file.

Returns: Optional[str]: Plain-text report, or None on error.