mrblack

 1#!/usr/bin/env python3
 2# -*- coding: utf-8 -*-
 3#
 4# File: __init__.py
 5# Author: Wadih Khairallah
 6# Description: 
 7# Created: 2025-05-12 16:47:22
 8# Modified: 2025-05-16 16:38:12
 9
10from .pii import (
11    extract_pii_text,
12    extract_pii_file,
13    extract_pii_url,
14    extract_pii_image,
15    extract_pii_screenshot
16)
17from .textextract import (
18    extract_text,
19    extract_text_with_password,
20    extract_exif,
21    extract_strings,
22    extract_metadata,
23    text_from_screenshot,
24    text_from_url,
25    text_from_html,
26    text_from_audio,
27    text_from_pdf,
28    text_from_doc,
29    text_from_docx,
30    text_from_excel,
31    text_from_image,
32    text_from_any,
33    text_from_odt,
34    text_from_pptx,
35    text_from_epub,
36    analyze_text,
37    summarize_text,
38    translate_text,
39    list_available_languages,
40    detect_language,
41    scrape_website,
42    normalize_text,
43
44)
45
46__all__ = [
47    "extract_pii_text",
48    "extract_pii_file",
49    "extract_pii_url",
50    "extract_pii_image",
51    "extract_pii_screenshot",
52    "extract_text_with_password",
53    "extract_text",
54    "extract_exif",
55    "extract_metadata",
56    "extract_strings",
57    "text_from_screenshot",
58    "text_from_url",
59    "text_from_html",
60    "text_from_audio",
61    "text_from_pdf",
62    "text_from_doc",
63    "text_from_docx",
64    "text_from_excel",
65    "text_from_image",
66    "text_from_any",
67    "text_from_odt",
68    "text_from_pptx",
69    "text_from_epub",
70    "analyze_text",
71    "summarize_text",
72    "translate_text",
73    "list_available_languages",
74    "detect_language",
75    "scrape_website",
76    "normalize_text"
77]
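
Since __init__.py re-exports both the PII helpers and the text-extraction helpers, everything listed in __all__ can be imported directly from the package. A minimal sketch (the input path is hypothetical):

    from mrblack import extract_text, extract_pii_text

    text = extract_text("/tmp/example.pdf")   # hypothetical input file
    if text:
        print(extract_pii_text(text))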
def extract_pii_text(text: str, labels: Optional[Union[List[str], str]] = None) -> Dict[str, List[str]]:
 71def extract_pii_text(
 72    text: str,
 73    labels: Optional[Union[List[str], str]] = None
 74) -> Dict[str, List[str]]:
 75    """
 76    Extract PII matches from provided text.
 77
 78    Args:
 79        text (str): The input text to scan for patterns.
 80        labels (Optional[Union[List[str], str]]): Specific labels to filter on.
 81
 82    Returns:
 83        Dict[str, List[str]]: Mapping of each label to a sorted list of
 84        matched and cleaned strings.
 85    """
 86    if isinstance(labels, str):
 87        labels = [labels]
 88    patterns = PATTERNS
 89    if labels:
 90        patterns = [
 91            p for p in PATTERNS
 92            if any(re.search(rf"\(\?P<{lbl}>", p) for lbl in labels)
 93        ]
 94    results: Dict[str, set] = defaultdict(set)
 95    for pattern in patterns:
 96        try:
 97            rx = re.compile(pattern)
 98            for m in rx.finditer(text):
 99                for lbl, val in m.groupdict().items():
100                    if not val:
101                        continue
102                    cleaned = _clean_value(lbl, val)
103                    if lbl == "url":
104                        cleaned = cleaned.rstrip("),.**")
105                    if cleaned is not None:
106                        results[lbl].add(cleaned)
107        except re.error as e:
108            print(
 109                f"Invalid regex skipped: {pattern} ({e})",
110                file=sys.stderr
111            )
112    return {lbl: sorted(vals) for lbl, vals in results.items()}

Extract PII matches from provided text.

Args: text (str): The input text to scan for patterns. labels (Optional[Union[List[str], str]]): Specific labels to filter on.

Returns: Dict[str, List[str]]: Mapping of each label to a sorted list of matched and cleaned strings.
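
A short usage sketch; the sample string is made up, and the "email" label is an assumption about what PATTERNS defines:

    from mrblack import extract_pii_text

    sample = "Contact jane.doe@example.com or visit https://example.com/docs."
    print(extract_pii_text(sample))                  # all labels
    print(extract_pii_text(sample, labels="email"))  # assumes an "email" label exists in PATTERNS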

def extract_pii_file(file_path: str, labels: Optional[Union[List[str], str]] = None) -> Optional[Dict[str, List[str]]]:
115def extract_pii_file(
116    file_path: str,
117    labels: Optional[Union[List[str], str]] = None
118) -> Optional[Dict[str, List[str]]]:
119    """
120    Extract PII from a single file's text content.
121
122    Args:
123        file_path (str): Path to the file.
124        labels (Optional[Union[List[str], str]]): Labels to filter.
125
126    Returns:
127        Optional[Dict[str, List[str]]]: Extraction results, or None.
128    """
129    text = extract_text(file_path)
130    if not text:
131        return None
132    data = extract_pii_text(text, labels)
133    return data or None

Extract PII from a single file's text content.

Args: file_path (str): Path to the file. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_url(path: str, labels: Optional[Union[List[str], str]] = None) -> Optional[Dict[str, List[str]]]:
136def extract_pii_url(
137    path: str,
138    labels: Optional[Union[List[str], str]] = None
139) -> Optional[Dict[str, List[str]]]:
140    """
141    Extract PII from the text at a URL.
142
143    Args:
144        path (str): The URL to fetch.
145        labels (Optional[Union[List[str], str]]): Labels to filter.
146
147    Returns:
148        Optional[Dict[str, List[str]]]: Extraction results, or None.
149    """
150    text = text_from_url(path)
151    if not text:
152        return None
153    data = extract_pii_text(text, labels)
154    return data or None

Extract PII from the text at a URL.

Args: path (str): The URL to fetch. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_image(image_path: str, labels: Optional[Union[List[str], str]] = None) -> Optional[Dict[str, List[str]]]:
157def extract_pii_image(
158    image_path: str,
159    labels: Optional[Union[List[str], str]] = None
160) -> Optional[Dict[str, List[str]]]:
161    """
162    Extract PII from an image using OCR.
163
164    Args:
165        image_path (str): Path to the image file.
166        labels (Optional[Union[List[str], str]]): Labels to filter.
167
168    Returns:
169        Optional[Dict[str, List[str]]]: Extraction results, or None.
170    """
171    path = clean_path(image_path)
172    if not path or not os.path.isfile(path):
173        print(f"[red]Invalid image path:[/] {image_path}")
174        return None
175    text = extract_text(path)
176    if not text:
177        return None
178    data = extract_pii_text(text, labels)
179    return data or None

Extract PII from an image using OCR.

Args: image_path (str): Path to the image file. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_screenshot(labels: Optional[Union[List[str], str]] = None) -> Optional[Dict[str, List[str]]]:
182def extract_pii_screenshot(
183    labels: Optional[Union[List[str], str]] = None
184) -> Optional[Dict[str, List[str]]]:
185    """
186    Capture a screenshot and extract PII from its OCR text.
187
188    Args:
189        labels (Optional[Union[List[str], str]]): Labels to filter.
190
191    Returns:
192        Optional[Dict[str, List[str]]]: Extraction results, or None.
193    """
194    text = text_from_screenshot()
195    if not text:
196        return None
197    data = extract_pii_text(text, labels)
198    return data or None

Capture a screenshot and extract PII from its OCR text.

Args: labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
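
The wrappers above differ only in where the text comes from. A hedged sketch with hypothetical paths and URLs:

    from mrblack import extract_pii_file, extract_pii_url, extract_pii_screenshot

    print(extract_pii_file("/tmp/report.docx"))            # hypothetical document
    print(extract_pii_url("https://example.com/contact"))  # hypothetical page
    print(extract_pii_screenshot())                        # OCR of the current screen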

def extract_text_with_password(file_path: str, password: str) -> Optional[str]:
503def extract_text_with_password(file_path: str, password: str) -> Optional[str]:
504    """
505    Extract text from password-protected files.
506    
507    Args:
508        file_path (str): Path to the file
509        password (str): Password to unlock the file
510        
511    Returns:
512        Optional[str]: Extracted text
513    """
514    file_ext = os.path.splitext(file_path)[1].lower()
515    
516    if file_ext == '.pdf':
517        return text_from_pdf_protected(file_path, password)
518    elif file_ext in ['.docx', '.xlsx', '.pptx']:
519        return text_from_office_protected(file_path, password)
520    else:
521        logger.warning(f"Password protection not supported for {file_ext} files")
522        return None

Extract text from password-protected files.

Args: file_path (str): Path to the file. password (str): Password to unlock the file.

Returns: Optional[str]: Extracted text
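
A minimal sketch, assuming a hypothetical encrypted PDF; for unsupported extensions the function logs a warning and returns None:

    from mrblack import extract_text_with_password

    text = extract_text_with_password("/tmp/statement.pdf", password="s3cret")
    print(text[:200] if text else "could not unlock file")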

def extract_text(file_path: str) -> Optional[str]:
417def extract_text(
418    file_path: str
419) -> Optional[str]:
420    """
421    Extract text content from a local file or URL.
422
423    Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.
424
425    Args:
426        file_path (str): Path to the input file or URL.
427
428    Returns:
429        Optional[str]: Extracted text, or None if unsupported or error.
430    """
431    if is_url(file_path):
432        return text_from_url(file_path)
433
434    TEXT_MIME_TYPES = {
435        "application/json", "application/xml", "application/x-yaml",
436        "application/x-toml", "application/x-csv", "application/x-markdown",
437    }
438
439    path = clean_path(file_path)
440    if not path:
441        logger.error(f"No such file: {file_path}")
442        return None
443
444    mime_type = magic.from_file(path, mime=True)
445    try:
446        if mime_type.startswith("text/html"):
447            content = text_from_html(path)
448            return content
449
450        elif mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
451            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
452                content = f.read()
453            return normalize_text(content)
454
455        elif mime_type in [
456            "application/vnd.ms-excel",
457            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
458        ]:
459            content = text_from_excel(path)
460            return content
461
462        elif mime_type == "application/pdf":
463            content = text_from_pdf(path)
464            return content
465
466        elif mime_type == \
467            "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
468            content = text_from_docx(path)
469            return content
470
471        elif mime_type == "application/msword":
472            content = text_from_doc(path)
473            return content
474
475        elif mime_type.startswith("image/"):
476            content = text_from_image(path)
477            return content
478
479        elif mime_type.startswith("audio/"):
480            content = text_from_audio(path)
481            return content
482
483        elif mime_type == "application/epub+zip":
484            content = text_from_epub(path)
485            return content
486
487        elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
488            content = text_from_pptx(path)
489            return content
490
491        elif mime_type == "application/vnd.oasis.opendocument.text":
492            content = text_from_odt(path)
493            return content
494
495        else:
496            content = text_from_any(path)
497            return content
498    except Exception as e:
499        logger.error(f"Error reading {path}: {e}")
500        return None

Extract text content from a local file or URL.

Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

Args: file_path (str): Path to the input file or URL.

Returns: Optional[str]: Extracted text, or None if unsupported or error.
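
extract_text is the generic entry point: URLs are fetched, everything else is dispatched on the MIME type reported by libmagic. A sketch with hypothetical inputs:

    from mrblack import extract_text

    for src in ("/tmp/scan.png", "/tmp/data.xlsx", "https://example.com"):
        content = extract_text(src)
        print(src, "->", (content or "")[:80])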

def extract_exif(file_path: str) -> Optional[Dict[str, Any]]:
198def extract_exif(
199    file_path: str
200) -> Optional[Dict[str, Any]]:
201    """
202    Extract EXIF metadata from a file using exiftool.
203
204    Args:
205        file_path (str): Path to the target file.
206
207    Returns:
208        Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
209    """
210    exif_data: Optional[Dict[str, Any]] = None
211    try:
212        result = subprocess.run(
213            ['exiftool', '-j', file_path],
214            stdout=subprocess.PIPE,
215            stderr=subprocess.PIPE
216        )
217        if result.returncode == 0:
218            exif_data = json.loads(result.stdout.decode())[0]
219    except Exception as e:
220        logger.error(f"Exiftool failed: {e}")
221    return exif_data

Extract EXIF metadata from a file using exiftool.

Args: file_path (str): Path to the target file.

Returns: Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
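
Requires the exiftool binary on PATH; the keys returned depend on the file. A sketch with a hypothetical image:

    from mrblack import extract_exif

    exif = extract_exif("/tmp/photo.jpg")
    if exif:
        print(exif.get("Model"), exif.get("CreateDate"))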

def extract_metadata(file_path: str) -> Dict[str, Any]:
1178def extract_metadata(
1179    file_path: str
1180) -> Dict[str, Any]:
1181    """
1182    Extract comprehensive metadata from any file type.
1183
1184    Args:
1185        file_path (str): Path to target file.
1186
1187    Returns:
1188        Dict[str, Any]: Nested metadata structure.
1189    """
1190    path = clean_path(file_path)
1191    if not path:
1192        return {"error": "File not found"}
1193    
1194    meta: Dict[str, Any] = {}
1195    try:
1196        stats = os.stat(path)
1197        meta["size_bytes"] = stats.st_size
1198        meta["created"] = datetime.fromtimestamp(stats.st_ctime).isoformat()
1199        meta["modified"] = datetime.fromtimestamp(stats.st_mtime).isoformat()
1200        meta["mime"] = magic.from_file(path, mime=True)
1201        
1202        # Calculate multiple hash types
1203        with open(path, 'rb') as f:
1204            content = f.read()
1205            meta["hashes"] = {
1206                "md5": hashlib.md5(content).hexdigest(),
1207                "sha1": hashlib.sha1(content).hexdigest(),
1208                "sha256": hashlib.sha256(content).hexdigest()
1209            }
1210        
1211        # Get extended file attributes where supported
1212        if hasattr(os, 'listxattr'):
1213            try:
1214                xattrs = os.listxattr(path)
1215                if xattrs:
1216                    meta["xattrs"] = {}
1217                    for attr in xattrs:
1218                        meta["xattrs"][attr] = os.getxattr(path, attr)
1219            except (OSError, AttributeError):
1220                pass
1221        
1222        # Get EXIF data if available and relevant
1223        exif = extract_exif(path)
1224        if exif:
1225            meta["exif"] = exif
1226            
1227        # Get file owner and permissions
1228        try:
1229            meta["owner"] = pwd.getpwuid(stats.st_uid).pw_name
1230        except KeyError:
1231            meta["owner"] = str(stats.st_uid)
1232        meta["permissions"] = oct(stats.st_mode)[-3:]
1233            
1234    except Exception as e:
1235        meta["error"] = str(e)
1236        
1237    return meta

Extract comprehensive metadata from any file type.

Args: file_path (str): Path to target file.

Returns: Dict[str, Any]: Nested metadata structure.
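
A sketch with a hypothetical file; the keys shown follow the structure built above:

    from mrblack import extract_metadata

    meta = extract_metadata("/tmp/sample.bin")
    if "error" not in meta:
        print(meta["mime"], meta["size_bytes"], meta["hashes"]["sha256"])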

def extract_strings(file_path, min_length=4):
1008def extract_strings(file_path, min_length=4):
1009    """
1010    Extract printable strings from a file, similar to the Unix 'strings' command.
1011    
1012    Args:
1013        file_path (str): Path to the file to extract strings from
1014        min_length (int, optional): Minimum length of strings to extract. Defaults to 4.
1015        
1016    Returns:
1017        list: List of printable strings found in the file
1018    """
1019    import string
1020    file_path = clean_path(file_path)
1021
1022    
1023    # Define printable characters (excluding tabs and newlines)
1024    printable_chars = set(string.printable) - set('\t\n\r\v\f')
1025    
1026    result = []
1027    current_string = ""
1028    
1029    # Read the file in binary mode
1030    try:
1031        with open(file_path, 'rb') as file:
1032            # Read the file byte by byte
1033            for byte in file.read():
1034                # Convert byte to character
1035                char = chr(byte)
1036                
1037                # If character is printable, add to current string
1038                if char in printable_chars:
1039                    current_string += char
1040                # If not printable and we have a string of minimum length, add to results
1041                elif len(current_string) >= min_length:
1042                    if current_string == "Sj[d":
1043                        pass
1044                    else:
1045                        result.append(current_string)
1046                    current_string = ""
1047                # If not printable and current string is too short, reset current string
1048                else:
1049                    current_string = ""
1050        
1051        # Don't forget to add the last string if it meets the minimum length
1052        if len(current_string) >= min_length:
1053            result.append(current_string)
1054        
1055        return result
1056    except FileNotFoundError:
1057        print(f"Error: File '{file_path}' not found.", file=sys.stderr)
1058        return None
1059    except Exception as e:
1060        print(f"Error: {e}", file=sys.stderr)
1061        return None

Extract printable strings from a file, similar to the Unix 'strings' command.

Args: file_path (str): Path to the file to extract strings from. min_length (int, optional): Minimum length of strings to extract. Defaults to 4.

Returns: list: List of printable strings found in the file
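
A sketch; any binary on disk works as input:

    from mrblack import extract_strings

    found = extract_strings("/usr/bin/true", min_length=6) or []
    print(len(found), found[:10])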

def text_from_screenshot() -> str:
164def text_from_screenshot() -> str:
165    """
166    Capture a full-screen screenshot, perform OCR, and clean up temp file.
167
168    Returns:
169        str: Normalized OCR-extracted text from the screenshot.
170    """
171    tmp_filename = f"screenshot_{uuid4().hex}.png"
172    tmp_path = os.path.join(tempfile.gettempdir(), tmp_filename)
173
174    try:
175        with mss() as sct:
176            monitor = {"top": 0, "left": 0, "width": 0, "height": 0}
177            for mon in sct.monitors:
178                monitor["left"] = min(mon["left"], monitor["left"])
179                monitor["top"] = min(mon["top"], monitor["top"])
180                monitor["width"] = max(mon["width"] + mon["left"] - monitor["left"], monitor["width"])
181                monitor["height"] = max(mon["height"] + mon["top"] - monitor["top"], monitor["height"])
182            screenshot = sct.grab(monitor)
183
184        img = Image.frombytes("RGB", screenshot.size, screenshot.bgra, "raw", "BGRX")
185        img_gray = img.convert("L")
186        img_gray.save(tmp_path)
187
188        content = text_from_image(tmp_path)
189        return normalize_text(content)
190    finally:
191        if os.path.exists(tmp_path):
192            try:
193                os.remove(tmp_path)
194            except Exception as e:
195                logger.error(f"Failed to delete temp screenshot: {e}")

Capture a full-screen screenshot, perform OCR, and clean up temp file.

Returns: str: Normalized OCR-extracted text from the screenshot.

def text_from_url(url: str, render_js: bool = True) -> Optional[str]:
305def text_from_url(
306    url: str,
307    render_js: bool = True
308) -> Optional[str]:
309    """
310    Fetch and extract all visible text from a web page, including JS-rendered content.
311
312    Args:
313        url (str): Target webpage URL.
314        render_js (bool): Whether to render JavaScript content.
315
316    Returns:
317        Optional[str]: Cleaned full-page text, or None on failure.
318    """
319    headers = {
320        "User-Agent": random.choice(USER_AGENTS),
321        "Accept-Language": "en-US,en;q=0.9",
322        "Referer": url,
323        "DNT": "1",
324        "Upgrade-Insecure-Requests": "1"
325    }
326
327    # Try with requests-html first (with JS rendering)
328    if render_js:
329        try:
330            session = HTMLSession()
331            try:
332                r = session.get(url, headers=headers, timeout=5)
333                
334                # Set shorter timeout for rendering to avoid hanging
335                try:
336                    r.html.render(timeout=5, sleep=1, keep_page=True)
337                except Exception as e:
338                    logger.warning(f"JS rendering failed, falling back to static HTML: {e}")
339                
340                html = r.html.html
341                session.close()
342                content = text_from_html(html)
343                return content
344            except Exception as e:
345                logger.error(f"[Error with HTMLSession] {url} - {e}")
346                session.close()
347                # Fall through to regular requests
348            finally:
349                session.close()
350        except Exception as e:
351            logger.error(f"[Error creating HTMLSession] {e}")
352            # Fall through to regular requests
353    
354    # Fall back to regular requests (no JS rendering)
355    try:
356        response = requests.get(url, headers=headers, timeout=10)
357        response.raise_for_status()
358        html = response.text
359        content = text_from_html(html)
360        return content
361    except Exception as e:
362        logger.error(f"[Error with requests] {url} - {e}")
363        return None

Fetch and extract all visible text from a web page, including JS-rendered content.

Args: url (str): Target webpage URL. render_js (bool): Whether to render JavaScript content.

Returns: Optional[str]: Cleaned full-page text, or None on failure.
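
A sketch; render_js=False skips the headless-browser pass and falls back to plain requests:

    from mrblack import text_from_url

    page = text_from_url("https://example.com", render_js=False)
    if page:
        print(page[:200])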

def text_from_html(html: str) -> str:
272def text_from_html(html: str) -> str:
273    """
274    Extract readable text from raw HTML content.
275
276    Args:
277        html (str): HTML source as a string.
278
279    Returns:
280        str: Cleaned and normalized visible text.
281    """
282    # Check if the input is a file path or HTML content
283    if os.path.isfile(html):
284        with open(html, 'r', encoding='utf-8', errors='ignore') as f:
285            html = f.read()
286    
287    soup = BeautifulSoup(html, "html.parser")
288
289    # Remove non-visible or structural elements
290    for tag in soup([
291        "script", "style",
292        "noscript", "iframe",
293        "meta", "link",
294        "header", "footer",
295        "form", "nav",
296        "aside"
297    ]):
298        tag.decompose()
299
300    text = soup.get_text(separator=" ")
301
302    return normalize_text(text)

Extract readable text from raw HTML content.

Args: html (str): HTML source as a string.

Returns: str: Cleaned and normalized visible text.
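
A small self-contained sketch; structural tags such as nav are stripped before the text is collected:

    from mrblack import text_from_html

    html = "<html><body><nav>menu</nav><p>Hello <b>world</b></p></body></html>"
    print(text_from_html(html))   # roughly "Hello world"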

def text_from_audio(audio_file: str) -> Optional[str]:
566def text_from_audio(
567    audio_file: str
568) -> Optional[str]:
569    """
570    Transcribe audio to text using Google Speech Recognition.
571
572    Args:
573        audio_file (str): Path to the input audio file.
574
575    Returns:
576        Optional[str]: Transcription, or None on failure.
577    """
578    def convert_to_wav(file_path: str) -> str:
579        _, ext = os.path.splitext(file_path)
580        ext = ext.lstrip('.')
581        audio = AudioSegment.from_file(file_path, format=ext)
582        tmp_filename = f"audio_{uuid4().hex}.wav"
583        wav_path = os.path.join(tempfile.gettempdir(), tmp_filename)
584        audio.export(wav_path, format='wav')
585        return wav_path
586
587    recognizer = sr.Recognizer()
588    temp_wav_path = None
589    cleanup_needed = False
590
591    try:
592        _, ext = os.path.splitext(audio_file)
593        if ext.lower() not in ['.wav', '.wave']:
594            temp_wav_path = convert_to_wav(audio_file)
595            cleanup_needed = True
596        else:
597            temp_wav_path = clean_path(audio_file)
598
599        if not temp_wav_path:
600            logger.error("Invalid audio path.")
601            return None
602
603        with sr.AudioFile(temp_wav_path) as source:
604            audio = recognizer.record(source)
605        return recognizer.recognize_google(audio)
606
607    except sr.UnknownValueError:
608        logger.error("Could not understand audio.")
609    except sr.RequestError as e:
610        logger.error(f"Speech recognition error: {e}")
611    except Exception as e:
612        logger.error(f"Failed to process audio: {e}")
613    finally:
614        if cleanup_needed and temp_wav_path and os.path.exists(temp_wav_path):
615            try:
616                os.remove(temp_wav_path)
617            except Exception as e:
618                logger.error(f"Failed to delete temp WAV file {temp_wav_path}: {e}")
619
620    return None

Transcribe audio to text using Google Speech Recognition.

Args: audio_file (str): Path to the input audio file.

Returns: Optional[str]: Transcription, or None on failure.

def text_from_pdf(pdf_path: str) -> Optional[str]:
680def text_from_pdf(
681    pdf_path: str
682) -> Optional[str]:
683    """
684    Extract text and OCR results from a PDF using PyMuPDF.
685
686    Args:
687        pdf_path (str): Path to PDF file.
688
689    Returns:
690        Optional[str]: Combined normalized text and image OCR results.
691    """
692    plain_text = ""
693    temp_image_paths: List[str] = []
694
695    try:
696        doc = pymupdf.open(pdf_path)
697        for k, v in doc.metadata.items():
698            plain_text += f"{k}: {v}\n"
699
700        for i in range(len(doc)):
701            page = doc.load_page(i)
702            plain_text += f"\n--- Page {i + 1} ---\n"
703            text = page.get_text()
704            plain_text += text or "[No text]\n"
705
706            for img_index, img in enumerate(page.get_images(full=True), start=1):
707                xref = img[0]
708                base = doc.extract_image(xref)
709                img_bytes = base["image"]
710
711                img_filename = f"pdf_page{i+1}_img{img_index}_{uuid4().hex}.png"
712                img_path = os.path.join(tempfile.gettempdir(), img_filename)
713                temp_image_paths.append(img_path)
714
715                with open(img_path, "wb") as f:
716                    f.write(img_bytes)
717
718                ocr = text_from_image(img_path) or ""
719                plain_text += f"\n[Image {img_index} OCR]\n{ocr}\n"
720
721        # Extract tables from PDF
722        """
723        try:
724            tables = extract_tables_from_pdf(pdf_path)
725            if tables:
726                plain_text += "\n--- Tables ---\n"
727                for i, table in enumerate(tables, 1):
728                    plain_text += f"\n[Table {i}]\n"
729                    if isinstance(table, dict) and "data" in table:
730                        for row in table["data"]:
731                            plain_text += str(row) + "\n"
732                    else:
733                        plain_text += str(table) + "\n"
734        except Exception as e:
735            logger.warning(f"Could not extract tables from PDF: {e}")
736        """
737
738        return normalize_text(plain_text)
739    except Exception as e:
740        logger.error(f"Error processing PDF: {e}")
741        return None
742    finally:
743        for path in temp_image_paths:
744            if os.path.exists(path):
745                try:
746                    os.remove(path)
747                except Exception as e:
748                    logger.error(f"Failed to delete temp image {path}: {e}")
749        if 'doc' in locals():
750            doc.close()

Extract text and OCR results from a PDF using PyMuPDF.

Args: pdf_path (str): Path to PDF file.

Returns: Optional[str]: Combined normalized text and image OCR results.
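
A sketch with a hypothetical PDF; page text and OCR of embedded images are concatenated:

    from mrblack import text_from_pdf

    pdf_text = text_from_pdf("/tmp/report.pdf")
    if pdf_text:
        print(pdf_text[:300])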

def text_from_doc(filepath: str, min_length: int = 4) -> str:
828def text_from_doc(
829    filepath: str,
830    min_length: int = 4
831) -> str:
832    """
833    Extract readable strings and metadata from binary Word (.doc) files.
834
835    Args:
836        filepath (str): Path to .doc file.
837        min_length (int): Minimum string length to extract.
838
839    Returns:
840        str: Metadata and text content.
841    """
842    def extract_printable_strings(
843        data: bytes
844    ) -> List[str]:
845        pattern = re.compile(
846            b'[' + re.escape(bytes(string.printable, 'ascii')) +
847            b']{%d,}' % min_length
848        )
849        found = pattern.findall(data)
850
851        results = []
852        for m in found:
853            value = m.decode(errors='ignore').strip()
854            results.append(value)
855
856        return results
857
858    def clean_strings(
859        strs: List[str]
860    ) -> List[str]:
861        cleaned: List[str] = []
862        skip = ["HYPERLINK", "OLE2", "Normal.dotm"]
863        for line in strs:
864            if any(line.startswith(pref) for pref in skip):
865                continue
866            cleaned.append(re.sub(r'\s+', ' ', line).strip())
867        return cleaned
868
869    with open(filepath, 'rb') as f:
870        data = f.read()
871
872    strings = extract_printable_strings(data)
873    strings = clean_strings(strings)
874    content = "\n".join(strings)
875
876    return normalize_text(content)

Extract readable strings and metadata from binary Word (.doc) files.

Args: filepath (str): Path to .doc file. min_length (int): Minimum string length to extract.

Returns: str: Metadata and text content.

def text_from_docx(file_path: str) -> Optional[str]:
879def text_from_docx(
880    file_path: str
881) -> Optional[str]:
882    """
883    Extract text, tables, and OCR from embedded images in a DOCX file.
884
885    Args:
886        file_path (str): Path to the .docx file.
887
888    Returns:
889        Optional[str]: Normalized full text content.
890    """
891    path = clean_path(file_path)
892    if not path:
893        return None
894
895    temp_image_paths: List[str] = []
896    plain_text = ""
897
898    try:
899        doc = Document(path)
900
901        for p in doc.paragraphs:
902            if p.text.strip():
903                plain_text += p.text.strip() + "\n"
904
905        for tbl in doc.tables:
906            plain_text += "\n[Table]\n"
907            for row in tbl.rows:
908                row_text = "\t".join(c.text.strip() for c in row.cells)
909                plain_text += row_text + "\n"
910
911        for rel_id, rel in doc.part.rels.items():
912            if "image" in rel.target_ref:
913                blob = rel.target_part.blob
914
915                img_filename = f"docx_img_{rel_id}_{uuid4().hex}.png"
916                img_path = os.path.join(tempfile.gettempdir(), img_filename)
917                temp_image_paths.append(img_path)
918
919                with open(img_path, "wb") as img_file:
920                    img_file.write(blob)
921
922                ocr = text_from_image(img_path) or ""
923                plain_text += f"\n[Image OCR]\n{ocr}\n"
924
925        return normalize_text(plain_text)
926
927    except Exception as e:
928        logger.error(f"Error processing DOCX: {e}")
929        return None
930    finally:
931        for path in temp_image_paths:
932            if os.path.exists(path):
933                try:
934                    os.remove(path)
935                except Exception as e:
936                    logger.error(f"Failed to delete temp DOCX image {path}: {e}")

Extract text, tables, and OCR from embedded images in a DOCX file.

Args: file_path (str): Path to the .docx file.

Returns: Optional[str]: Normalized full text content.

def text_from_excel(file_path: str) -> str:
939def text_from_excel(
940    file_path: str
941) -> str:
942    """
943    Convert an Excel workbook to CSV text.
944
945    Args:
946        file_path (str): Path to the Excel file.
947
948    Returns:
949        str: CSV-formatted string.
950    """
951    path = clean_path(file_path)
952    if not path:
953        return ""
954    try:
955        # Get all sheets
956        result = ""
957        excel_file = pd.ExcelFile(path)
958        for sheet_name in excel_file.sheet_names:
959            df = pd.read_excel(path, sheet_name=sheet_name)
960            out = StringIO()
961            df.to_csv(out, index=False)
962            result += f"\n--- Sheet: {sheet_name} ---\n"
963            result += out.getvalue()
964            result += "\n"
965        return result
966    except Exception as e:
967        logger.error(f"Failed Excel -> CSV: {e}")
968        return ""

Convert an Excel workbook to CSV text.

Args: file_path (str): Path to the Excel file.

Returns: str: CSV-formatted string.
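
A sketch with a hypothetical workbook; each sheet is emitted as its own CSV section:

    from mrblack import text_from_excel

    csv_text = text_from_excel("/tmp/budget.xlsx")
    print(csv_text.splitlines()[:5])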

def text_from_image(file_path: str) -> Optional[str]:
 971def text_from_image(
 972    file_path: str
 973) -> Optional[str]:
 974    """
 975    Perform OCR on an image file.
 976
 977    Args:
 978        file_path (str): Path to the image.
 979
 980    Returns:
 981        Optional[str]: Extracted text, or None on error.
 982    """
 983    path = clean_path(file_path)
 984    if not path:
 985        return None
 986    try:
 987        with Image.open(path) as img:
 988            # Improve OCR with preprocessing
 989            # 1. Convert to grayscale if it's not already
 990            if img.mode != 'L':
 991                img = img.convert('L')
 992                
 993            # 2. Optional: Apply some contrast enhancement
 994            # (Disabled by default, enable if needed for specific cases)
 995            # from PIL import ImageEnhance
 996            # enhancer = ImageEnhance.Contrast(img)
 997            # img = enhancer.enhance(1.5)  # Increase contrast
 998                
 999            # Perform OCR with custom configuration
1000            custom_config = r'--oem 3 --psm 6'  # Default OCR Engine Mode and Page Segmentation Mode
1001            txt = pytesseract.image_to_string(img, config=custom_config).strip()
1002            return normalize_text(txt) or ""
1003    except Exception as e:
1004        logger.error(f"Failed image OCR: {e}")
1005        return None

Perform OCR on an image file.

Args: file_path (str): Path to the image.

Returns: Optional[str]: Extracted text, or None on error.
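
Requires the tesseract binary; a sketch with a hypothetical scan:

    from mrblack import text_from_image

    ocr_text = text_from_image("/tmp/receipt.png")
    print(ocr_text or "no text recognized")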

def text_from_any(file_path: str) -> Optional[str]:
1064def text_from_any(
1065    file_path: str
1066) -> Optional[str]:
1067    """
1068    Handle unknown file types by reporting stats and metadata.
1069
1070    Args:
1071        file_path (str): Path to the file.
1072
1073    Returns:
1074        Optional[str]: Plain-text report, or None on error.
1075    """
1076    content = ""
1077    path = clean_path(file_path)
1078    if not path:
1079        return None
1080    try:
1081        stats = os.stat(path)
1082        info = {
1083            "path": path,
1084            "size": stats.st_size,
1085            "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
1086            "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
1087        }
1088
1089        content += "File System Data:\n"
1090        for k, v in info.items():
1091            content += f"{k}: {v}\n"
1092        
1093        # Try to extract EXIF if available
1094        exif = extract_exif(path)
1095        if exif:
1096            info["exif"] = exif
1097            content += "\n\nEXIF Data:\n"
1098            for k, v in exif.items():
1099                if isinstance(v, dict):
1100                    content += f"\n{k}:\n"
1101                    for sub_k, sub_v in v.items():
1102                        content += f"  {sub_k}: {sub_v}\n"
1103                else:
1104                    content += f"{k}: {v}\n"
1105
1106        # Get file hash
1107        md5_hash = hashlib.md5(open(path,'rb').read()).hexdigest()
1108        info["md5"] = md5_hash
1109
1110        # Get strings
1111        strings = extract_strings(path)
1112        if strings:
1113            info["strings"] = strings
1114            content += "\n\nStrings Data:\n"
1115            clean_strings = "\n".join(strings)
1116            content += clean_strings
1117
1118        return text_from_object(info) 
1119    except Exception as e:
1120        logger.error(f"Error on other file: {e}")
1121        return None

Handle unknown file types by reporting stats and metadata.

Args: file_path (str): Path to the file.

Returns: Optional[str]: Plain-text report, or None on error.

def text_from_odt(odt_path: str) -> Optional[str]:
1541def text_from_odt(odt_path: str) -> Optional[str]:
1542    """
1543    Extract text from OpenDocument Text files.
1544    
1545    Args:
1546        odt_path (str): Path to the ODT file
1547        
1548    Returns:
1549        Optional[str]: Extracted text
1550    """
1551    try:
1552        from odf import text, teletype
1553        from odf.opendocument import load
1554        
1555        textdoc = load(odt_path)
1556        
1557        # Extract metadata
1558        meta = []
1559        meta_elem = textdoc.meta
1560        if meta_elem:
1561            for prop in meta_elem.childNodes:
1562                if hasattr(prop, 'tagName') and hasattr(prop, 'childNodes') and prop.childNodes:
1563                    meta.append(f"{prop.tagName}: {teletype.extractText(prop)}")
1564        
1565        # Extract content
1566        allparas = textdoc.getElementsByType(text.P)
1567        content = "\n".join(teletype.extractText(p) for p in allparas)
1568        
1569        # Combine metadata and content
1570        if meta:
1571            final_text = "\n".join(meta) + "\n---\n" + content
1572        else:
1573            final_text = content
1574        
1575        return normalize_text(final_text)
1576    except ImportError:
1577        logger.error("odfpy not installed")
1578        return "odfpy package is required for ODT processing"
1579    except Exception as e:
1580        logger.error(f"Error processing ODT: {e}")
1581        return None

Extract text from OpenDocument Text files.

Args: odt_path (str): Path to the ODT file

Returns: Optional[str]: Extracted text

def text_from_pptx(pptx_path: str) -> Optional[str]:
1498def text_from_pptx(pptx_path: str) -> Optional[str]:
1499    """
1500    Extract text from PowerPoint presentations.
1501    
1502    Args:
1503        pptx_path (str): Path to the PowerPoint file
1504        
1505    Returns:
1506        Optional[str]: Extracted text
1507    """
1508    try:
1509        from pptx import Presentation
1510        
1511        prs = Presentation(pptx_path)
1512        text = ["--- PowerPoint Presentation ---"]
1513        
1514        for i, slide in enumerate(prs.slides, 1):
1515            slide_text = [f"Slide {i}:"]
1516            
1517            # Get slide title if it exists
1518            if slide.shapes.title and slide.shapes.title.text:
1519                slide_text.append(f"Title: {slide.shapes.title.text}")
1520            
1521            # Extract text from all shapes
1522            shape_text = []
1523            for shape in slide.shapes:
1524                if hasattr(shape, "text") and shape.text:
1525                    shape_text.append(shape.text)
1526            
1527            if shape_text:
1528                slide_text.append("\n".join(shape_text))
1529            
1530            text.append("\n".join(slide_text))
1531        
1532        return normalize_text("\n\n".join(text))
1533    except ImportError:
1534        logger.error("python-pptx not installed")
1535        return "python-pptx package is required for PowerPoint processing"
1536    except Exception as e:
1537        logger.error(f"Error processing PowerPoint: {e}")
1538        return None

Extract text from PowerPoint presentations.

Args: pptx_path (str): Path to the PowerPoint file

Returns: Optional[str]: Extracted text

def text_from_epub(epub_path: str) -> Optional[str]:
1451def text_from_epub(epub_path: str) -> Optional[str]:
1452    """
1453    Extract text from EPUB ebooks.
1454    
1455    Args:
1456        epub_path (str): Path to the EPUB file
1457        
1458    Returns:
1459        Optional[str]: Extracted text
1460    """
1461    try:
1462        from ebooklib import epub, ITEM_DOCUMENT  # ITEM_DOCUMENT is exported by the top-level ebooklib package
1463        import html2text
1464        
1465        book = epub.read_epub(epub_path)
1466        h = html2text.HTML2Text()
1467        h.ignore_links = False
1468        
1469        content = []
1470        
1471        # Get book metadata
1472        metadata = []
1473        if book.get_metadata('DC', 'title'):
1474            metadata.append(f"Title: {book.get_metadata('DC', 'title')[0][0]}")
1475        if book.get_metadata('DC', 'creator'):
1476            metadata.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}")
1477        if book.get_metadata('DC', 'description'):
1478            metadata.append(f"Description: {book.get_metadata('DC', 'description')[0][0]}")
1479        
1480        if metadata:
1481            content.append("\n".join(metadata))
1482            content.append("---")
1483        
1484        # Get book content
1485        for item in book.get_items():
1486            if item.get_type() == ITEM_DOCUMENT:
1487                content.append(h.handle(item.get_content().decode('utf-8')))
1488        
1489        return normalize_text("\n".join(content))
1490    except ImportError:
1491        logger.error("ebooklib and/or html2text not installed")
1492        return "ebooklib and/or html2text packages are required for EPUB processing"
1493    except Exception as e:
1494        logger.error(f"Error processing EPUB: {e}")
1495        return None

Extract text from EPUB ebooks.

Args: epub_path (str): Path to the EPUB file

Returns: Optional[str]: Extracted text

def analyze_text(text: str) -> Dict[str, Any]:
1409def analyze_text(text: str) -> Dict[str, Any]:
1410    """
1411    Perform basic text analytics.
1412    
1413    Args:
1414        text (str): Input text
1415        
1416    Returns:
1417        Dict: Analysis results
1418    """
1419    try:
1420        # Tokenize text
1421        words = nltk.word_tokenize(text.lower())
1422        sentences = nltk.sent_tokenize(text)
1423        
1424        # Filter out punctuation
1425        words = [word for word in words if word.isalpha()]
1426        
1427        # Count word frequencies
1428        word_freq = Counter(words)
1429        
1430        # Calculate readability metrics
1431        avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
1432        avg_sent_length = len(words) / len(sentences) if sentences else 0
1433        
1434        # Detect language
1435        language = detect_language(text)
1436        
1437        return {
1438            "word_count": len(words),
1439            "sentence_count": len(sentences),
1440            "unique_words": len(set(words)),
1441            "avg_word_length": avg_word_length,
1442            "avg_sentence_length": avg_sent_length,
1443            "language": language,
1444            "most_common_words": word_freq.most_common(20)
1445        }
1446    except Exception as e:
1447        logger.error(f"Text analysis error: {e}")
1448        return {"error": str(e)}

Perform basic text analytics.

Args: text (str): Input text

Returns: Dict: Analysis results
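
A sketch; NLTK tokenizers must be available for the word and sentence counts:

    from mrblack import analyze_text

    report = analyze_text("The quick brown fox jumps over the lazy dog. It was very quick.")
    print(report.get("word_count"), report.get("language"), report.get("most_common_words", [])[:3])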

def summarize_text(text: str, sentences: int = 5) -> str:
1341def summarize_text(text: str, sentences: int = 5) -> str:
1342    """
1343    Create a simple extractive summary from the text.
1344    
1345    Args:
1346        text (str): Input text to summarize
1347        sentences (int): Number of sentences to include
1348        
1349    Returns:
1350        str: Summarized text
1351    """
1352    try:
1353        from nltk.corpus import stopwords
1354        from nltk.tokenize import sent_tokenize
1355        
1356        # Download required NLTK data if not already present
1357        try:
1358            nltk.data.find('tokenizers/punkt')
1359        except LookupError:
1360            nltk.download('punkt', quiet=True)
1361        try:
1362            nltk.data.find('corpora/stopwords')
1363        except LookupError:
1364            nltk.download('stopwords', quiet=True)
1365        
1366        # Tokenize and calculate word frequencies
1367        stop_words = set(stopwords.words('english'))
1368        sentences_list = sent_tokenize(text)
1369        
1370        # If there are fewer sentences than requested, return all
1371        if len(sentences_list) <= sentences:
1372            return text
1373        
1374        word_frequencies = {}
1375        for sentence in sentences_list:
1376            for word in nltk.word_tokenize(sentence):
1377                word = word.lower()
1378                if word not in stop_words:
1379                    if word not in word_frequencies:
1380                        word_frequencies[word] = 1
1381                    else:
1382                        word_frequencies[word] += 1
1383        
1384        # Normalize frequencies
1385        maximum_frequency = max(word_frequencies.values()) if word_frequencies else 1
1386        for word in word_frequencies:
1387            word_frequencies[word] = word_frequencies[word] / maximum_frequency
1388        
1389        # Score sentences
1390        sentence_scores = {}
1391        for i, sentence in enumerate(sentences_list):
1392            for word in nltk.word_tokenize(sentence.lower()):
1393                if word in word_frequencies:
1394                    if i not in sentence_scores:
1395                        sentence_scores[i] = word_frequencies[word]
1396                    else:
1397                        sentence_scores[i] += word_frequencies[word]
1398        
1399        # Get top N sentences
1400        summary_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:sentences]
1401        summary_sentences = [sentences_list[i] for i, _ in sorted(summary_sentences)]
1402        
1403        return ' '.join(summary_sentences)
1404    except Exception as e:
1405        logger.error(f"Summarization error: {e}")
1406        return text

Create a simple extractive summary from the text.

Args: text (str): Input text to summarize. sentences (int): Number of sentences to include.

Returns: str: Summarized text
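
A sketch with a hypothetical text file:

    from mrblack import summarize_text

    with open("/tmp/article.txt", encoding="utf-8") as f:   # hypothetical file
        long_text = f.read()
    print(summarize_text(long_text, sentences=3))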

def translate_text(text: str, target_lang: str = 'en') -> Optional[str]:
1287def translate_text(text: str, target_lang: str = "en") -> Optional[str]:
1288    """
1289    Translate text to target language.
1290    
1291    Args:
1292        text (str): Input text to translate
1293        target_lang (str): Target language code (e.g., 'en', 'es', 'fr', 'ja' for Japanese)
1294        
1295    Returns:
1296        Optional[str]: Translated text or None on failure
1297    """
1298    try:
1299        # Use a more stable translation library
1300        # Note: googletrans 4.0.0-rc1 uses async methods which need to be awaited
1301        # Let's use the deep-translator library instead which is more stable
1302        from deep_translator import GoogleTranslator
1303        
1304        # Handle long texts by splitting into chunks (Google has a limit)
1305        max_chunk_size = 4500  # Google Translate has a limit around 5000 chars
1306        chunks = []
1307        
1308        # Split text into chunks of appropriate size (at sentence boundaries if possible)
1309        text_remaining = text
1310        while len(text_remaining) > 0:
1311            if len(text_remaining) <= max_chunk_size:
1312                chunks.append(text_remaining)
1313                break
1314                
1315            # Try to find a sentence boundary near the max chunk size
1316            chunk_end = max_chunk_size
1317            while chunk_end > 0 and text_remaining[chunk_end] not in ['.', '!', '?', '\n']:
1318                chunk_end -= 1
1319                
1320            # If no good sentence boundary found, just use max size
1321            if chunk_end == 0:
1322                chunk_end = max_chunk_size
1323            else:
1324                chunk_end += 1  # Include the period or boundary character
1325                
1326            chunks.append(text_remaining[:chunk_end])
1327            text_remaining = text_remaining[chunk_end:]
1328            
1329        # Translate each chunk and combine
1330        translated_chunks = []
1331        for chunk in chunks:
1332            translated_chunk = GoogleTranslator(source='auto', target=target_lang).translate(chunk)
1333            translated_chunks.append(translated_chunk)
1334            
1335        return ' '.join(translated_chunks)
1336    except Exception as e:
1337        logger.error(f"Translation error: {e}")
1338        return None

Translate text to target language.

Args: text (str): Input text to translate. target_lang (str): Target language code (e.g., 'en', 'es', 'fr', 'ja' for Japanese).

Returns: Optional[str]: Translated text or None on failure
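
A sketch; translation goes through deep-translator's GoogleTranslator, so network access is required:

    from mrblack import detect_language, translate_text

    src = "Bonjour tout le monde"
    print(detect_language(src))                   # expected: "fr"
    print(translate_text(src, target_lang="en"))  # "Hello everyone" or similar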

def list_available_languages() -> Dict[str, str]:
1259def list_available_languages() -> Dict[str, str]:
1260    """
1261    Get a dictionary of available languages for translation.
1262
1263    Returns:
1264        Dict[str, str]: Dictionary mapping language codes to language names
1265    """
1266    try:
1267        from deep_translator import GoogleTranslator
1268        # Get available languages from the translator
1269        languages = GoogleTranslator().get_supported_languages(as_dict=True)
1270        return languages
1271    except Exception as e:
1272        logger.error(f"Error getting language list: {e}")
1273        # Return a small subset as fallback
1274        return {
1275            "en": "English",
1276            "es": "Spanish",
1277            "fr": "French",
1278            "de": "German",
1279            "it": "Italian",
1280            "ja": "Japanese",
1281            "ko": "Korean",
1282            "zh-cn": "Chinese (Simplified)",
1283            "ru": "Russian",
1284            "ar": "Arabic"
1285        }

Get a dictionary of available languages for translation.

Returns: Dict[str, str]: Dictionary mapping language codes to language names
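
A sketch; the exact shape of the mapping comes from deep-translator, so only the fallback subset above is guaranteed:

    from mrblack import list_available_languages

    langs = list_available_languages()
    print(len(langs), list(langs.items())[:5])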

def detect_language(text: str) -> str:
1242def detect_language(text: str) -> str:
1243    """
1244    Detect the language of the extracted text.
1245    
1246    Args:
1247        text (str): Input text
1248        
1249    Returns:
1250        str: Detected language code or 'unknown'
1251    """
1252    try:
1253        import langdetect
1254        return langdetect.detect(text)
1255    except Exception:
1256        logger.warning("Language detection failed or langdetect not installed")
1257        return "unknown"

Detect the language of the extracted text.

Args: text (str): Input text

Returns: str: Detected language code or 'unknown'

def scrape_website(url: str, max_pages: int = 1, stay_on_domain: bool = True) -> Dict[str, str]:
366def scrape_website(url: str, max_pages: int = 1, stay_on_domain: bool = True) -> Dict[str, str]:
367    """
368    Scrape multiple pages of a website.
369    
370    Args:
371        url (str): Starting URL
372        max_pages (int): Maximum pages to scrape
373        stay_on_domain (bool): Whether to stay on the same domain
374        
375    Returns:
376        Dict[str, str]: Dictionary mapping URLs to extracted text
377    """
378    results = {}
379    visited = set()
380    to_visit = [url]
381    base_domain = urlparse(url).netloc
382    
383    while to_visit and len(visited) < max_pages:
384        current_url = to_visit.pop(0)
385        if current_url in visited:
386            continue
387            
388        # Extract text from current page
389        text = text_from_url(current_url)
390        if text:
391            results[current_url] = text
392            
393        visited.add(current_url)
394        
395        # Find links on the page
396        session = HTMLSession()
397        try:
398            r = session.get(current_url)
399            r.html.render(timeout=20, sleep=1)
400            
401            links = r.html.absolute_links
402            for link in links:
403                link_domain = urlparse(link).netloc
404                if link not in visited and link not in to_visit:
405                    # Check if we should follow this link
406                    if stay_on_domain and link_domain != base_domain:
407                        continue
408                    to_visit.append(link)
409        except Exception as e:
410            logger.error(f"Error scraping {current_url}: {e}")
411        finally:
412            session.close()
413    
414    return results

Scrape multiple pages of a website.

Args: url (str): Starting URL. max_pages (int): Maximum pages to scrape. stay_on_domain (bool): Whether to stay on the same domain.

Returns: Dict[str, str]: Dictionary mapping URLs to extracted text
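
A sketch with a hypothetical site; each page costs a headless-browser render, so keep max_pages small:

    from mrblack import scrape_website

    pages = scrape_website("https://example.com", max_pages=3, stay_on_domain=True)
    for url, text in pages.items():
        print(url, len(text))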

def normalize_text(text: str) -> str:
125def normalize_text(
126    text: str
127) -> str:
128    """
129    Collapse runs of spaces, tabs, and newlines into single characters,
130    normalize Unicode (NFKC), convert carriage returns to newlines, and trim leading spaces.
131
132    Args:
133        text (str): Raw input text.
134
135    Returns:
136        str: Normalized text with collapsed whitespace.
137    """
138    if not text:
139        return ""
140    text = unicodedata.normalize("NFKC", text)
141    text = re.sub(r' +', ' ', text)
142    text = re.sub(r'\n+', '\n', text)
143    text = re.sub(r'(?m)(^ \n)+', '\n', text)
144    text = re.sub(r'\t+', '\t', text)
145    text = re.sub(r'\r+', '\n', text)
146    text = re.sub(r"^ ", "", text, flags=re.MULTILINE)
147    return text 

Collapse runs of spaces, tabs, and newlines into single characters, normalize Unicode (NFKC), convert carriage returns to newlines, and trim leading spaces.

Args: text (str): Raw input text.

Returns: str: Normalized text with collapsed whitespace.
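
A self-contained sketch of what the normalization does:

    from mrblack import normalize_text

    messy = "Line one   with   gaps\n\n\n   Line two\t\t\tend"
    print(repr(normalize_text(messy)))
    # 'Line one with gaps\nLine two\tend'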