mrblack

 1#!/usr/bin/env python3
 2# -*- coding: utf-8 -*-
 3#
 4# File: __init__.py
 5# Author: Wadih Khairallah
 6# Description: Public interface for the mrblack PII and text extraction helpers.
 7# Created: 2025-05-12 16:47:22
 8# Modified: 2025-05-15 16:30:26
 9
10from .pii import (
11    extract_pii_text,
12    extract_pii_file,
13    extract_pii_url,
14    extract_pii_image,
15    extract_pii_screenshot
16)
17from .textextract import (
18    extract_text,
19    extract_text_with_password,
20    extract_exif,
21    extract_metadata,
22    text_from_screenshot,
23    text_from_url,
24    text_from_html,
25    text_from_audio,
26    text_from_pdf,
27    text_from_doc,
28    text_from_docx,
29    text_from_excel,
30    text_from_image,
31    text_from_any,
32    text_from_odt,
33    text_from_pptx,
34    text_from_epub,
35    analyze_text,
36    summarize_text,
37    translate_text,
38    list_available_languages,
39    detect_language,
40    scrape_website,
41    normalize_text,
42
43)
44
45__all__ = [
46    "extract_pii_text",
47    "extract_pii_file",
48    "extract_pii_url",
49    "extract_pii_image",
50    "extract_pii_screenshot",
51    "extract_text_with_password",
52    "extract_text",
53    "extract_exif",
54    "extract_metadata",
55    "text_from_screenshot",
56    "text_from_url",
57    "text_from_html",
58    "text_from_audio",
59    "text_from_pdf",
60    "text_from_doc",
61    "text_from_docx",
62    "text_from_excel",
63    "text_from_image",
64    "text_from_any",
65    "text_from_odt",
66    "text_from_pptx",
67    "text_from_epub",
68    "analyze_text",
69    "summarize_text",
70    "translate_text",
71    "list_available_languages",
72    "detect_language",
73    "scrape_website",
74    "normalize_text"
75]
def extract_pii_text(text: str, labels: Optional[Union[List[str], str]] = None) -> Dict[str, List[str]]:
 71def extract_pii_text(
 72    text: str,
 73    labels: Optional[Union[List[str], str]] = None
 74) -> Dict[str, List[str]]:
 75    """
 76    Extract PII matches from provided text.
 77
 78    Args:
 79        text (str): The input text to scan for patterns.
 80        labels (Optional[Union[List[str], str]]): Specific labels to filter on.
 81
 82    Returns:
 83        Dict[str, List[str]]: Mapping of each label to a sorted list of
 84        matched and cleaned strings.
 85    """
 86    if isinstance(labels, str):
 87        labels = [labels]
 88    patterns = PATTERNS
 89    if labels:
 90        patterns = [
 91            p for p in PATTERNS
 92            if any(re.search(rf"\(\?P<{lbl}>", p) for lbl in labels)
 93        ]
 94    results: Dict[str, set] = defaultdict(set)
 95    for pattern in patterns:
 96        try:
 97            rx = re.compile(pattern)
 98            for m in rx.finditer(text):
 99                for lbl, val in m.groupdict().items():
100                    if not val:
101                        continue
102                    cleaned = _clean_value(lbl, val)
103                    if lbl == "url":
104                        cleaned = cleaned.rstrip("),.**")
105                    if cleaned is not None:
106                        results[lbl].add(cleaned)
107        except re.error as e:
108            print(
 109                f"Invalid regex skipped: {pattern} ({e})",
110                file=sys.stderr
111            )
112    return {lbl: sorted(vals) for lbl, vals in results.items()}

Extract PII matches from provided text.

Args: text (str): The input text to scan for patterns. labels (Optional[Union[List[str], str]]): Specific labels to filter on.

Returns: Dict[str, List[str]]: Mapping of each label to a sorted list of matched and cleaned strings.
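
A minimal usage sketch, assuming the package installs as mrblack and that an "email" label exists among the built-in PATTERNS (both are assumptions, not guarantees):

    from mrblack import extract_pii_text

    sample = "Contact jane.doe@example.com or call +1 202 555 0100."
    all_hits = extract_pii_text(sample)                 # run every pattern
    emails = extract_pii_text(sample, labels="email")   # "email" is a hypothetical label name
    print(all_hits)   # e.g. {"email": ["jane.doe@example.com"], ...}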

def extract_pii_file(file_path: str, labels: Optional[Union[List[str], str]] = None) -> Optional[Dict[str, List[str]]]:
115def extract_pii_file(
116    file_path: str,
117    labels: Optional[Union[List[str], str]] = None
118) -> Optional[Dict[str, List[str]]]:
119    """
120    Extract PII from a single file's text content.
121
122    Args:
123        file_path (str): Path to the file.
124        labels (Optional[Union[List[str], str]]): Labels to filter.
125
126    Returns:
127        Optional[Dict[str, List[str]]]: Extraction results, or None.
128    """
129    text = extract_text(file_path)
130    if not text:
131        return None
132    data = extract_pii_text(text, labels)
133    return data or None

Extract PII from a single file's text content.

Args: file_path (str): Path to the file. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_url(path: str, labels: Optional[Union[List[str], str]] = None) -> Optional[Dict[str, List[str]]]:
136def extract_pii_url(
137    path: str,
138    labels: Optional[Union[List[str], str]] = None
139) -> Optional[Dict[str, List[str]]]:
140    """
141    Extract PII from the text at a URL.
142
143    Args:
144        path (str): The URL to fetch.
145        labels (Optional[Union[List[str], str]]): Labels to filter.
146
147    Returns:
148        Optional[Dict[str, List[str]]]: Extraction results, or None.
149    """
150    text = text_from_url(path)
151    if not text:
152        return None
153    data = extract_pii_text(text, labels)
154    return data or None

Extract PII from the text at a URL.

Args: path (str): The URL to fetch. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_image(image_path: str, labels: Optional[Union[List[str], str]] = None) -> Optional[Dict[str, List[str]]]:
157def extract_pii_image(
158    image_path: str,
159    labels: Optional[Union[List[str], str]] = None
160) -> Optional[Dict[str, List[str]]]:
161    """
162    Extract PII from an image using OCR.
163
164    Args:
165        image_path (str): Path to the image file.
166        labels (Optional[Union[List[str], str]]): Labels to filter.
167
168    Returns:
169        Optional[Dict[str, List[str]]]: Extraction results, or None.
170    """
171    path = clean_path(image_path)
172    if not path or not os.path.isfile(path):
173        print(f"[red]Invalid image path:[/] {image_path}")
174        return None
175    text = extract_text(path)
176    if not text:
177        return None
178    data = extract_pii_text(text, labels)
179    return data or None

Extract PII from an image using OCR.

Args: image_path (str): Path to the image file. labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.

def extract_pii_screenshot(labels: Optional[Union[List[str], str]] = None) -> Optional[Dict[str, List[str]]]:
182def extract_pii_screenshot(
183    labels: Optional[Union[List[str], str]] = None
184) -> Optional[Dict[str, List[str]]]:
185    """
186    Capture a screenshot and extract PII from its OCR text.
187
188    Args:
189        labels (Optional[Union[List[str], str]]): Labels to filter.
190
191    Returns:
192        Optional[Dict[str, List[str]]]: Extraction results, or None.
193    """
194    text = text_from_screenshot()
195    if not text:
196        return None
197    data = extract_pii_text(text, labels)
198    return data or None

Capture a screenshot and extract PII from its OCR text.

Args: labels (Optional[Union[List[str], str]]): Labels to filter.

Returns: Optional[Dict[str, List[str]]]: Extraction results, or None.
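
The four wrappers above differ only in where the text comes from, and each returns None when nothing is extracted. A sketch with hypothetical paths and a placeholder URL:

    from mrblack import (
        extract_pii_file,
        extract_pii_url,
        extract_pii_image,
        extract_pii_screenshot,
    )

    results = {
        "file": extract_pii_file("report.docx"),        # hypothetical document
        "url": extract_pii_url("https://example.com"),
        "image": extract_pii_image("scan.png"),         # OCR-backed
        "screen": extract_pii_screenshot(),             # current display
    }
    for source, found in results.items():
        if found:
            print(source, found)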

def extract_text_with_password(file_path: str, password: str) -> Optional[str]:
502def extract_text_with_password(file_path: str, password: str) -> Optional[str]:
503    """
504    Extract text from password-protected files.
505    
506    Args:
507        file_path (str): Path to the file
508        password (str): Password to unlock the file
509        
510    Returns:
511        Optional[str]: Extracted text
512    """
513    file_ext = os.path.splitext(file_path)[1].lower()
514    
515    if file_ext == '.pdf':
516        return text_from_pdf_protected(file_path, password)
517    elif file_ext in ['.docx', '.xlsx', '.pptx']:
518        return text_from_office_protected(file_path, password)
519    else:
520        logger.warning(f"Password protection not supported for {file_ext} files")
521        return None

Extract text from password-protected files.

Args: file_path (str): Path to the file. password (str): Password to unlock the file.

Returns: Optional[str]: Extracted text
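
A short sketch; the filename and password are placeholders, and only PDF and modern Office formats are handled:

    from mrblack import extract_text_with_password

    text = extract_text_with_password("statement.pdf", password="s3cret")
    if text is None:
        print("Unsupported format or wrong password")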

def extract_text(file_path: str) -> Optional[str]:
416def extract_text(
417    file_path: str
418) -> Optional[str]:
419    """
420    Extract text content from a local file or URL.
421
422    Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.
423
424    Args:
425        file_path (str): Path to the input file or URL.
426
427    Returns:
428        Optional[str]: Extracted text, or None if unsupported or error.
429    """
430    if is_url(file_path):
431        return text_from_url(file_path)
432
433    TEXT_MIME_TYPES = {
434        "application/json", "application/xml", "application/x-yaml",
435        "application/x-toml", "application/x-csv", "application/x-markdown",
436    }
437
438    path = clean_path(file_path)
439    if not path:
440        logger.error(f"No such file: {file_path}")
441        return None
442
443    mime_type = magic.from_file(path, mime=True)
444    try:
445        if mime_type.startswith("text/html"):
446            content = text_from_html(path)
447            return content
448
449        if mime_type.startswith("text/") or mime_type in TEXT_MIME_TYPES:
450            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
451                content = f.read()
452            return normalize_text(content)
453
454        elif mime_type in [
455            "application/vnd.ms-excel",
456            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
457        ]:
458            content = text_from_excel(path)
459            return content
460
461        elif mime_type == "application/pdf":
462            content = text_from_pdf(path)
463            return content
464
465        elif mime_type == \
466            "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
467            content = text_from_docx(path)
468            return content
469
470        elif mime_type == "application/msword":
471            content = text_from_doc(path)
472            return content
473
474        elif mime_type.startswith("image/"):
475            content = text_from_image(path)
476            return content
477
478        elif mime_type.startswith("audio/"):
479            content = text_from_audio(path)
480            return content
481
482        elif mime_type == "application/epub+zip":
483            content = text_from_epub(path)
484            return content
485
486        elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
487            content = text_from_pptx(path)
488            return content
489
490        elif mime_type == "application/vnd.oasis.opendocument.text":
491            content = text_from_odt(path)
492            return content
493
494        else:
495            content = text_from_any(path)
496            return content
497    except Exception as e:
498        logger.error(f"Error reading {path}: {e}")
499        return None

Extract text content from a local file or URL.

Supports web pages, text, JSON, XML, CSV, Excel, PDF, DOCX, images, audio.

Args: file_path (str): Path to the input file or URL.

Returns: Optional[str]: Extracted text, or None if unsupported or error.
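
Because dispatch is MIME-based, one call covers local files and URLs alike; the sources below are placeholders:

    from mrblack import extract_text

    for source in ("notes.pdf", "slides.pptx", "photo.jpg",
                   "https://example.com/article"):
        text = extract_text(source)
        print(source, "->", (text or "<none>")[:80])   # None means unsupported or error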

def extract_exif(file_path: str) -> Optional[Dict[str, Any]]:
197def extract_exif(
198    file_path: str
199) -> Optional[Dict[str, Any]]:
200    """
201    Extract EXIF metadata from a file using exiftool.
202
203    Args:
204        file_path (str): Path to the target file.
205
206    Returns:
207        Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
208    """
209    exif_data: Optional[Dict[str, Any]] = None
210    try:
211        result = subprocess.run(
212            ['exiftool', '-j', file_path],
213            stdout=subprocess.PIPE,
214            stderr=subprocess.PIPE
215        )
216        if result.returncode == 0:
217            exif_data = json.loads(result.stdout.decode())[0]
218    except Exception as e:
219        logger.error(f"Exiftool failed: {e}")
220    return exif_data

Extract EXIF metadata from a file using exiftool.

Args: file_path (str): Path to the target file.

Returns: Optional[Dict[str, Any]]: Parsed EXIF data, or None on failure.
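
extract_exif shells out to exiftool, so the binary must be installed and on PATH; the image name below is a placeholder and the returned keys depend entirely on the file:

    from mrblack import extract_exif

    exif = extract_exif("holiday.jpg")
    if exif:
        print(exif.get("Model"), exif.get("CreateDate"))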

def extract_metadata(file_path: str) -> Dict[str, Any]:
1052def extract_metadata(
1053    file_path: str
1054) -> Dict[str, Any]:
1055    """
1056    Extract comprehensive metadata from any file type.
1057
1058    Args:
1059        file_path (str): Path to target file.
1060
1061    Returns:
1062        Dict[str, Any]: Nested metadata structure.
1063    """
1064    path = clean_path(file_path)
1065    if not path:
1066        return {"error": "File not found"}
1067    
1068    meta: Dict[str, Any] = {}
1069    try:
1070        stats = os.stat(path)
1071        meta["size_bytes"] = stats.st_size
1072        meta["created"] = datetime.fromtimestamp(stats.st_ctime).isoformat()
1073        meta["modified"] = datetime.fromtimestamp(stats.st_mtime).isoformat()
1074        meta["mime"] = magic.from_file(path, mime=True)
1075        
1076        # Calculate multiple hash types
1077        with open(path, 'rb') as f:
1078            content = f.read()
1079            meta["hashes"] = {
1080                "md5": hashlib.md5(content).hexdigest(),
1081                "sha1": hashlib.sha1(content).hexdigest(),
1082                "sha256": hashlib.sha256(content).hexdigest()
1083            }
1084        
1085        # Get extended file attributes where supported
1086        if hasattr(os, 'listxattr'):
1087            try:
1088                xattrs = os.listxattr(path)
1089                if xattrs:
1090                    meta["xattrs"] = {}
1091                    for attr in xattrs:
1092                        meta["xattrs"][attr] = os.getxattr(path, attr)
1093            except (OSError, AttributeError):
1094                pass
1095        
1096        # Get EXIF data if available and relevant
1097        exif = extract_exif(path)
1098        if exif:
1099            meta["exif"] = exif
1100            
1101        # Get file owner and permissions
1102        import pwd
1103        try:
1104            meta["owner"] = pwd.getpwuid(stats.st_uid).pw_name
1105        except KeyError:
1106            meta["owner"] = str(stats.st_uid)
1107        meta["permissions"] = oct(stats.st_mode)[-3:]
1108            
1109    except Exception as e:
1110        meta["error"] = str(e)
1111        
1112    return meta

Extract comprehensive metadata from any file type.

Args: file_path (str): Path to target file.

Returns: Dict[str, Any]: Nested metadata structure.
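
A usage sketch with a placeholder path; the exif and xattrs entries only appear when the platform and file provide them:

    from mrblack import extract_metadata

    meta = extract_metadata("contract.pdf")
    if "error" not in meta:
        print(meta["mime"], meta["size_bytes"], "bytes")
        print("sha256:", meta["hashes"]["sha256"])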

def text_from_screenshot() -> str:
163def text_from_screenshot() -> str:
164    """
165    Capture a full-screen screenshot, perform OCR, and clean up temp file.
166
167    Returns:
168        str: Normalized OCR-extracted text from the screenshot.
169    """
170    tmp_filename = f"screenshot_{uuid4().hex}.png"
171    tmp_path = os.path.join(tempfile.gettempdir(), tmp_filename)
172
173    try:
174        with mss() as sct:
175            monitor = {"top": 0, "left": 0, "width": 0, "height": 0}
176            for mon in sct.monitors:
177                monitor["left"] = min(mon["left"], monitor["left"])
178                monitor["top"] = min(mon["top"], monitor["top"])
179                monitor["width"] = max(mon["width"] + mon["left"] - monitor["left"], monitor["width"])
180                monitor["height"] = max(mon["height"] + mon["top"] - monitor["top"], monitor["height"])
181            screenshot = sct.grab(monitor)
182
183        img = Image.frombytes("RGB", screenshot.size, screenshot.bgra, "raw", "BGRX")
184        img_gray = img.convert("L")
185        img_gray.save(tmp_path)
186
187        content = text_from_image(tmp_path)
188        return normalize_text(content)
189    finally:
190        if os.path.exists(tmp_path):
191            try:
192                os.remove(tmp_path)
193            except Exception as e:
194                logger.error(f"Failed to delete temp screenshot: {e}")

Capture a full-screen screenshot, perform OCR, and clean up temp file.

Returns: str: Normalized OCR-extracted text from the screenshot.

def text_from_url(url: str, render_js: bool = True) -> Optional[str]:
304def text_from_url(
305    url: str,
306    render_js: bool = True
307) -> Optional[str]:
308    """
309    Fetch and extract all visible text from a web page, including JS-rendered content.
310
311    Args:
312        url (str): Target webpage URL.
313        render_js (bool): Whether to render JavaScript content.
314
315    Returns:
316        Optional[str]: Cleaned full-page text, or None on failure.
317    """
318    headers = {
319        "User-Agent": random.choice(USER_AGENTS),
320        "Accept-Language": "en-US,en;q=0.9",
321        "Referer": url,
322        "DNT": "1",
323        "Upgrade-Insecure-Requests": "1"
324    }
325
326    # Try with requests-html first (with JS rendering)
327    if render_js:
328        try:
329            session = HTMLSession()
330            try:
331                r = session.get(url, headers=headers, timeout=20)
332                
333                # Set shorter timeout for rendering to avoid hanging
334                try:
335                    r.html.render(timeout=10, sleep=1, keep_page=True)
336                except Exception as e:
337                    logger.warning(f"JS rendering failed, falling back to static HTML: {e}")
338                
339                html = r.html.html
340                session.close()
341                content = text_from_html(html)
342                return content
343            except Exception as e:
344                logger.error(f"[Error with HTMLSession] {url} - {e}")
345                session.close()
346                # Fall through to regular requests
347            finally:
348                session.close()
349        except Exception as e:
350            logger.error(f"[Error creating HTMLSession] {e}")
351            # Fall through to regular requests
352    
353    # Fall back to regular requests (no JS rendering)
354    try:
355        response = requests.get(url, headers=headers, timeout=10)
356        response.raise_for_status()
357        html = response.text
358        content = text_from_html(html)
359        return content
360    except Exception as e:
361        logger.error(f"[Error with requests] {url} - {e}")
362        return None

Fetch and extract all visible text from a web page, including JS-rendered content.

Args: url (str): Target webpage URL. render_js (bool): Whether to render JavaScript content.

Returns: Optional[str]: Cleaned full-page text, or None on failure.
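
JS rendering relies on requests-html's bundled Chromium; pass render_js=False to stay on plain requests. The URL below is a placeholder:

    from mrblack import text_from_url

    text = text_from_url("https://example.com/blog", render_js=False)
    if text:
        print(text[:200])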

def text_from_html(html: str) -> str:
271def text_from_html(html: str) -> str:
272    """
273    Extract readable text from raw HTML content.
274
275    Args:
276        html (str): HTML source as a string.
277
278    Returns:
279        str: Cleaned and normalized visible text.
280    """
281    # Check if the input is a file path or HTML content
282    if os.path.isfile(html):
283        with open(html, 'r', encoding='utf-8', errors='ignore') as f:
284            html = f.read()
285    
286    soup = BeautifulSoup(html, "html.parser")
287
288    # Remove non-visible or structural elements
289    for tag in soup([
290        "script", "style",
291        "noscript", "iframe",
292        "meta", "link",
293        "header", "footer",
294        "form", "nav",
295        "aside"
296    ]):
297        tag.decompose()
298
299    text = soup.get_text(separator=" ")
300
301    return normalize_text(text)

Extract readable text from raw HTML content.

Args: html (str): HTML source as a string.

Returns: str: Cleaned and normalized visible text.
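
The function accepts raw markup or, as the path check above shows, a filename; a small sketch with inline HTML:

    from mrblack import text_from_html

    markup = "<html><body><h1>Hello</h1><script>track()</script><p>World</p></body></html>"
    print(text_from_html(markup))   # scripts, styles, nav, etc. are stripped first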

def text_from_audio(audio_file: str) -> Optional[str]:
565def text_from_audio(
566    audio_file: str
567) -> Optional[str]:
568    """
569    Transcribe audio to text using Google Speech Recognition.
570
571    Args:
572        audio_file (str): Path to the input audio file.
573
574    Returns:
575        Optional[str]: Transcription, or None on failure.
576    """
577    def convert_to_wav(file_path: str) -> str:
578        _, ext = os.path.splitext(file_path)
579        ext = ext.lstrip('.')
580        audio = AudioSegment.from_file(file_path, format=ext)
581        tmp_filename = f"audio_{uuid4().hex}.wav"
582        wav_path = os.path.join(tempfile.gettempdir(), tmp_filename)
583        audio.export(wav_path, format='wav')
584        return wav_path
585
586    recognizer = sr.Recognizer()
587    temp_wav_path = None
588    cleanup_needed = False
589
590    try:
591        _, ext = os.path.splitext(audio_file)
592        if ext.lower() not in ['.wav', '.wave']:
593            temp_wav_path = convert_to_wav(audio_file)
594            cleanup_needed = True
595        else:
596            temp_wav_path = clean_path(audio_file)
597
598        if not temp_wav_path:
599            logger.error("Invalid audio path.")
600            return None
601
602        with sr.AudioFile(temp_wav_path) as source:
603            audio = recognizer.record(source)
604        return recognizer.recognize_google(audio)
605
606    except sr.UnknownValueError:
607        logger.error("Could not understand audio.")
608    except sr.RequestError as e:
609        logger.error(f"Speech recognition error: {e}")
610    except Exception as e:
611        logger.error(f"Failed to process audio: {e}")
612    finally:
613        if cleanup_needed and temp_wav_path and os.path.exists(temp_wav_path):
614            try:
615                os.remove(temp_wav_path)
616            except Exception as e:
617                logger.error(f"Failed to delete temp WAV file {temp_wav_path}: {e}")
618
619    return None

Transcribe audio to text using Google Speech Recognition.

Args: audio_file (str): Path to the input audio file.

Returns: Optional[str]: Transcription, or None on failure.
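
Non-WAV input is first converted with pydub (which needs ffmpeg), and transcription calls Google's free recognizer over the network; the filename is a placeholder:

    from mrblack import text_from_audio

    transcript = text_from_audio("meeting.mp3")
    print(transcript or "No transcription available")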

def text_from_pdf(pdf_path: str) -> Optional[str]:
679def text_from_pdf(
680    pdf_path: str
681) -> Optional[str]:
682    """
683    Extract text and OCR results from a PDF using PyMuPDF.
684
685    Args:
686        pdf_path (str): Path to PDF file.
687
688    Returns:
689        Optional[str]: Combined normalized text and image OCR results.
690    """
691    plain_text = ""
692    temp_image_paths: List[str] = []
693
694    try:
695        doc = pymupdf.open(pdf_path)
696        for k, v in doc.metadata.items():
697            plain_text += f"{k}: {v}\n"
698
699        for i in range(len(doc)):
700            page = doc.load_page(i)
701            plain_text += f"\n--- Page {i + 1} ---\n"
702            text = page.get_text()
703            plain_text += text or "[No text]\n"
704
705            for img_index, img in enumerate(page.get_images(full=True), start=1):
706                xref = img[0]
707                base = doc.extract_image(xref)
708                img_bytes = base["image"]
709
710                img_filename = f"pdf_page{i+1}_img{img_index}_{uuid4().hex}.png"
711                img_path = os.path.join(tempfile.gettempdir(), img_filename)
712                temp_image_paths.append(img_path)
713
714                with open(img_path, "wb") as f:
715                    f.write(img_bytes)
716
717                ocr = text_from_image(img_path) or ""
718                plain_text += f"\n[Image {img_index} OCR]\n{ocr}\n"
719
720        # Extract tables from PDF
721        """
722        try:
723            tables = extract_tables_from_pdf(pdf_path)
724            if tables:
725                plain_text += "\n--- Tables ---\n"
726                for i, table in enumerate(tables, 1):
727                    plain_text += f"\n[Table {i}]\n"
728                    if isinstance(table, dict) and "data" in table:
729                        for row in table["data"]:
730                            plain_text += str(row) + "\n"
731                    else:
732                        plain_text += str(table) + "\n"
733        except Exception as e:
734            logger.warning(f"Could not extract tables from PDF: {e}")
735        """
736
737        return normalize_text(plain_text)
738    except Exception as e:
739        logger.error(f"Error processing PDF: {e}")
740        return None
741    finally:
742        for path in temp_image_paths:
743            if os.path.exists(path):
744                try:
745                    os.remove(path)
746                except Exception as e:
747                    logger.error(f"Failed to delete temp image {path}: {e}")
748        if 'doc' in locals():
749            doc.close()

Extract text and OCR results from a PDF using PyMuPDF.

Args: pdf_path (str): Path to PDF file.

Returns: Optional[str]: Combined normalized text and image OCR results.
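
A sketch with a placeholder path; metadata, page text, and OCR of embedded images are concatenated into one normalized string:

    from mrblack import text_from_pdf

    text = text_from_pdf("whitepaper.pdf")
    if text:
        print(text.count("--- Page"), "page markers found")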

def text_from_doc(filepath: str, min_length: int = 4) -> str:
827def text_from_doc(
828    filepath: str,
829    min_length: int = 4
830) -> str:
831    """
832    Extract readable strings and metadata from binary Word (.doc) files.
833
834    Args:
835        filepath (str): Path to .doc file.
836        min_length (int): Minimum string length to extract.
837
838    Returns:
839        str: Metadata and text content.
840    """
841    def extract_printable_strings(
842        data: bytes
843    ) -> List[str]:
844        pattern = re.compile(
845            b'[' + re.escape(bytes(string.printable, 'ascii')) +
846            b']{%d,}' % min_length
847        )
848        found = pattern.findall(data)
849        return list(dict.fromkeys(m.decode(errors='ignore').strip()
850                                   for m in found))
851
852    def clean_strings(
853        strs: List[str]
854    ) -> List[str]:
855        cleaned: List[str] = []
856        skip = ["HYPERLINK", "OLE2", "Normal.dotm"]
857        for line in strs:
858            if any(line.startswith(pref) for pref in skip):
859                continue
860            cleaned.append(re.sub(r'\s+', ' ', line).strip())
861        return cleaned
862
863    with open(filepath, 'rb') as f:
864        data = f.read()
865    strings = extract_printable_strings(data)
866    strings = clean_strings(strings)
867    content = "\n".join(strings)
868    return normalize_text(content)

Extract readable strings and metadata from binary Word (.doc) files.

Args: filepath (str): Path to .doc file. min_length (int): Minimum string length to extract.

Returns: str: Metadata and text content.

def text_from_docx(file_path: str) -> Optional[str]:
871def text_from_docx(
872    file_path: str
873) -> Optional[str]:
874    """
875    Extract text, tables, and OCR from embedded images in a DOCX file.
876
877    Args:
878        file_path (str): Path to the .docx file.
879
880    Returns:
881        Optional[str]: Normalized full text content.
882    """
883    path = clean_path(file_path)
884    if not path:
885        return None
886
887    temp_image_paths: List[str] = []
888    plain_text = ""
889
890    try:
891        doc = Document(path)
892
893        for p in doc.paragraphs:
894            if p.text.strip():
895                plain_text += p.text.strip() + "\n"
896
897        for tbl in doc.tables:
898            plain_text += "\n[Table]\n"
899            for row in tbl.rows:
900                row_text = "\t".join(c.text.strip() for c in row.cells)
901                plain_text += row_text + "\n"
902
903        for rel_id, rel in doc.part.rels.items():
904            if "image" in rel.target_ref:
905                blob = rel.target_part.blob
906
907                img_filename = f"docx_img_{rel_id}_{uuid4().hex}.png"
908                img_path = os.path.join(tempfile.gettempdir(), img_filename)
909                temp_image_paths.append(img_path)
910
911                with open(img_path, "wb") as img_file:
912                    img_file.write(blob)
913
914                ocr = text_from_image(img_path) or ""
915                plain_text += f"\n[Image OCR]\n{ocr}\n"
916
917        return normalize_text(plain_text)
918
919    except Exception as e:
920        logger.error(f"Error processing DOCX: {e}")
921        return None
922    finally:
923        for path in temp_image_paths:
924            if os.path.exists(path):
925                try:
926                    os.remove(path)
927                except Exception as e:
928                    logger.error(f"Failed to delete temp DOCX image {path}: {e}")

Extract text, tables, and OCR from embedded images in a DOCX file.

Args: file_path (str): Path to the .docx file.

Returns: Optional[str]: Normalized full text content.

def text_from_excel(file_path: str) -> str:
931def text_from_excel(
932    file_path: str
933) -> str:
934    """
935    Convert an Excel workbook to CSV text.
936
937    Args:
938        file_path (str): Path to the Excel file.
939
940    Returns:
941        str: CSV-formatted string.
942    """
943    path = clean_path(file_path)
944    if not path:
945        return ""
946    try:
947        # Get all sheets
948        result = ""
949        excel_file = pd.ExcelFile(path)
950        for sheet_name in excel_file.sheet_names:
951            df = pd.read_excel(path, sheet_name=sheet_name)
952            out = StringIO()
953            df.to_csv(out, index=False)
954            result += f"\n--- Sheet: {sheet_name} ---\n"
955            result += out.getvalue()
956            result += "\n"
957        return result
958    except Exception as e:
959        logger.error(f"Failed Excel -> CSV: {e}")
960        return ""

Convert an Excel workbook to CSV text.

Args: file_path (str): Path to the Excel file.

Returns: str: CSV-formatted string.
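
Every sheet is emitted as CSV under a "--- Sheet: name ---" header; a sketch with a placeholder workbook:

    from mrblack import text_from_excel

    csv_text = text_from_excel("budget.xlsx")
    print(csv_text.count("--- Sheet:"), "sheets converted")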

def text_from_image(file_path: str) -> Optional[str]:
963def text_from_image(
964    file_path: str
965) -> Optional[str]:
966    """
967    Perform OCR on an image file.
968
969    Args:
970        file_path (str): Path to the image.
971
972    Returns:
973        Optional[str]: Extracted text, or None on error.
974    """
975    path = clean_path(file_path)
976    if not path:
977        return None
978    try:
979        with Image.open(path) as img:
980            # Improve OCR with preprocessing
981            # 1. Convert to grayscale if it's not already
982            if img.mode != 'L':
983                img = img.convert('L')
984                
985            # 2. Optional: Apply some contrast enhancement
986            # (Disabled by default, enable if needed for specific cases)
987            # from PIL import ImageEnhance
988            # enhancer = ImageEnhance.Contrast(img)
989            # img = enhancer.enhance(1.5)  # Increase contrast
990                
991            # Perform OCR with custom configuration
992            custom_config = r'--oem 3 --psm 6'  # Default OCR Engine Mode and Page Segmentation Mode
993            txt = pytesseract.image_to_string(img, config=custom_config).strip()
994            return normalize_text(txt) or ""
995    except Exception as e:
996        logger.error(f"Failed image OCR: {e}")
997        return None

Perform OCR on an image file.

Args: file_path (str): Path to the image.

Returns: Optional[str]: Extracted text, or None on error.
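
OCR depends on the Tesseract binary being installed alongside pytesseract; the image name is a placeholder:

    from mrblack import text_from_image

    text = text_from_image("receipt.png")
    print(text if text else "OCR produced no text")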

def text_from_any(file_path: str) -> Optional[str]:
1000def text_from_any(
1001    file_path: str
1002) -> Optional[str]:
1003    """
1004    Handle unknown file types by reporting stats and metadata.
1005
1006    Args:
1007        file_path (str): Path to the file.
1008
1009    Returns:
1010        Optional[str]: Plain-text report, or None on error.
1011    """
1012    path = clean_path(file_path)
1013    if not path:
1014        return None
1015    try:
1016        stats = os.stat(path)
1017        info = {
1018            "path": path,
1019            "size": stats.st_size,
1020            "created": datetime.fromtimestamp(stats.st_ctime).isoformat(),
1021            "modified": datetime.fromtimestamp(stats.st_mtime).isoformat(),
1022        }
1023        
1024        # Try to extract EXIF if available
1025        exif = extract_exif(path)
1026        if exif:
1027            info["exif"] = exif
1028            
1029        # Get file hash
1030        with open(path, 'rb') as fh:
1031            info["md5"] = hashlib.md5(fh.read()).hexdigest()
1032        
1033        content = "\n".join(f"{k}: {v}" for k, v in info.items() if k != "exif")
1034        
1035        # Add formatted EXIF data if available
1036        if exif:
1037            content += "\n\nEXIF Data:\n"
1038            for k, v in exif.items():
1039                if isinstance(v, dict):
1040                    content += f"\n{k}:\n"
1041                    for sub_k, sub_v in v.items():
1042                        content += f"  {sub_k}: {sub_v}\n"
1043                else:
1044                    content += f"{k}: {v}\n"
1045                    
1046        return normalize_text(content)
1047    except Exception as e:
1048        logger.error(f"Error on other file: {e}")
1049        return None

Handle unknown file types by reporting stats and metadata.

Args: file_path (str): Path to the file.

Returns: Optional[str]: Plain-text report, or None on error.

def text_from_odt(odt_path: str) -> Optional[str]:
1418def text_from_odt(odt_path: str) -> Optional[str]:
1419    """
1420    Extract text from OpenDocument Text files.
1421    
1422    Args:
1423        odt_path (str): Path to the ODT file
1424        
1425    Returns:
1426        Optional[str]: Extracted text
1427    """
1428    try:
1429        from odf import text, teletype
1430        from odf.opendocument import load
1431        
1432        textdoc = load(odt_path)
1433        
1434        # Extract metadata
1435        meta = []
1436        meta_elem = textdoc.meta
1437        if meta_elem:
1438            for prop in meta_elem.childNodes:
1439                if hasattr(prop, 'tagName') and hasattr(prop, 'childNodes') and prop.childNodes:
1440                    meta.append(f"{prop.tagName}: {teletype.extractText(prop)}")
1441        
1442        # Extract content
1443        allparas = textdoc.getElementsByType(text.P)
1444        content = "\n".join(teletype.extractText(p) for p in allparas)
1445        
1446        # Combine metadata and content
1447        if meta:
1448            final_text = "\n".join(meta) + "\n---\n" + content
1449        else:
1450            final_text = content
1451        
1452        return normalize_text(final_text)
1453    except ImportError:
1454        logger.error("odfpy not installed")
1455        return "odfpy package is required for ODT processing"
1456    except Exception as e:
1457        logger.error(f"Error processing ODT: {e}")
1458        return None

Extract text from OpenDocument Text files.

Args: odt_path (str): Path to the ODT file

Returns: Optional[str]: Extracted text

def text_from_pptx(pptx_path: str) -> Optional[str]:
1375def text_from_pptx(pptx_path: str) -> Optional[str]:
1376    """
1377    Extract text from PowerPoint presentations.
1378    
1379    Args:
1380        pptx_path (str): Path to the PowerPoint file
1381        
1382    Returns:
1383        Optional[str]: Extracted text
1384    """
1385    try:
1386        from pptx import Presentation
1387        
1388        prs = Presentation(pptx_path)
1389        text = ["--- PowerPoint Presentation ---"]
1390        
1391        for i, slide in enumerate(prs.slides, 1):
1392            slide_text = [f"Slide {i}:"]
1393            
1394            # Get slide title if it exists
1395            if slide.shapes.title and slide.shapes.title.text:
1396                slide_text.append(f"Title: {slide.shapes.title.text}")
1397            
1398            # Extract text from all shapes
1399            shape_text = []
1400            for shape in slide.shapes:
1401                if hasattr(shape, "text") and shape.text:
1402                    shape_text.append(shape.text)
1403            
1404            if shape_text:
1405                slide_text.append("\n".join(shape_text))
1406            
1407            text.append("\n".join(slide_text))
1408        
1409        return normalize_text("\n\n".join(text))
1410    except ImportError:
1411        logger.error("python-pptx not installed")
1412        return "python-pptx package is required for PowerPoint processing"
1413    except Exception as e:
1414        logger.error(f"Error processing PowerPoint: {e}")
1415        return None

Extract text from PowerPoint presentations.

Args: pptx_path (str): Path to the PowerPoint file

Returns: Optional[str]: Extracted text

def text_from_epub(epub_path: str) -> Optional[str]:
1328def text_from_epub(epub_path: str) -> Optional[str]:
1329    """
1330    Extract text from EPUB ebooks.
1331    
1332    Args:
1333        epub_path (str): Path to the EPUB file
1334        
1335    Returns:
1336        Optional[str]: Extracted text
1337    """
1338    try:
1339        from ebooklib import epub, ITEM_DOCUMENT
1340        import html2text
1341        
1342        book = epub.read_epub(epub_path)
1343        h = html2text.HTML2Text()
1344        h.ignore_links = False
1345        
1346        content = []
1347        
1348        # Get book metadata
1349        metadata = []
1350        if book.get_metadata('DC', 'title'):
1351            metadata.append(f"Title: {book.get_metadata('DC', 'title')[0][0]}")
1352        if book.get_metadata('DC', 'creator'):
1353            metadata.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}")
1354        if book.get_metadata('DC', 'description'):
1355            metadata.append(f"Description: {book.get_metadata('DC', 'description')[0][0]}")
1356        
1357        if metadata:
1358            content.append("\n".join(metadata))
1359            content.append("---")
1360        
1361        # Get book content
1362        for item in book.get_items():
1363            if item.get_type() == ITEM_DOCUMENT:
1364                content.append(h.handle(item.get_content().decode('utf-8')))
1365        
1366        return normalize_text("\n".join(content))
1367    except ImportError:
1368        logger.error("ebooklib and/or html2text not installed")
1369        return "ebooklib and/or html2text packages are required for EPUB processing"
1370    except Exception as e:
1371        logger.error(f"Error processing EPUB: {e}")
1372        return None

Extract text from EPUB ebooks.

Args: epub_path (str): Path to the EPUB file

Returns: Optional[str]: Extracted text

def analyze_text(text: str) -> Dict[str, Any]:
1284def analyze_text(text: str) -> Dict[str, Any]:
1285    """
1286    Perform basic text analytics.
1287    
1288    Args:
1289        text (str): Input text
1290        
1291    Returns:
1292        Dict: Analysis results
1293    """
1294    try:
1295        # Tokenize text
1296        words = nltk.word_tokenize(text.lower())
1297        sentences = nltk.sent_tokenize(text)
1298        
1299        # Filter out punctuation
1300        words = [word for word in words if word.isalpha()]
1301        
1302        # Count word frequencies
1303        word_freq = Counter(words)
1304        
1305        # Calculate readability metrics
1306        avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
1307        avg_sent_length = len(words) / len(sentences) if sentences else 0
1308        
1309        # Detect language
1310        language = detect_language(text)
1311        
1312        return {
1313            "word_count": len(words),
1314            "sentence_count": len(sentences),
1315            "unique_words": len(set(words)),
1316            "avg_word_length": avg_word_length,
1317            "avg_sentence_length": avg_sent_length,
1318            "language": language,
1319            "most_common_words": word_freq.most_common(20)
1320        }
1321    except Exception as e:
1322        logger.error(f"Text analysis error: {e}")
1323        return {"error": str(e)}

Perform basic text analytics.

Args: text (str): Input text

Returns: Dict: Analysis results
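
A small sketch; word_tokenize assumes the NLTK punkt data has already been downloaded:

    from mrblack import analyze_text

    stats = analyze_text("The quick brown fox jumps over the lazy dog. It was quick.")
    if "error" not in stats:
        print(stats["word_count"], "words,", stats["unique_words"], "unique")
        print("language:", stats["language"])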

def summarize_text(text: str, sentences: int = 5) -> str:
1216def summarize_text(text: str, sentences: int = 5) -> str:
1217    """
1218    Create a simple extractive summary from the text.
1219    
1220    Args:
1221        text (str): Input text to summarize
1222        sentences (int): Number of sentences to include
1223        
1224    Returns:
1225        str: Summarized text
1226    """
1227    try:
1228        from nltk.corpus import stopwords
1229        from nltk.tokenize import sent_tokenize
1230        
1231        # Download required NLTK data if not already present
1232        try:
1233            nltk.data.find('tokenizers/punkt')
1234        except LookupError:
1235            nltk.download('punkt', quiet=True)
1236        try:
1237            nltk.data.find('corpora/stopwords')
1238        except LookupError:
1239            nltk.download('stopwords', quiet=True)
1240        
1241        # Tokenize and calculate word frequencies
1242        stop_words = set(stopwords.words('english'))
1243        sentences_list = sent_tokenize(text)
1244        
1245        # If there are fewer sentences than requested, return all
1246        if len(sentences_list) <= sentences:
1247            return text
1248        
1249        word_frequencies = {}
1250        for sentence in sentences_list:
1251            for word in nltk.word_tokenize(sentence):
1252                word = word.lower()
1253                if word not in stop_words:
1254                    if word not in word_frequencies:
1255                        word_frequencies[word] = 1
1256                    else:
1257                        word_frequencies[word] += 1
1258        
1259        # Normalize frequencies
1260        maximum_frequency = max(word_frequencies.values()) if word_frequencies else 1
1261        for word in word_frequencies:
1262            word_frequencies[word] = word_frequencies[word] / maximum_frequency
1263        
1264        # Score sentences
1265        sentence_scores = {}
1266        for i, sentence in enumerate(sentences_list):
1267            for word in nltk.word_tokenize(sentence.lower()):
1268                if word in word_frequencies:
1269                    if i not in sentence_scores:
1270                        sentence_scores[i] = word_frequencies[word]
1271                    else:
1272                        sentence_scores[i] += word_frequencies[word]
1273        
1274        # Get top N sentences
1275        summary_sentences = sorted(sentence_scores.items(), key=lambda x: x[1], reverse=True)[:sentences]
1276        summary_sentences = [sentences_list[i] for i, _ in sorted(summary_sentences)]
1277        
1278        return ' '.join(summary_sentences)
1279    except Exception as e:
1280        logger.error(f"Summarization error: {e}")
1281        return text

Create a simple extractive summary from the text.

Args: text (str): Input text to summarize. sentences (int): Number of sentences to include.

Returns: str: Summarized text
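
A sketch of the extractive behaviour; article.txt is a placeholder, and anything shorter than the requested sentence count comes back unchanged:

    from mrblack import summarize_text

    with open("article.txt", encoding="utf-8") as f:
        long_text = f.read()
    print(summarize_text(long_text, sentences=3))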

def translate_text(text: str, target_lang: str = 'en') -> Optional[str]:
1162def translate_text(text: str, target_lang: str = "en") -> Optional[str]:
1163    """
1164    Translate text to target language.
1165    
1166    Args:
1167        text (str): Input text to translate
1168        target_lang (str): Target language code (e.g., 'en', 'es', 'fr', 'ja' for Japanese)
1169        
1170    Returns:
1171        Optional[str]: Translated text or None on failure
1172    """
1173    try:
1174        # Use a more stable translation library
1175        # Note: googletrans 4.0.0-rc1 uses async methods which need to be awaited
1176        # Let's use the deep-translator library instead which is more stable
1177        from deep_translator import GoogleTranslator
1178        
1179        # Handle long texts by splitting into chunks (Google has a limit)
1180        max_chunk_size = 4500  # Google Translate has a limit around 5000 chars
1181        chunks = []
1182        
1183        # Split text into chunks of appropriate size (at sentence boundaries if possible)
1184        text_remaining = text
1185        while len(text_remaining) > 0:
1186            if len(text_remaining) <= max_chunk_size:
1187                chunks.append(text_remaining)
1188                break
1189                
1190            # Try to find a sentence boundary near the max chunk size
1191            chunk_end = max_chunk_size
1192            while chunk_end > 0 and text_remaining[chunk_end] not in ['.', '!', '?', '\n']:
1193                chunk_end -= 1
1194                
1195            # If no good sentence boundary found, just use max size
1196            if chunk_end == 0:
1197                chunk_end = max_chunk_size
1198            else:
1199                chunk_end += 1  # Include the period or boundary character
1200                
1201            chunks.append(text_remaining[:chunk_end])
1202            text_remaining = text_remaining[chunk_end:]
1203            
1204        # Translate each chunk and combine
1205        translated_chunks = []
1206        for chunk in chunks:
1207            translated_chunk = GoogleTranslator(source='auto', target=target_lang).translate(chunk)
1208            translated_chunks.append(translated_chunk)
1209            
1210        return ' '.join(translated_chunks)
1211    except Exception as e:
1212        logger.error(f"Translation error: {e}")
1213        return None

Translate text to target language.

Args: text (str): Input text to translate. target_lang (str): Target language code (e.g., 'en', 'es', 'fr', or 'ja' for Japanese).

Returns: Optional[str]: Translated text or None on failure
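
A sketch; deep-translator must be installed and the call needs network access. Long inputs are split near sentence boundaries into roughly 4,500-character chunks before translation:

    from mrblack import translate_text

    print(translate_text("Bonjour tout le monde", target_lang="en"))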

def list_available_languages() -> Dict[str, str]:
1134def list_available_languages() -> Dict[str, str]:
1135    """
1136    Get a dictionary of available languages for translation.
1137
1138    Returns:
1139        Dict[str, str]: Dictionary mapping language codes to language names
1140    """
1141    try:
1142        from deep_translator import GoogleTranslator
1143        # Get available languages from the translator
1144        languages = GoogleTranslator().get_supported_languages(as_dict=True)
1145        return languages
1146    except Exception as e:
1147        logger.error(f"Error getting language list: {e}")
1148        # Return a small subset as fallback
1149        return {
1150            "en": "English",
1151            "es": "Spanish",
1152            "fr": "French",
1153            "de": "German",
1154            "it": "Italian",
1155            "ja": "Japanese",
1156            "ko": "Korean",
1157            "zh-cn": "Chinese (Simplified)",
1158            "ru": "Russian",
1159            "ar": "Arabic"
1160        }

Get a dictionary of available languages for translation.

Returns: Dict[str, str]: Dictionary mapping language codes to language names

def detect_language(text: str) -> str:
1117def detect_language(text: str) -> str:
1118    """
1119    Detect the language of the extracted text.
1120    
1121    Args:
1122        text (str): Input text
1123        
1124    Returns:
1125        str: Detected language code or 'unknown'
1126    """
1127    try:
1128        import langdetect
1129        return langdetect.detect(text)
1130    except Exception:
1131        logger.warning("Language detection failed or langdetect not installed")
1132        return "unknown"

Detect the language of the extracted text.

Args: text (str): Input text

Returns: str: Detected language code or 'unknown'
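
The two helpers above pair naturally: detect the source language, then check how many targets the translator offers. langdetect and deep-translator are optional dependencies:

    from mrblack import detect_language, list_available_languages

    print(detect_language("Guten Morgen, wie geht es dir?"))   # "de", or "unknown" on failure
    print(len(list_available_languages()), "translation languages available")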

def scrape_website(url: str, max_pages: int = 1, stay_on_domain: bool = True) -> Dict[str, str]:
365def scrape_website(url: str, max_pages: int = 1, stay_on_domain: bool = True) -> Dict[str, str]:
366    """
367    Scrape multiple pages of a website.
368    
369    Args:
370        url (str): Starting URL
371        max_pages (int): Maximum pages to scrape
372        stay_on_domain (bool): Whether to stay on the same domain
373        
374    Returns:
375        Dict[str, str]: Dictionary mapping URLs to extracted text
376    """
377    results = {}
378    visited = set()
379    to_visit = [url]
380    base_domain = urlparse(url).netloc
381    
382    while to_visit and len(visited) < max_pages:
383        current_url = to_visit.pop(0)
384        if current_url in visited:
385            continue
386            
387        # Extract text from current page
388        text = text_from_url(current_url)
389        if text:
390            results[current_url] = text
391            
392        visited.add(current_url)
393        
394        # Find links on the page
395        session = HTMLSession()
396        try:
397            r = session.get(current_url)
398            r.html.render(timeout=20, sleep=1)
399            
400            links = r.html.absolute_links
401            for link in links:
402                link_domain = urlparse(link).netloc
403                if link not in visited and link not in to_visit:
404                    # Check if we should follow this link
405                    if stay_on_domain and link_domain != base_domain:
406                        continue
407                    to_visit.append(link)
408        except Exception as e:
409            logger.error(f"Error scraping {current_url}: {e}")
410        finally:
411            session.close()
412    
413    return results

Scrape multiple pages of a website.

Args: url (str): Starting URL. max_pages (int): Maximum pages to scrape. stay_on_domain (bool): Whether to stay on the same domain.

Returns: Dict[str, str]: Dictionary mapping URLs to extracted text
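
A sketch that crawls a few pages from a placeholder site; following links renders each page, so the requests-html Chromium runtime is needed:

    from mrblack import scrape_website

    pages = scrape_website("https://example.com", max_pages=3, stay_on_domain=True)
    for url, page_text in pages.items():
        print(url, len(page_text), "characters")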

def normalize_text(text: str) -> str:
124def normalize_text(
125    text: str
126) -> str:
127    """
128    Collapse runs of spaces, tabs, newlines, and carriage returns into
129    single occurrences and strip leading spaces from each line.
130
131    Args:
132        text (str): Raw input text.
133
134    Returns:
135        str: Normalized text.
136    """
137    if not text:
138        return ""
139    text = unicodedata.normalize("NFKC", text)
140    text = re.sub(r' +', ' ', text)
141    text = re.sub(r'\n+', '\n', text)
142    text = re.sub(r'(?m)(^ \n)+', '\n', text)
143    text = re.sub(r'\t+', '\t', text)
144    text = re.sub(r'\r+', '\n', text)
145    text = re.sub(r"^ ", "", text, flags=re.MULTILINE)
146    return text 

Collapse runs of spaces, tabs, newlines, and carriage returns into single occurrences, stripping leading spaces from each line.

Args: text (str): Raw input text.

Returns: str: Normalized text.
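
A small sketch of the whitespace collapsing:

    from mrblack import normalize_text

    messy = "Header\n\n\n   indented   line\t\twith   gaps"
    print(repr(normalize_text(messy)))
    # runs of blank lines, tabs, and spaces collapse; leading spaces are stripped per line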