Module scrapetools.email_scraper

Expand source code
from string import printable
from urllib.parse import unquote


def validate(email: str) -> bool:
    """Check whether a string is likely to be an email address.

    This is a heuristic filter, not an RFC validator: some addresses that
    break these rules are technically valid, but are practically never
    seen in use out in the wild.

    Rules (all must hold):
      * exactly one '@' and at least one '.'
      * '@' comes before the last '.' (i.e. the domain contains a '.')
      * local part is between 2 and 64 characters long
      * domain part is more than 3 characters long
      * domain part doesn't contain any '_'
      * at least 1 character in the local part is alphabetical
      * first and last characters are not '@' or '.'
      * character after '@' is not '.'
      * doesn't start with 'www.'
      * neither the domain nor the local part consists only of numbers
      * no consecutive '.' anywhere in the address
      * domain doesn't contain a listed file extension

    Args:
        email: The candidate address.

    Returns:
        True if every rule passes, otherwise False. A string that ends
        with '@' (empty domain) returns False rather than raising.
    """
    if email.count("@") != 1 or "." not in email:
        return False
    local, domain = email.split("@")
    # An address ending in '@' has an empty domain and can never be valid.
    # Bail out before indexing into the domain, which would raise IndexError.
    if not domain:
        return False
    file_exts = (
        ".png",
        ".jpg",
        ".js",
        ".html",
        ".svg",
        ".jpeg",
        ".mp4",
        ".mpeg",
        ".css",
        ".pdf",
        ".wav",
        ".docx",
        ".txt",
        ".rtf",
        ".gif",
        ".webp",
        ".x.x",
    )
    return (
        # '@' before the last '.' is the same as the domain holding a '.'.
        "." in domain
        # The lower bound also rules out an address starting with '@'.
        and 2 <= len(local) <= 64
        and len(domain) > 3
        and "_" not in domain
        and any(ch.isalpha() for ch in local)
        and email[0] != "."
        and email[-1] != "."
        and domain[0] != "."
        and not email.startswith("www.")
        # all() is True for an empty sequence, so a domain or local part
        # made only of dots is rejected here as well.
        and not all(ch.isnumeric() for ch in domain.replace(".", ""))
        and not all(ch.isnumeric() for ch in local.replace(".", ""))
        and ".." not in email
        and not any(ext in domain for ext in file_exts)
    )


def find_last_valid_character_offset(text: str) -> int:
    """Count the valid characters adjacent to the '@' at one end of ``text``.

    ``text`` must either start or end with '@'. When it ends with '@'
    (and does not start with one) it is scanned backwards from the '@';
    otherwise it is scanned forwards.

    e.g. '@abcde%' will return 5 and '#123@' will return 3.

    Args:
        text: A string whose first or last character is '@'.

    Returns:
        The number of valid characters between the '@' and the first
        invalid character. If no invalid character is found, returns
        ``len(text) - 1``.

    Raises:
        ValueError: If ``text`` is empty or neither starts nor ends
            with '@'.
    """
    # Technically some of these characters are valid in an email string,
    # but the ratio of how often they're used to how often they produce
    # false positives makes them worth disregarding.
    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
    if not text or "@" not in (text[0], text[-1]):
        raise ValueError(
            'First or last character of text arg needs to be "@"\n'
            f"Argument {text} is invalid."
        )
    if text[0] != "@":
        # The '@' is at the end: reverse so the scan always starts at it.
        text = text[::-1]
    for offset, ch in enumerate(text[1:]):
        if ch in invalid_characters or ch not in printable:
            return offset
    return len(text) - 1


def strip_unicode(emails: list[str]) -> list[str]:
    """Remove unicode escape residue picked up in front of email addresses.

    The tokens 'u003e' and 'u00a0' are what remains of the escape
    sequences for '>' (U+003E) and the non-breaking space (U+00A0).
    When one of them occurs in the local part of an address, everything
    up to and including the token is dropped.

    Args:
        emails: Addresses to clean. The list itself is not modified.

    Returns:
        A new list with the cleaned addresses, in the same order. An
        address is left untouched if it holds no token in its local part
        or if stripping would leave the local part empty.
    """
    stripped_emails = []
    for email in emails:
        for token in ("u003e", "u00a0"):
            local, at, domain = email.partition("@")
            # Cut at the token itself so no character of the real address
            # is lost; rpartition clears repeated residue in one pass.
            _, found, remainder = local.rpartition(token)
            if found and remainder:
                email = remainder + at + domain
        stripped_emails.append(email)
    return stripped_emails


def scrape_emails(text: str) -> list[str]:
    """Pull every plausible email address out of ``text``.

    Percent-encoding is decoded first and newlines, tabs and carriage
    returns are turned into spaces. Each '@' is then examined in turn:
    the run of valid characters on either side of it forms a candidate,
    trailing non-letters are trimmed, and the candidate is kept
    (lower-cased) when ``validate`` accepts it.

    Returns a sorted list of unique addresses after ``strip_unicode``
    has been applied; the list is empty when ``text`` contains no '@'.
    """
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for whitespace in ("\n", "\t", "\r"):
        text = text.replace(whitespace, " ")
    total_ats = text.count("@")
    if total_ats == 0:
        return []
    found = []
    cursor = 0
    for _ in range(total_ats):
        at_pos = text.find("@", cursor)
        following_at = text.find("@", at_pos + 1)
        try:
            # The segment runs from the cursor up to (not including) the
            # next '@', so it contains exactly one '@'.
            if following_at == -1:
                segment = text[cursor:]
            else:
                segment = text[cursor:following_at]
            seg_at = segment.find("@")
            left_len = find_last_valid_character_offset(segment[: seg_at + 1])
            right_len = find_last_valid_character_offset(segment[seg_at:])
            candidate = segment[seg_at - left_len : right_len + seg_at + 1]
            while candidate[-1].isnumeric() or not candidate[-1].isalpha():
                candidate = candidate[:-1]
            if validate(candidate):
                found.append(candidate.lower())
            # The extra '+ 1' ensures the cursor still advances when the
            # part after '@' is empty.
            cursor = at_pos + len(candidate.split("@")[1]) + 1
        except Exception:
            # Best effort: any failure on this '@' just skips past it.
            cursor = at_pos + 1
    return sorted(set(strip_unicode(found)))

Functions

def find_last_valid_character_offset(text: str) ‑> int

Iterates through a string to find the index of the last valid character, assuming that string either starts or ends with '@'.

If the string neither starts nor ends with '@', a ValueError is raised.

Returns the number of valid characters between '@' and the first invalid character, e.g. '@abcde%' returns 5 and '#123@' returns 3.

If no invalid characters are found, the function will return 'len(text)-1'.

Expand source code
def find_last_valid_character_offset(text: str) -> int:
    """Count the valid characters adjacent to the '@' at one end of ``text``.

    ``text`` must either start or end with '@'. When it ends with '@'
    (and does not start with one) it is scanned backwards from the '@';
    otherwise it is scanned forwards.

    e.g. '@abcde%' will return 5 and '#123@' will return 3.

    Args:
        text: A string whose first or last character is '@'.

    Returns:
        The number of valid characters between the '@' and the first
        invalid character. If no invalid character is found, returns
        ``len(text) - 1``.

    Raises:
        ValueError: If ``text`` is empty or neither starts nor ends
            with '@'.
    """
    # Technically some of these characters are valid in an email string,
    # but the ratio of how often they're used to how often they produce
    # false positives makes them worth disregarding.
    invalid_characters = " <>[]{},\"':;\\/#$%^&*()=+`?|\n\t\r"
    if not text or "@" not in (text[0], text[-1]):
        raise ValueError(
            'First or last character of text arg needs to be "@"\n'
            f"Argument {text} is invalid."
        )
    if text[0] != "@":
        # The '@' is at the end: reverse so the scan always starts at it.
        text = text[::-1]
    for offset, ch in enumerate(text[1:]):
        if ch in invalid_characters or ch not in printable:
            return offset
    return len(text) - 1
def scrape_emails(text: str) ‑> list[str]

Extracts potential emails from given text and returns as a list of strings.

Expand source code
def scrape_emails(text: str) -> list[str]:
    """Pull every plausible email address out of ``text``.

    Percent-encoding is decoded first and newlines, tabs and carriage
    returns are turned into spaces. Each '@' is then examined in turn:
    the run of valid characters on either side of it forms a candidate,
    trailing non-letters are trimmed, and the candidate is kept
    (lower-cased) when ``validate`` accepts it.

    Returns a sorted list of unique addresses after ``strip_unicode``
    has been applied; the list is empty when ``text`` contains no '@'.
    """
    if "%" in text:
        # decode percent encoding
        text = unquote(text)
    for whitespace in ("\n", "\t", "\r"):
        text = text.replace(whitespace, " ")
    total_ats = text.count("@")
    if total_ats == 0:
        return []
    found = []
    cursor = 0
    for _ in range(total_ats):
        at_pos = text.find("@", cursor)
        following_at = text.find("@", at_pos + 1)
        try:
            # The segment runs from the cursor up to (not including) the
            # next '@', so it contains exactly one '@'.
            if following_at == -1:
                segment = text[cursor:]
            else:
                segment = text[cursor:following_at]
            seg_at = segment.find("@")
            left_len = find_last_valid_character_offset(segment[: seg_at + 1])
            right_len = find_last_valid_character_offset(segment[seg_at:])
            candidate = segment[seg_at - left_len : right_len + seg_at + 1]
            while candidate[-1].isnumeric() or not candidate[-1].isalpha():
                candidate = candidate[:-1]
            if validate(candidate):
                found.append(candidate.lower())
            # The extra '+ 1' ensures the cursor still advances when the
            # part after '@' is empty.
            cursor = at_pos + len(candidate.split("@")[1]) + 1
        except Exception:
            # Best effort: any failure on this '@' just skips past it.
            cursor = at_pos + 1
    return sorted(set(strip_unicode(found)))
def strip_unicode(emails: list[str]) ‑> list[str]

Removes unicode text that often gets picked up at the front of email addresses and returns the list.

Expand source code
def strip_unicode(emails: list[str]) -> list[str]:
    """Remove unicode escape residue picked up in front of email addresses.

    The tokens 'u003e' and 'u00a0' are what remains of the escape
    sequences for '>' (U+003E) and the non-breaking space (U+00A0).
    When one of them occurs in the local part of an address, everything
    up to and including the token is dropped.

    Args:
        emails: Addresses to clean. The list itself is not modified.

    Returns:
        A new list with the cleaned addresses, in the same order. An
        address is left untouched if it holds no token in its local part
        or if stripping would leave the local part empty.
    """
    stripped_emails = []
    for email in emails:
        for token in ("u003e", "u00a0"):
            local, at, domain = email.partition("@")
            # Cut at the token itself so no character of the real address
            # is lost; rpartition clears repeated residue in one pass.
            _, found, remainder = local.rpartition(token)
            if found and remainder:
                email = remainder + at + domain
        stripped_emails.append(email)
    return stripped_emails
def validate(email: str) ‑> bool

Checks string to see if it's likely an email address.

Returns True or False.

Some addresses that violate these rules may technically be valid, but they are practically never seen in the wild.

Expand source code
def validate(email: str) -> bool:
    """Check whether a string is likely to be an email address.

    This is a heuristic filter, not an RFC validator: some addresses that
    break these rules are technically valid, but are practically never
    seen in use out in the wild.

    Rules (all must hold):
      * exactly one '@' and at least one '.'
      * '@' comes before the last '.' (i.e. the domain contains a '.')
      * local part is between 2 and 64 characters long
      * domain part is more than 3 characters long
      * domain part doesn't contain any '_'
      * at least 1 character in the local part is alphabetical
      * first and last characters are not '@' or '.'
      * character after '@' is not '.'
      * doesn't start with 'www.'
      * neither the domain nor the local part consists only of numbers
      * no consecutive '.' anywhere in the address
      * domain doesn't contain a listed file extension

    Args:
        email: The candidate address.

    Returns:
        True if every rule passes, otherwise False. A string that ends
        with '@' (empty domain) returns False rather than raising.
    """
    if email.count("@") != 1 or "." not in email:
        return False
    local, domain = email.split("@")
    # An address ending in '@' has an empty domain and can never be valid.
    # Bail out before indexing into the domain, which would raise IndexError.
    if not domain:
        return False
    file_exts = (
        ".png",
        ".jpg",
        ".js",
        ".html",
        ".svg",
        ".jpeg",
        ".mp4",
        ".mpeg",
        ".css",
        ".pdf",
        ".wav",
        ".docx",
        ".txt",
        ".rtf",
        ".gif",
        ".webp",
        ".x.x",
    )
    return (
        # '@' before the last '.' is the same as the domain holding a '.'.
        "." in domain
        # The lower bound also rules out an address starting with '@'.
        and 2 <= len(local) <= 64
        and len(domain) > 3
        and "_" not in domain
        and any(ch.isalpha() for ch in local)
        and email[0] != "."
        and email[-1] != "."
        and domain[0] != "."
        and not email.startswith("www.")
        # all() is True for an empty sequence, so a domain or local part
        # made only of dots is rejected here as well.
        and not all(ch.isnumeric() for ch in domain.replace(".", ""))
        and not all(ch.isnumeric() for ch in local.replace(".", ""))
        and ".." not in email
        and not any(ext in domain for ext in file_exts)
    )