Module scrapetools.phone_scraper

Expand source code
import phonenumbers


def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
    """Count the unbroken run of numeric characters at one end of ``text``.

    Characters are examined from the start of ``text`` (or from the end when
    ``reverse`` is True) until the first one for which ``str.isnumeric()`` is
    false. The number of characters examined before that point is returned;
    if every character is numeric the result is ``len(text)``.
    """
    # reversed() walks the string lazily, so scanning from the end does not
    # require building a reversed copy of the whole input.
    chars = reversed(text) if reverse else text
    for i, ch in enumerate(chars):
        if not ch.isnumeric():
            return i
    return len(text)


def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers in ``text`` that use ``separator``.

    Every occurrence of ``separator`` is examined and the runs of numeric
    characters around it are compared against these layouts:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    ``separator`` is expected to be a single character (typically '-' or
    '.'); the index arithmetic below relies on that.

    Returns the matches in order of appearance, each stripped down to its
    10 numeric characters. Duplicates are not removed.
    """
    numbers = []
    search_from = 0
    while True:
        sepdex = text.find(separator, search_from)
        if sepdex == -1:
            break
        # Unless a number is accepted below, resume at the very next
        # character so that a separator which starts a later number
        # (e.g. the second '-' in "1-800-555-1234") is never skipped.
        search_from = sepdex + 1

        # Numeric characters immediately before the separator. A window of
        # four characters is enough to tell "exactly 3" from "more than 3".
        start_offset = get_num_consecutive_numbers(
            text[max(0, sepdex - 4) : sepdex], reverse=True
        )
        # Numeric characters immediately after the separator.
        first_stop_offset = get_num_consecutive_numbers(text[sepdex + 1 : sepdex + 5])
        # Numeric characters after a second separator. They only count when
        # that separator sits directly behind a 3-character group.
        if text[sepdex + 4 : sepdex + 5] == separator:
            second_stop_offset = get_num_consecutive_numbers(
                text[sepdex + 5 : sepdex + 10]
            )
        else:
            second_stop_offset = 0

        number = ""
        # End (exclusive) of the two-separator layouts: xxx{sep}xxxx follows.
        stopdex = sepdex + 9
        if start_offset == 3 and first_stop_offset == 3 and second_stop_offset == 4:
            # xxx{separator}xxx{separator}xxxx
            number = text[sepdex - 3 : stopdex]
        elif (
            start_offset == 0
            and first_stop_offset == 3
            and second_stop_offset == 4
            # Guard the look-behind so it cannot wrap to the end of text.
            and sepdex >= 5
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            number = text[sepdex - 5 : stopdex]
        elif start_offset == 3 and sepdex >= 4 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            # The characters after the separator are vetted by the
            # length/numeric check below.
            stopdex = sepdex + 5
            number = text[max(0, sepdex - 8) : stopdex]

        for ch in (separator, "(", ")", " "):
            number = number.replace(ch, "")
        if len(number) == 10 and all(ch.isnumeric() for ch in number):
            numbers.append(number)
            # Continue after the accepted number so it is not re-examined.
            search_from = stopdex
    return numbers


def find_by_href(text: str) -> list[str]:
    """Scrape phone numbers from double-quoted ``href`` attributes.

    Each ``href="..."`` value that starts with ``tel:`` or ``callto:``
    (compared case-insensitively, ignoring surrounding whitespace) is reduced
    to its numeric characters; values that yield exactly 10 of them are
    returned in order of appearance. Duplicates are not removed.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers = []
    start = text.find(indicator)
    while start != -1:
        value_start = start + len(indicator)
        value_end = text.find('"', value_start)
        if value_end == -1:
            # Unterminated attribute: there is no reliable value to read,
            # and no later attribute can be terminated either.
            break
        value = text[value_start:value_end].strip()
        for prefix in prefixes:
            if value[: len(prefix)].lower() == prefix:
                # Take the digits after the matched scheme, not after
                # whichever colon happens to come first.
                digits = "".join(ch for ch in value[len(prefix) :] if ch.isnumeric())
                if len(digits) == 10:
                    numbers.append(digits)
                break
        start = text.find(indicator, value_end + 1)
    return numbers


def scrape_phone_numbers(text: str) -> list[str]:
    """Collect the valid U.S. phone numbers found in ``text``.

    Every "+1" is removed from the text, candidates are gathered from the
    '-' and '.' separator scans and from href attributes, and only those
    that ``phonenumbers`` accepts as valid "+1" numbers are kept.
    Returns the unique 10-digit strings in sorted order.
    """
    cleaned = text.replace("+1", "")
    candidates = []
    for separator in ("-", "."):
        candidates += find_by_separator(cleaned, separator)
    candidates += find_by_href(cleaned)

    valid = set()
    for candidate in candidates:
        parsed = phonenumbers.parse("+1" + candidate)
        if phonenumbers.is_valid_number(parsed):
            valid.add(candidate)
    return sorted(valid)

Functions

def find_by_href(text: str) ‑> list[str]

Scrapes phone numbers by href attribute.

Expand source code
def find_by_href(text: str) -> list[str]:
    """Scrape phone numbers from double-quoted ``href`` attributes.

    Each ``href="..."`` value that starts with ``tel:`` or ``callto:``
    (compared case-insensitively, ignoring surrounding whitespace) is reduced
    to its numeric characters; values that yield exactly 10 of them are
    returned in order of appearance. Duplicates are not removed.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers = []
    start = text.find(indicator)
    while start != -1:
        value_start = start + len(indicator)
        value_end = text.find('"', value_start)
        if value_end == -1:
            # Unterminated attribute: there is no reliable value to read,
            # and no later attribute can be terminated either.
            break
        value = text[value_start:value_end].strip()
        for prefix in prefixes:
            if value[: len(prefix)].lower() == prefix:
                # Take the digits after the matched scheme, not after
                # whichever colon happens to come first.
                digits = "".join(ch for ch in value[len(prefix) :] if ch.isnumeric())
                if len(digits) == 10:
                    numbers.append(digits)
                break
        start = text.find(indicator, value_end + 1)
    return numbers
def find_by_separator(text: str, separator: str) ‑> list[str]

Attempts to detect phone numbers matching the patterns below by scanning for separators (typically '-' or '.') and counting the consecutive digits that precede or follow them:

(xxx)xxx{separator}xxxx

(xxx) xxx{separator}xxxx

(xxx){separator}xxx{separator}xxxx

xxx{separator}xxx{separator}xxxx

Expand source code
def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers in ``text`` that use ``separator``.

    Every occurrence of ``separator`` is examined and the runs of numeric
    characters around it are compared against these layouts:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    ``separator`` is expected to be a single character (typically '-' or
    '.'); the index arithmetic below relies on that.

    Returns the matches in order of appearance, each stripped down to its
    10 numeric characters. Duplicates are not removed.
    """
    numbers = []
    search_from = 0
    while True:
        sepdex = text.find(separator, search_from)
        if sepdex == -1:
            break
        # Unless a number is accepted below, resume at the very next
        # character so that a separator which starts a later number
        # (e.g. the second '-' in "1-800-555-1234") is never skipped.
        search_from = sepdex + 1

        # Numeric characters immediately before the separator. A window of
        # four characters is enough to tell "exactly 3" from "more than 3".
        start_offset = get_num_consecutive_numbers(
            text[max(0, sepdex - 4) : sepdex], reverse=True
        )
        # Numeric characters immediately after the separator.
        first_stop_offset = get_num_consecutive_numbers(text[sepdex + 1 : sepdex + 5])
        # Numeric characters after a second separator. They only count when
        # that separator sits directly behind a 3-character group.
        if text[sepdex + 4 : sepdex + 5] == separator:
            second_stop_offset = get_num_consecutive_numbers(
                text[sepdex + 5 : sepdex + 10]
            )
        else:
            second_stop_offset = 0

        number = ""
        # End (exclusive) of the two-separator layouts: xxx{sep}xxxx follows.
        stopdex = sepdex + 9
        if start_offset == 3 and first_stop_offset == 3 and second_stop_offset == 4:
            # xxx{separator}xxx{separator}xxxx
            number = text[sepdex - 3 : stopdex]
        elif (
            start_offset == 0
            and first_stop_offset == 3
            and second_stop_offset == 4
            # Guard the look-behind so it cannot wrap to the end of text.
            and sepdex >= 5
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            number = text[sepdex - 5 : stopdex]
        elif start_offset == 3 and sepdex >= 4 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            # The characters after the separator are vetted by the
            # length/numeric check below.
            stopdex = sepdex + 5
            number = text[max(0, sepdex - 8) : stopdex]

        for ch in (separator, "(", ")", " "):
            number = number.replace(ch, "")
        if len(number) == 10 and all(ch.isnumeric() for ch in number):
            numbers.append(number)
            # Continue after the accepted number so it is not re-examined.
            search_from = stopdex
    return numbers
def get_num_consecutive_numbers(text: str, reverse: bool = False) ‑> int

Finds the number of consecutive numeric characters in a string.

Expand source code
def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
    """Count the unbroken run of numeric characters at one end of ``text``.

    Characters are examined from the start of ``text`` (or from the end when
    ``reverse`` is True) until the first one for which ``str.isnumeric()`` is
    false. The number of characters examined before that point is returned;
    if every character is numeric the result is ``len(text)``.
    """
    # reversed() walks the string lazily, so scanning from the end does not
    # require building a reversed copy of the whole input.
    chars = reversed(text) if reverse else text
    for i, ch in enumerate(chars):
        if not ch.isnumeric():
            return i
    return len(text)
def scrape_phone_numbers(text: str) ‑> list[str]

Scrape for U.S. phone numbers.

Expand source code
def scrape_phone_numbers(text: str) -> list[str]:
    """Collect the valid U.S. phone numbers found in ``text``.

    Every "+1" is removed from the text, candidates are gathered from the
    '-' and '.' separator scans and from href attributes, and only those
    that ``phonenumbers`` accepts as valid "+1" numbers are kept.
    Returns the unique 10-digit strings in sorted order.
    """
    cleaned = text.replace("+1", "")
    candidates = []
    for separator in ("-", "."):
        candidates += find_by_separator(cleaned, separator)
    candidates += find_by_href(cleaned)

    valid = set()
    for candidate in candidates:
        parsed = phonenumbers.parse("+1" + candidate)
        if phonenumbers.is_valid_number(parsed):
            valid.add(candidate)
    return sorted(valid)