scrapetools.phone_scraper

  1import re
  2
  3import phonenumbers
  4
  5
  6def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
  7    """Finds the number of consecutive numeric characters in a string."""
  8    # limit search to 10 characters
  9    text[:10]
 10    if reverse:
 11        text = text[::-1]
 12    for i, ch in enumerate(text):
 13        if not ch.isnumeric():
 14            return i
 15    return len(text)
 16
 17
 18def find_by_separator(text: str, separator: str) -> list[str]:
 19    """Attempts to detect phone numbers according to these
 20    patterns by scanning for separators (typically '-.')
 21    and how many consecutive numbers follow or precede them:
 22
 23    (xxx)xxx{separator}xxxx
 24
 25    (xxx) xxx{separator}xxxx
 26
 27    (xxx){separator}xxx{separator}xxxx
 28
 29    xxx{separator}xxx{separator}xxxx"""
 30    count = text.count(separator)
 31    numbers = []
 32    if count > 0:
 33        last_stopdex = 0
 34        for _ in range(count):
 35            number = ""
 36            sepdex = text.find(separator, last_stopdex)
 37            if sepdex != -1:
 38                next_sepdex = text.find(separator, sepdex + 1)
 39                # consecutive numbers preceding sepdex
 40                start_offset = get_num_consecutive_numbers(
 41                    text[last_stopdex:sepdex], reverse=True
 42                )
 43                # consecutive numbers between sepdex and next_sepdex
 44                first_stop_offset = get_num_consecutive_numbers(
 45                    text[sepdex + 1 : next_sepdex + 1]
 46                )
 47                # consecutive numbers after next_sepdex
 48                second_stop_offset = get_num_consecutive_numbers(
 49                    text[next_sepdex + 1 :]
 50                )
 51
 52                if (
 53                    start_offset == 3
 54                    and first_stop_offset == 3
 55                    and second_stop_offset == 4
 56                ):
 57                    # xxx{separator}xxx{separator}xxxx
 58                    number = text[
 59                        sepdex - start_offset : next_sepdex + second_stop_offset + 1
 60                    ]
 61                elif (
 62                    start_offset == 0
 63                    and first_stop_offset == 3
 64                    and second_stop_offset == 4
 65                    and text[sepdex - 1] == ")"
 66                    and text[sepdex - 5] == "("
 67                ):
 68                    # (xxx){separator}xxx{separator}xxxx
 69                    number = text[
 70                        sepdex - 5 : sepdex + first_stop_offset + second_stop_offset + 2
 71                    ]
 72                elif start_offset == 3 and text[sepdex - 4] in [")", " "]:
 73                    # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
 74                    number = text[sepdex - 8 : sepdex + 5]
 75                last_stopdex = sepdex + 5
 76                for ch in [separator, "(", ")", " "]:
 77                    number = number.replace(ch, "")
 78                if len(number) == 10 and all(ch.isnumeric() for ch in number):
 79                    numbers.append(number)
 80    return numbers
 81
 82
 83def find_by_href(text: str) -> list[str]:
 84    """Scrapes phone numbers by href attribute."""
 85    indicator = 'href="'
 86    count = text.count(indicator)
 87    prefixes = ["tel:", "callto:"]
 88    index = 0
 89    numbers = []
 90    for _ in range(count):
 91        index = text.find(indicator, index + 1)
 92        number = text[index + len(indicator) : text.find('"', index + len(indicator))]
 93        if any(prefix in number for prefix in prefixes):
 94            number = "".join(
 95                [num for num in number[number.find(":") + 1 :] if num.isnumeric()]
 96            )
 97            if len(number) == 10:
 98                numbers.append(number)
 99    return numbers
100
101
def scrape_phone_numbers_noregex(text: str) -> list[str]:
    """Collect U.S. phone numbers from ``text`` without regular expressions.

    Every "+1" is removed from the text, then candidates are gathered with
    ``find_by_separator`` (for '-' and '.') and ``find_by_href``. Only
    candidates that ``phonenumbers`` accepts as valid with a "+1" prefix are
    kept.

    Returns:
        The distinct valid numbers, sorted.
    """
    stripped = text.replace("+1", "")
    candidates = [
        *find_by_separator(stripped, "-"),
        *find_by_separator(stripped, "."),
        *find_by_href(stripped),
    ]
    valid = {
        candidate
        for candidate in candidates
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + candidate))
    }
    return sorted(valid)
116
117
def scrape_phone_numbers(text: str) -> list[str]:
    """Collect phone numbers from ``text`` with a regular expression.

    Matches are reduced to their digits and kept only when ``phonenumbers``
    accepts them as valid with a "+1" prefix.

    Returns:
        The distinct valid numbers, sorted.
    """
    # What the pattern requires, left to right:
    # - not preceded by an alphanumeric character and not followed by a
    #   digit, which avoids digit strings inside long urls, floats, etc.
    # - an optional '(', one digit 1-9, two digits 0-9, an optional ')'
    # - an optional ' ', '.', or '-'
    # - one digit 1-9, two digits 0-9
    # - an optional ' ', '.', or '-'
    # - four digits 0-9
    pattern = r"(?<![0-9a-zA-Z])([(]?[1-9]{1}[0-9]{2}[)]?[ .-]?[1-9]{1}[0-9]{2}[ .-]?[0-9]{4})(?![0-9])"
    valid = set()
    for match in re.findall(pattern, text):
        digits = re.sub(r"[^0-9]", "", match)
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + digits)):
            valid.add(digits)
    return sorted(valid)
def get_num_consecutive_numbers(
    text: str, reverse: bool = False, limit: int = 10
) -> int:
    """Count the run of numeric characters at one end of ``text``.

    Args:
        text: The string to inspect.
        reverse: When true, count the run at the end of ``text`` (scanning
            backwards from the last character) instead of at the start.
        limit: Maximum number of characters to examine; the result never
            exceeds it. Values below 1 yield 0.

    Returns:
        The number of consecutive characters for which ``str.isnumeric()``
        is true, counted from the scanned end and stopping at the first
        non-numeric character, the end of the string, or ``limit``.
    """
    if limit < 1:
        return 0
    # Only the `limit` characters nearest the scanned end can matter, so
    # slice before reversing; a long string is never copied in full.
    window = text[-limit:][::-1] if reverse else text[:limit]
    for count, ch in enumerate(window):
        if not ch.isnumeric():
            return count
    return len(window)

Finds the number of consecutive numeric characters at the start of a string, or at the end of the string when `reverse` is set.

def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers by scanning for ``separator``.

    Every occurrence of ``separator`` (typically '-' or '.') is examined
    together with the runs of numeric characters around it. These layouts
    are recognised:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator string to look for. An empty separator
            yields no results.

    Returns:
        The matched numbers in order of appearance, each reduced to its
        10 numeric characters. Duplicates are kept.
    """
    numbers: list[str] = []
    sep_len = len(separator)
    if sep_len == 0:
        return numbers
    text_len = len(text)
    # No layout uses a run longer than 4, so counting one past that is
    # enough to tell "exactly n" apart from "more than n".
    max_run = 5

    def digit_run(index: int, step: int) -> int:
        """Count numeric characters from ``index``, walking by ``step``."""
        run = 0
        while run < max_run and 0 <= index < text_len and text[index].isnumeric():
            run += 1
            index += step
        return run

    sepdex = text.find(separator)
    while sepdex != -1:
        after_sep = sepdex + sep_len
        next_sepdex = text.find(separator, after_sep)
        # Runs are measured by index on the full text, so digits preceding
        # the separator are never cut off by an earlier scan position.
        before = digit_run(sepdex - 1, -1)
        middle = digit_run(after_sep, 1)
        # Without a second separator there is no trailing group to measure.
        trailing = digit_run(next_sepdex + sep_len, 1) if next_sepdex != -1 else 0

        span = None
        if middle == 3 and trailing == 4:
            if before == 3:
                # xxx{separator}xxx{separator}xxxx
                span = (sepdex - 3, next_sepdex + sep_len + 4)
            elif (
                before == 0
                and sepdex >= 5  # keep the look-behind inside the text
                and text[sepdex - 1] == ")"
                and text[sepdex - 5] == "("
            ):
                # (xxx){separator}xxx{separator}xxxx
                span = (sepdex - 5, after_sep + 3 + sep_len + 4)
        elif (
            before == 3
            and middle == 4
            and sepdex >= 4  # a negative index would wrap to the text's end
            and text[sepdex - 4] in (")", " ")
        ):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            span = (max(0, sepdex - 8), after_sep + 4)

        # By default move on to the very next separator so that a stray
        # separator cannot hide the first separator of a real number.
        resume = after_sep
        if span is not None:
            candidate = text[span[0] : span[1]]
            for ch in (separator, "(", ")", " "):
                candidate = candidate.replace(ch, "")
            if len(candidate) == 10 and candidate.isnumeric():
                numbers.append(candidate)
                # Skip the rest of the accepted number so its second
                # separator is not evaluated again.
                resume = span[1]
        sepdex = text.find(separator, resume)
    return numbers

Attempts to detect phone numbers matching the patterns below by scanning for a separator (typically '-' or '.') and counting how many consecutive digits precede and follow it:

(xxx)xxx{separator}xxxx

(xxx) xxx{separator}xxxx

(xxx){separator}xxx{separator}xxxx

xxx{separator}xxx{separator}xxxx

def find_by_href(text: str) -> list[str]:
    """Scrape phone numbers from double-quoted ``href`` attributes.

    Only attribute values whose scheme is ``tel:`` or ``callto:`` (matched
    case-insensitively at the start of the value) are considered. The
    numeric characters after the scheme are kept; an 11-character result
    starting with the country code ``1`` is reduced to the 10-digit number.

    Args:
        text: The markup to scan.

    Returns:
        The 10-digit numbers found, in order of appearance. Duplicates are
        kept.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers: list[str] = []
    start = text.find(indicator)
    while start != -1:
        value_start = start + len(indicator)
        value_end = text.find('"', value_start)
        if value_end == -1:
            # Unterminated attribute: no quote follows, so no further
            # `href="` can follow either.
            break
        href = text[value_start:value_end].strip()
        # Require the scheme at the start so that URLs merely containing
        # "tel:" (e.g. ".../hotel:123") are not mistaken for phone links.
        if href.lower().startswith(prefixes):
            digits = "".join(
                ch for ch in href[href.find(":") + 1 :] if ch.isnumeric()
            )
            if len(digits) == 11 and digits[0] == "1":
                digits = digits[1:]
            if len(digits) == 10:
                numbers.append(digits)
        start = text.find(indicator, value_end + 1)
    return numbers

Scrapes phone numbers by href attribute.

def scrape_phone_numbers_noregex(text: str) -> list[str]:
    """Collect U.S. phone numbers from ``text`` without regular expressions.

    Every "+1" is removed from the text, then candidates are gathered with
    ``find_by_separator`` (for '-' and '.') and ``find_by_href``. Only
    candidates that ``phonenumbers`` accepts as valid with a "+1" prefix are
    kept.

    Returns:
        The distinct valid numbers, sorted.
    """
    stripped = text.replace("+1", "")
    candidates = [
        *find_by_separator(stripped, "-"),
        *find_by_separator(stripped, "."),
        *find_by_href(stripped),
    ]
    valid = {
        candidate
        for candidate in candidates
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + candidate))
    }
    return sorted(valid)

Scrape for U.S. phone numbers.

def scrape_phone_numbers(text: str) -> list[str]:
    """Collect phone numbers from ``text`` with a regular expression.

    Matches are reduced to their digits and kept only when ``phonenumbers``
    accepts them as valid with a "+1" prefix.

    Returns:
        The distinct valid numbers, sorted.
    """
    # What the pattern requires, left to right:
    # - not preceded by an alphanumeric character and not followed by a
    #   digit, which avoids digit strings inside long urls, floats, etc.
    # - an optional '(', one digit 1-9, two digits 0-9, an optional ')'
    # - an optional ' ', '.', or '-'
    # - one digit 1-9, two digits 0-9
    # - an optional ' ', '.', or '-'
    # - four digits 0-9
    pattern = r"(?<![0-9a-zA-Z])([(]?[1-9]{1}[0-9]{2}[)]?[ .-]?[1-9]{1}[0-9]{2}[ .-]?[0-9]{4})(?![0-9])"
    valid = set()
    for match in re.findall(pattern, text):
        digits = re.sub(r"[^0-9]", "", match)
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + digits)):
            valid.add(digits)
    return sorted(valid)

Scrape phone numbers from text using regex.