scrapetools.phone_scraper

  1import phonenumbers
  2
  3
  4def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
  5    """Finds the number of consecutive numeric characters in a string."""
  6    # limit search to 10 characters
  7    text[:10]
  8    if reverse:
  9        text = text[::-1]
 10    for i, ch in enumerate(text):
 11        if not ch.isnumeric():
 12            return i
 13    return len(text)
 14
 15
 16def find_by_separator(text: str, separator: str) -> list[str]:
 17    """Attempts to detect phone numbers according to these
 18    patterns by scanning for separators (typically '-.')
 19    and how many consecutive numbers follow or precede them:
 20
 21    (xxx)xxx{separator}xxxx
 22
 23    (xxx) xxx{separator}xxxx
 24
 25    (xxx){separator}xxx{separator}xxxx
 26
 27    xxx{separator}xxx{separator}xxxx"""
 28    count = text.count(separator)
 29    numbers = []
 30    if count > 0:
 31        last_stopdex = 0
 32        for _ in range(count):
 33            number = ""
 34            sepdex = text.find(separator, last_stopdex)
 35            if sepdex != -1:
 36                next_sepdex = text.find(separator, sepdex + 1)
 37                # consecutive numbers preceding sepdex
 38                start_offset = get_num_consecutive_numbers(
 39                    text[last_stopdex:sepdex], reverse=True
 40                )
 41                # consecutive numbers between sepdex and next_sepdex
 42                first_stop_offset = get_num_consecutive_numbers(
 43                    text[sepdex + 1 : next_sepdex + 1]
 44                )
 45                # consecutive numbers after next_sepdex
 46                second_stop_offset = get_num_consecutive_numbers(
 47                    text[next_sepdex + 1 :]
 48                )
 49
 50                if (
 51                    start_offset == 3
 52                    and first_stop_offset == 3
 53                    and second_stop_offset == 4
 54                ):
 55                    # xxx{separator}xxx{separator}xxxx
 56                    number = text[
 57                        sepdex - start_offset : next_sepdex + second_stop_offset + 1
 58                    ]
 59                elif (
 60                    start_offset == 0
 61                    and first_stop_offset == 3
 62                    and second_stop_offset == 4
 63                    and text[sepdex - 1] == ")"
 64                    and text[sepdex - 5] == "("
 65                ):
 66                    # (xxx){separator}xxx{separator}xxxx
 67                    number = text[
 68                        sepdex - 5 : sepdex + first_stop_offset + second_stop_offset + 2
 69                    ]
 70                elif start_offset == 3 and text[sepdex - 4] in [")", " "]:
 71                    # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
 72                    number = text[sepdex - 8 : sepdex + 5]
 73                last_stopdex = sepdex + 5
 74                for ch in [separator, "(", ")", " "]:
 75                    number = number.replace(ch, "")
 76                if len(number) == 10 and all(ch.isnumeric() for ch in number):
 77                    numbers.append(number)
 78    return numbers
 79
 80
 81def find_by_href(text: str) -> list[str]:
 82    """Scrapes phone numbers by href attribute."""
 83    indicator = 'href="'
 84    count = text.count(indicator)
 85    prefixes = ["tel:", "callto:"]
 86    index = 0
 87    numbers = []
 88    for _ in range(count):
 89        index = text.find(indicator, index + 1)
 90        number = text[index + len(indicator) : text.find('"', index + len(indicator))]
 91        if any(prefix in number for prefix in prefixes):
 92            number = "".join(
 93                [num for num in number[number.find(":") + 1 :] if num.isnumeric()]
 94            )
 95            if len(number) == 10:
 96                numbers.append(number)
 97    return numbers
 98
 99
def scrape_phone_numbers(text: str) -> list[str]:
    """Collect the distinct, valid U.S. phone numbers found in `text`.

    Every "+1" is removed from the text first. Candidates are then gathered
    with `find_by_separator` (once for '-' and once for '.') and with
    `find_by_href`, and each candidate is checked by prefixing it with "+1"
    and passing it through `phonenumbers.parse` and
    `phonenumbers.is_valid_number`.

    Args:
        text: The text or markup to scan.

    Returns:
        The unique candidates that passed validation, sorted in ascending
        string order.
    """
    stripped = text.replace("+1", "")
    candidates = []
    for separator in ("-", "."):
        candidates += find_by_separator(stripped, separator)
    candidates += find_by_href(stripped)
    valid = {
        candidate
        for candidate in candidates
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + candidate))
    }
    return sorted(valid)
def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
 5def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
 6    """Finds the number of consecutive numeric characters in a string."""
 7    # limit search to 10 characters
 8    text[:10]
 9    if reverse:
10        text = text[::-1]
11    for i, ch in enumerate(text):
12        if not ch.isnumeric():
13            return i
14    return len(text)

Finds the number of consecutive numeric characters in a string.

def find_by_separator(text: str, separator: str) -> list[str]:
def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers in `text` written with `separator`.

    Every occurrence of `separator` is examined, and the digit runs around
    it are compared against these layouts:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator to look for (typically '-' or '.').
            The offset arithmetic assumes a single, non-numeric character.

    Returns:
        The matched numbers, in order of appearance, each reduced to a
        string of exactly 10 numeric characters. Duplicates are kept.
    """
    # A run of more than 4 digits can never satisfy a layout, so there is no
    # need to hand the helper more than a small window around the separator.
    window = 10
    numbers = []
    # Index just past the most recently accepted number; digits before it
    # are never reused as the leading group of a later candidate.
    consumed = 0
    sepdex = text.find(separator)
    while sepdex != -1:
        number = ""
        number_end = sepdex + 1
        next_sepdex = text.find(separator, sepdex + 1)
        # consecutive numbers preceding sepdex
        start_offset = get_num_consecutive_numbers(
            text[max(consumed, sepdex - window) : sepdex], reverse=True
        )
        # consecutive numbers directly after sepdex
        first_stop_offset = get_num_consecutive_numbers(
            text[sepdex + 1 : sepdex + 1 + window]
        )
        # consecutive numbers after next_sepdex (none if there is no
        # further separator: the two-separator layouts cannot apply then)
        second_stop_offset = 0
        if next_sepdex != -1:
            second_stop_offset = get_num_consecutive_numbers(
                text[next_sepdex + 1 : next_sepdex + 1 + window]
            )

        if start_offset == 3 and first_stop_offset == 3 and second_stop_offset == 4:
            # xxx{separator}xxx{separator}xxxx
            number_end = next_sepdex + second_stop_offset + 1
            number = text[sepdex - start_offset : number_end]
        elif (
            start_offset == 0
            and first_stop_offset == 3
            and second_stop_offset == 4
            # The guard keeps the two lookups from wrapping to the end of text.
            and sepdex >= 5
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            number_end = sepdex + first_stop_offset + second_stop_offset + 2
            number = text[sepdex - 5 : number_end]
        elif start_offset == 3 and sepdex >= 8 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            number_end = sepdex + 5
            number = text[sepdex - 8 : number_end]

        for ch in (separator, "(", ")", " "):
            number = number.replace(ch, "")
        if len(number) == 10 and all(ch.isnumeric() for ch in number):
            numbers.append(number)
            # Skip the separators that belong to the number just accepted.
            consumed = number_end
            sepdex = text.find(separator, consumed)
        else:
            # Nothing matched here: move on to the very next separator
            # instead of jumping ahead and possibly skipping a real number.
            sepdex = text.find(separator, sepdex + 1)
    return numbers

Attempts to detect phone numbers according to these patterns by scanning for separators (typically '-.') and how many consecutive numbers follow or precede them:

(xxx)xxx{separator}xxxx

(xxx) xxx{separator}xxxx

(xxx){separator}xxx{separator}xxxx

xxx{separator}xxx{separator}xxxx

def find_by_href(text: str) -> list[str]:
def find_by_href(text: str) -> list[str]:
    """Scrape phone numbers from `href="..."` attribute values.

    Each double-quoted href value containing "tel:" or "callto:" is reduced
    to the numeric characters that follow its first colon; the result is
    kept only when exactly 10 characters remain.

    Args:
        text: The markup to scan.

    Returns:
        The 10-character digit strings found, in order of appearance.
        Duplicates are kept.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers = []
    # Start at index 0 so an attribute at the very beginning of text is seen.
    start = text.find(indicator)
    while start != -1:
        value_start = start + len(indicator)
        value_end = text.find('"', value_start)
        if value_end == -1:
            # Unterminated attribute: take the remainder of the text rather
            # than letting -1 silently slice off the final character.
            value_end = len(text)
        value = text[value_start:value_end]
        if any(prefix in value for prefix in prefixes):
            digits = "".join(
                ch for ch in value[value.find(":") + 1 :] if ch.isnumeric()
            )
            if len(digits) == 10:
                numbers.append(digits)
        start = text.find(indicator, start + 1)
    return numbers

Scrapes phone numbers by href attribute.

def scrape_phone_numbers(text: str) -> list[str]:
def scrape_phone_numbers(text: str) -> list[str]:
    """Collect the distinct, valid U.S. phone numbers found in `text`.

    Every "+1" is removed from the text first. Candidates are then gathered
    with `find_by_separator` (once for '-' and once for '.') and with
    `find_by_href`, and each candidate is checked by prefixing it with "+1"
    and passing it through `phonenumbers.parse` and
    `phonenumbers.is_valid_number`.

    Args:
        text: The text or markup to scan.

    Returns:
        The unique candidates that passed validation, sorted in ascending
        string order.
    """
    stripped = text.replace("+1", "")
    candidates = []
    for separator in ("-", "."):
        candidates += find_by_separator(stripped, separator)
    candidates += find_by_href(stripped)
    valid = {
        candidate
        for candidate in candidates
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + candidate))
    }
    return sorted(valid)

Scrape for U.S. phone numbers.