scrapetools.phone_scraper
import re

import phonenumbers


def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
    """Return the length of the run of numeric characters at one end of ``text``.

    Args:
        text: The string to inspect.
        reverse: When ``False`` the run is counted from the start of ``text``;
            when ``True`` it is counted backwards from the end.

    Returns:
        The number of consecutive characters for which ``str.isnumeric()`` is
        true before the first non-numeric character is met, or ``len(text)``
        when every character is numeric.  No upper bound is applied.
    """
    # reversed() walks the string lazily, so no reversed copy is built.
    characters = reversed(text) if reverse else text
    for count, ch in enumerate(characters):
        if not ch.isnumeric():
            return count
    return len(text)


def _count_digits(text: str, start: int, step: int, limit: int) -> int:
    """Count consecutive numeric characters from ``start``, moving by ``step``.

    Counting stops at the first non-numeric character, at either end of
    ``text``, or once ``limit`` characters have been counted.  The scan works
    on indices, so no substring is copied.
    """
    count = 0
    index = start
    while count < limit and 0 <= index < len(text) and text[index].isnumeric():
        count += 1
        index += step
    return count


def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect phone numbers by scanning for a separator character.

    Every occurrence of ``separator`` is examined, together with the number of
    consecutive numeric characters around it, for one of these layouts:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator to look for (typically ``"-"`` or ``"."``).
            The offsets used below assume a single character; an empty
            separator yields no results.

    Returns:
        The detected numbers as 10-character strings of numeric characters,
        in order of appearance.  Duplicates are kept.
    """
    numbers = []
    if not separator:
        return numbers
    search_from = 0
    while True:
        sepdex = text.find(separator, search_from)
        if sepdex == -1:
            break
        # Unless a number is accepted below, resume at the very next
        # character so that no separator is ever skipped.
        search_from = sepdex + 1
        next_sepdex = text.find(separator, sepdex + 1)
        # Digits immediately before the separator.  Four characters are enough
        # to tell "exactly three" apart from "more than three".
        start_offset = _count_digits(text, sepdex - 1, -1, 4)
        if next_sepdex == -1:
            # Without a second separator only the single-separator layouts
            # can match, and they do not use these two counts.
            first_stop_offset = second_stop_offset = 0
        else:
            # Digits right after this separator and right after the next one.
            first_stop_offset = _count_digits(text, sepdex + 1, 1, 4)
            second_stop_offset = _count_digits(text, next_sepdex + 1, 1, 5)

        number = ""
        stopdex = sepdex + 1
        if start_offset == 3 and first_stop_offset == 3 and second_stop_offset == 4:
            # xxx{separator}xxx{separator}xxxx
            stopdex = next_sepdex + 5
            number = text[sepdex - 3 : stopdex]
        elif (
            start_offset == 0
            and first_stop_offset == 3
            and second_stop_offset == 4
            # Guard against negative indices wrapping to the end of the text.
            and sepdex >= 5
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            stopdex = sepdex + 9
            number = text[sepdex - 5 : stopdex]
        elif start_offset == 3 and sepdex >= 8 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            stopdex = sepdex + 5
            number = text[sepdex - 8 : stopdex]

        for ch in (separator, "(", ")", " "):
            number = number.replace(ch, "")
        if len(number) == 10 and all(ch.isnumeric() for ch in number):
            numbers.append(number)
            # Continue after the accepted number.
            search_from = stopdex
    return numbers


def find_by_href(text: str) -> list[str]:
    """Scrape phone numbers from ``href="tel:..."`` / ``href="callto:..."`` attributes.

    Only double-quoted ``href`` values are examined.  The numeric characters
    after the scheme are collected; a value is kept when it yields 10 of them,
    or 11 with a leading ``1`` (which is dropped).

    Args:
        text: The markup to scan.

    Returns:
        The detected 10-character numbers in order of appearance.  Duplicates
        are kept.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers = []
    index = text.find(indicator)
    while index != -1:
        value_start = index + len(indicator)
        value_end = text.find('"', value_start)
        if value_end == -1:
            # Unterminated attribute: there is nothing reliable left to read.
            break
        target = text[value_start:value_end].strip()
        # The scheme has to open the value; a "tel:" buried inside some other
        # URL is not a phone link.
        if target.lower().startswith(prefixes):
            digits = "".join(
                ch for ch in target[target.find(":") + 1 :] if ch.isnumeric()
            )
            if len(digits) == 11 and digits[0] == "1":
                # Drop the leading country code, e.g. "tel:+15551234567".
                digits = digits[1:]
            if len(digits) == 10:
                numbers.append(digits)
        index = text.find(indicator, value_end + 1)
    return numbers


def scrape_phone_numbers_noregex(text: str) -> list[str]:
    """Scrape for u.s. phone numbers without using regular expressions.

    Every ``"+1"`` is removed from ``text``, candidates are collected with
    ``find_by_separator`` (for ``"-"`` and ``"."``) and ``find_by_href``, and
    only candidates that ``phonenumbers`` reports as valid with a ``+1``
    prefix are kept.

    Args:
        text: The text or markup to scan.

    Returns:
        A sorted list of unique 10-character phone numbers.
    """
    text = text.replace("+1", "")
    candidates = []
    for separator in "-.":
        candidates.extend(find_by_separator(text, separator))
    candidates.extend(find_by_href(text))

    valid_numbers = set()
    # Deduplicate first so each distinct candidate is parsed only once.
    for candidate in set(candidates):
        try:
            parsed = phonenumbers.parse("+1" + candidate)
        except phonenumbers.NumberParseException:
            # Candidates only passed str.isnumeric(), so they can contain
            # characters phonenumbers refuses to parse; skip those.
            continue
        if phonenumbers.is_valid_number(parsed):
            valid_numbers.add(candidate)
    return sorted(valid_numbers)


def scrape_phone_numbers(text: str) -> list[str]:
    """Scrape phone numbers from text using regex.

    Args:
        text: The text to scan.

    Returns:
        A sorted list of unique 10-digit strings that match the pattern below
        and that ``phonenumbers`` reports as valid with a ``+1`` prefix.
    """
    # Pattern, piece by piece:
    #   not preceded by an alphanumeric character and not followed by a digit
    #   (avoids digit strings inside long urls, floats, etc.);
    #   an optional '(' and a digit 1-9, then two digits 0-9, an optional ')';
    #   an optional ' ', '.' or '-';
    #   a digit 1-9 and two digits 0-9;
    #   an optional ' ', '.' or '-';
    #   four digits 0-9.
    pattern = r"(?<![0-9a-zA-Z])([(]?[1-9]{1}[0-9]{2}[)]?[ .-]?[1-9]{1}[0-9]{2}[ .-]?[0-9]{4})(?![0-9])"
    valid_numbers = set()
    for match in re.findall(pattern, text):
        digits = re.sub(r"[^0-9]", "", match)
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + digits)):
            valid_numbers.add(digits)
    return sorted(valid_numbers)
def
get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
    """Return the length of the run of numeric characters at one end of ``text``.

    Args:
        text: The string to inspect.
        reverse: When ``False`` the run is counted from the start of ``text``;
            when ``True`` it is counted backwards from the end.

    Returns:
        The number of consecutive characters for which ``str.isnumeric()`` is
        true before the first non-numeric character is met, or ``len(text)``
        when every character is numeric.  No upper bound is applied.
    """
    # reversed() walks the string lazily, so no reversed copy is built.
    characters = reversed(text) if reverse else text
    for count, ch in enumerate(characters):
        if not ch.isnumeric():
            return count
    return len(text)
Finds the number of consecutive numeric characters in a string.
def
find_by_separator(text: str, separator: str) -> list[str]:
def _count_digits(text: str, start: int, step: int, limit: int) -> int:
    """Count consecutive numeric characters from ``start``, moving by ``step``.

    Counting stops at the first non-numeric character, at either end of
    ``text``, or once ``limit`` characters have been counted.  The scan works
    on indices, so no substring is copied.
    """
    count = 0
    index = start
    while count < limit and 0 <= index < len(text) and text[index].isnumeric():
        count += 1
        index += step
    return count


def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect phone numbers by scanning for a separator character.

    Every occurrence of ``separator`` is examined, together with the number of
    consecutive numeric characters around it, for one of these layouts:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator to look for (typically ``"-"`` or ``"."``).
            The offsets used below assume a single character; an empty
            separator yields no results.

    Returns:
        The detected numbers as 10-character strings of numeric characters,
        in order of appearance.  Duplicates are kept.
    """
    numbers = []
    if not separator:
        return numbers
    search_from = 0
    while True:
        sepdex = text.find(separator, search_from)
        if sepdex == -1:
            break
        # Unless a number is accepted below, resume at the very next
        # character so that no separator is ever skipped.
        search_from = sepdex + 1
        next_sepdex = text.find(separator, sepdex + 1)
        # Digits immediately before the separator.  Four characters are enough
        # to tell "exactly three" apart from "more than three".
        start_offset = _count_digits(text, sepdex - 1, -1, 4)
        if next_sepdex == -1:
            # Without a second separator only the single-separator layouts
            # can match, and they do not use these two counts.
            first_stop_offset = second_stop_offset = 0
        else:
            # Digits right after this separator and right after the next one.
            first_stop_offset = _count_digits(text, sepdex + 1, 1, 4)
            second_stop_offset = _count_digits(text, next_sepdex + 1, 1, 5)

        number = ""
        stopdex = sepdex + 1
        if start_offset == 3 and first_stop_offset == 3 and second_stop_offset == 4:
            # xxx{separator}xxx{separator}xxxx
            stopdex = next_sepdex + 5
            number = text[sepdex - 3 : stopdex]
        elif (
            start_offset == 0
            and first_stop_offset == 3
            and second_stop_offset == 4
            # Guard against negative indices wrapping to the end of the text.
            and sepdex >= 5
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            stopdex = sepdex + 9
            number = text[sepdex - 5 : stopdex]
        elif start_offset == 3 and sepdex >= 8 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            stopdex = sepdex + 5
            number = text[sepdex - 8 : stopdex]

        for ch in (separator, "(", ")", " "):
            number = number.replace(ch, "")
        if len(number) == 10 and all(ch.isnumeric() for ch in number):
            numbers.append(number)
            # Continue after the accepted number.
            search_from = stopdex
    return numbers
Attempts to detect phone numbers according to these patterns by scanning for separators (typically '-.') and how many consecutive numbers follow or precede them:
(xxx)xxx{separator}xxxx
(xxx) xxx{separator}xxxx
(xxx){separator}xxx{separator}xxxx
xxx{separator}xxx{separator}xxxx
def
find_by_href(text: str) -> list[str]:
def find_by_href(text: str) -> list[str]:
    """Scrape phone numbers from ``href="tel:..."`` / ``href="callto:..."`` attributes.

    Only double-quoted ``href`` values are examined.  The numeric characters
    after the scheme are collected; a value is kept when it yields 10 of them,
    or 11 with a leading ``1`` (which is dropped).

    Args:
        text: The markup to scan.

    Returns:
        The detected 10-character numbers in order of appearance.  Duplicates
        are kept.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers = []
    # Start at offset 0 so an attribute at the very beginning is not missed.
    index = text.find(indicator)
    while index != -1:
        value_start = index + len(indicator)
        value_end = text.find('"', value_start)
        if value_end == -1:
            # Unterminated attribute: there is nothing reliable left to read.
            break
        target = text[value_start:value_end].strip()
        # The scheme has to open the value; a "tel:" buried inside some other
        # URL (e.g. ".../hotel:123") is not a phone link.
        if target.lower().startswith(prefixes):
            digits = "".join(
                ch for ch in target[target.find(":") + 1 :] if ch.isnumeric()
            )
            if len(digits) == 11 and digits[0] == "1":
                # Drop the leading country code, e.g. "tel:+15551234567".
                digits = digits[1:]
            if len(digits) == 10:
                numbers.append(digits)
        index = text.find(indicator, value_end + 1)
    return numbers
Scrapes phone numbers by href attribute.
def
scrape_phone_numbers_noregex(text: str) -> list[str]:
def scrape_phone_numbers_noregex(text: str) -> list[str]:
    """Scrape for u.s. phone numbers without using regular expressions.

    Every ``"+1"`` is removed from ``text``, candidates are collected with
    ``find_by_separator`` (for ``"-"`` and ``"."``) and ``find_by_href``, and
    only candidates that ``phonenumbers`` reports as valid with a ``+1``
    prefix are kept.

    Args:
        text: The text or markup to scan.

    Returns:
        A sorted list of unique 10-character phone numbers.
    """
    text = text.replace("+1", "")
    candidates = []
    for separator in "-.":
        candidates.extend(find_by_separator(text, separator))
    candidates.extend(find_by_href(text))

    valid_numbers = set()
    # Deduplicate first so each distinct candidate is parsed only once.
    for candidate in set(candidates):
        try:
            parsed = phonenumbers.parse("+1" + candidate)
        except phonenumbers.NumberParseException:
            # Candidates only passed str.isnumeric(), so they can contain
            # characters phonenumbers refuses to parse; skip those instead of
            # aborting the whole scrape.
            continue
        if phonenumbers.is_valid_number(parsed):
            valid_numbers.add(candidate)
    return sorted(valid_numbers)
Scrape for u.s. phone numbers.
def
scrape_phone_numbers(text: str) -> list[str]:
def scrape_phone_numbers(text: str) -> list[str]:
    """Scrape phone numbers from text using regex.

    Args:
        text: The text to scan.

    Returns:
        A sorted list of unique 10-digit strings that match the pattern below
        and that ``phonenumbers`` reports as valid with a ``+1`` prefix.
    """
    # Pattern, piece by piece:
    #   not preceded by an alphanumeric character and not followed by a digit
    #   (avoids digit strings inside long urls, floats, etc.);
    #   an optional '(' and a digit 1-9, then two digits 0-9, an optional ')';
    #   an optional ' ', '.' or '-';
    #   a digit 1-9 and two digits 0-9;
    #   an optional ' ', '.' or '-';
    #   four digits 0-9.
    pattern = r"(?<![0-9a-zA-Z])([(]?[1-9]{1}[0-9]{2}[)]?[ .-]?[1-9]{1}[0-9]{2}[ .-]?[0-9]{4})(?![0-9])"
    valid_numbers = set()
    for match in re.findall(pattern, text):
        # Reduce the match to its bare digits before validating it.
        digits = re.sub(r"[^0-9]", "", match)
        if phonenumbers.is_valid_number(phonenumbers.parse("+1" + digits)):
            valid_numbers.add(digits)
    return sorted(valid_numbers)
Scrape phone numbers from text using regex.