scrapetools.phone_scraper
import phonenumbers


def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
    """Count the run of numeric characters at one end of ``text``.

    At most 10 characters are inspected, so the result is always in ``0..10``.

    Args:
        text: The string to inspect.
        reverse: When ``False`` the run is counted from the start of ``text``;
            when ``True`` it is counted backwards from the end.

    Returns:
        The number of consecutive characters for which ``str.isnumeric()`` is
        true, starting at the chosen end.
    """
    max_scan = 10
    # Take the window from the end that is actually scanned. Slicing the head
    # first and reversing afterwards would inspect the wrong characters.
    window = text[-max_scan:][::-1] if reverse else text[:max_scan]
    for i, ch in enumerate(window):
        if not ch.isnumeric():
            return i
    return len(window)


def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers around occurrences of ``separator``.

    Every occurrence of ``separator`` (typically ``'-'`` or ``'.'``) is
    examined, and the digit runs around it are compared against these layouts:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator to look for. The index arithmetic assumes a
            single, non-numeric character.

    Returns:
        The matched numbers in order of appearance, with separators,
        parentheses and spaces removed. Duplicates are kept.
    """
    numbers = []
    if not separator:
        return numbers
    window = 10  # matches the scan limit of get_num_consecutive_numbers
    search_from = 0
    while True:
        sepdex = text.find(separator, search_from)
        if sepdex == -1:
            break
        # Default: resume right after this separator so none is skipped.
        search_from = sepdex + 1
        next_sepdex = text.find(separator, sepdex + 1)
        # Digits directly before the separator. The look-behind is taken from
        # the text itself, not from the previous resume point, so an earlier
        # unrelated separator cannot hide the area code.
        before = get_num_consecutive_numbers(
            text[max(0, sepdex - window) : sepdex], reverse=True
        )
        if next_sepdex == -1:
            middle = after = 0
        else:
            # Bounded slices avoid copying the rest of the text per separator.
            middle = get_num_consecutive_numbers(
                text[sepdex + 1 : sepdex + 1 + window]
            )
            after = get_num_consecutive_numbers(
                text[next_sepdex + 1 : next_sepdex + 1 + window]
            )
        two_separator_tail = middle == 3 and after == 4

        if before == 3 and two_separator_tail:
            # xxx{separator}xxx{separator}xxxx
            start, stop = sepdex - 3, next_sepdex + 5
        elif (
            before == 0
            and two_separator_tail
            and sepdex >= 5  # keeps the look-behind indices non-negative
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            start, stop = sepdex - 5, sepdex + 9
        elif before == 3 and sepdex >= 8 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            start, stop = sepdex - 8, sepdex + 5
        else:
            continue

        number = text[start:stop]
        for ch in (separator, "(", ")", " "):
            number = number.replace(ch, "")
        if len(number) == 10 and all(ch.isnumeric() for ch in number):
            numbers.append(number)
            # Skip the separators that belong to the number just consumed.
            search_from = stop
    return numbers


def find_by_href(text: str) -> list[str]:
    """Collect phone numbers from ``href="tel:..."`` / ``href="callto:..."``.

    Only double-quoted ``href`` attributes are recognised. The scheme is
    matched case-insensitively at the start of the attribute value. An
    11-digit value starting with ``1`` is reduced to its last 10 digits.

    Args:
        text: Markup to scan.

    Returns:
        The 10-digit numbers found, in order of appearance. Duplicates are
        kept.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers = []
    cursor = text.find(indicator)
    while cursor != -1:
        value_start = cursor + len(indicator)
        value_end = text.find('"', value_start)
        if value_end == -1:
            # Unterminated attribute: there is no reliable end to the value.
            break
        href = text[value_start:value_end].strip()
        # Anchor the scheme at the start so e.g. "http://hotel:8080/..." is
        # not mistaken for a "tel:" link.
        if href.lower().startswith(prefixes):
            digits = "".join(
                ch for ch in href[href.find(":") + 1 :] if ch.isnumeric()
            )
            if len(digits) == 11 and digits[0] == "1":
                digits = digits[1:]  # drop the country code
            if len(digits) == 10:
                numbers.append(digits)
        cursor = text.find(indicator, value_start)
    return numbers


def scrape_phone_numbers(text: str) -> list[str]:
    """Scrape ``text`` for U.S. phone numbers.

    Every ``"+1"`` substring is removed from ``text``. Candidates are then
    gathered with ``find_by_separator`` (for ``'-'`` and ``'.'``) and
    ``find_by_href``. A candidate is kept only if ``phonenumbers`` reports
    ``"+1" + candidate`` as a valid number.

    Args:
        text: The text or markup to scan.

    Returns:
        A sorted list of unique 10-digit number strings.
    """
    text = text.replace("+1", "")
    candidates = []
    for separator in "-.":
        candidates.extend(find_by_separator(text, separator))
    candidates.extend(find_by_href(text))

    valid = set()
    for number in set(candidates):
        try:
            parsed = phonenumbers.parse("+1" + number)
        except phonenumbers.NumberParseException:
            # One unparseable candidate must not abort the whole scrape.
            continue
        if phonenumbers.is_valid_number(parsed):
            valid.add(number)
    return sorted(valid)
def
get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
def get_num_consecutive_numbers(text: str, reverse: bool = False) -> int:
    """Count the run of numeric characters at one end of ``text``.

    At most 10 characters are inspected, so the result is always in ``0..10``.

    Args:
        text: The string to inspect.
        reverse: When ``False`` the run is counted from the start of ``text``;
            when ``True`` it is counted backwards from the end.

    Returns:
        The number of consecutive characters for which ``str.isnumeric()`` is
        true, starting at the chosen end.
    """
    max_scan = 10
    # Take the window from the end that is actually scanned. Slicing the head
    # first and reversing afterwards would inspect the wrong characters.
    window = text[-max_scan:][::-1] if reverse else text[:max_scan]
    for i, ch in enumerate(window):
        if not ch.isnumeric():
            return i
    return len(window)
Finds the number of consecutive numeric characters in a string.
def
find_by_separator(text: str, separator: str) -> list[str]:
def _digit_run(text: str, start: int, step: int, limit: int = 10) -> int:
    """Count numeric characters from ``start`` moving by ``step`` (+1 or -1).

    The scan is done in place (no slicing) and stops at the first non-numeric
    character, at either end of ``text``, or after ``limit`` characters.
    """
    count = 0
    pos = start
    while count < limit and 0 <= pos < len(text) and text[pos].isnumeric():
        count += 1
        pos += step
    return count


def find_by_separator(text: str, separator: str) -> list[str]:
    """Detect 10-digit phone numbers around occurrences of ``separator``.

    Every occurrence of ``separator`` (typically ``'-'`` or ``'.'``) is
    examined, and the digit runs around it are compared against these layouts:

    (xxx)xxx{separator}xxxx

    (xxx) xxx{separator}xxxx

    (xxx){separator}xxx{separator}xxxx

    xxx{separator}xxx{separator}xxxx

    Args:
        text: The text to scan.
        separator: The separator to look for. The index arithmetic assumes a
            single, non-numeric character.

    Returns:
        The matched numbers in order of appearance, with separators,
        parentheses and spaces removed. Duplicates are kept.
    """
    numbers = []
    if not separator:
        return numbers
    search_from = 0
    while True:
        sepdex = text.find(separator, search_from)
        if sepdex == -1:
            break
        # Default: resume right after this separator so none is skipped.
        search_from = sepdex + 1
        next_sepdex = text.find(separator, sepdex + 1)
        # Digits directly before the separator. The look-behind is taken from
        # the text itself, not from the previous resume point, so an earlier
        # unrelated separator cannot hide the area code.
        before = _digit_run(text, sepdex - 1, -1)
        if next_sepdex == -1:
            middle = after = 0
        else:
            # Scanning in place avoids copying the rest of the text for
            # every separator.
            middle = _digit_run(text, sepdex + 1, 1)
            after = _digit_run(text, next_sepdex + 1, 1)
        two_separator_tail = middle == 3 and after == 4

        if before == 3 and two_separator_tail:
            # xxx{separator}xxx{separator}xxxx
            start, stop = sepdex - 3, next_sepdex + 5
        elif (
            before == 0
            and two_separator_tail
            and sepdex >= 5  # keeps the look-behind indices non-negative
            and text[sepdex - 1] == ")"
            and text[sepdex - 5] == "("
        ):
            # (xxx){separator}xxx{separator}xxxx
            start, stop = sepdex - 5, sepdex + 9
        elif before == 3 and sepdex >= 8 and text[sepdex - 4] in (")", " "):
            # (xxx)xxx{separator}xxxx or (xxx) xxx{separator}xxxx
            start, stop = sepdex - 8, sepdex + 5
        else:
            continue

        number = text[start:stop]
        for ch in (separator, "(", ")", " "):
            number = number.replace(ch, "")
        if len(number) == 10 and all(ch.isnumeric() for ch in number):
            numbers.append(number)
            # Skip the separators that belong to the number just consumed.
            search_from = stop
    return numbers
Attempts to detect phone numbers matching the patterns below by scanning for separators (typically '-' or '.') and counting how many consecutive digits precede or follow them:
(xxx)xxx{separator}xxxx
(xxx) xxx{separator}xxxx
(xxx){separator}xxx{separator}xxxx
xxx{separator}xxx{separator}xxxx
def
find_by_href(text: str) -> list[str]:
def find_by_href(text: str) -> list[str]:
    """Collect phone numbers from ``href="tel:..."`` / ``href="callto:..."``.

    Only double-quoted ``href`` attributes are recognised. The scheme is
    matched case-insensitively at the start of the attribute value. An
    11-digit value starting with ``1`` is reduced to its last 10 digits.

    Args:
        text: Markup to scan.

    Returns:
        The 10-digit numbers found, in order of appearance. Duplicates are
        kept.
    """
    indicator = 'href="'
    prefixes = ("tel:", "callto:")
    numbers = []
    # Start at offset 0 so an attribute at the very beginning is not missed.
    cursor = text.find(indicator)
    while cursor != -1:
        value_start = cursor + len(indicator)
        value_end = text.find('"', value_start)
        if value_end == -1:
            # Unterminated attribute: there is no reliable end to the value.
            break
        href = text[value_start:value_end].strip()
        # Anchor the scheme at the start so e.g. "http://hotel:8080/..." is
        # not mistaken for a "tel:" link.
        if href.lower().startswith(prefixes):
            digits = "".join(
                ch for ch in href[href.find(":") + 1 :] if ch.isnumeric()
            )
            if len(digits) == 11 and digits[0] == "1":
                digits = digits[1:]  # drop the country code
            if len(digits) == 10:
                numbers.append(digits)
        cursor = text.find(indicator, value_start)
    return numbers
Scrapes phone numbers by href attribute.
def
scrape_phone_numbers(text: str) -> list[str]:
def scrape_phone_numbers(text: str) -> list[str]:
    """Scrape ``text`` for U.S. phone numbers.

    Every ``"+1"`` substring is removed from ``text``. Candidates are then
    gathered with ``find_by_separator`` (for ``'-'`` and ``'.'``) and
    ``find_by_href``. A candidate is kept only if ``phonenumbers`` reports
    ``"+1" + candidate`` as a valid number.

    Args:
        text: The text or markup to scan.

    Returns:
        A sorted list of unique 10-digit number strings.
    """
    text = text.replace("+1", "")
    candidates = []
    for separator in "-.":
        candidates.extend(find_by_separator(text, separator))
    candidates.extend(find_by_href(text))

    valid = set()
    # Deduplicate first so each distinct candidate is validated only once.
    for number in set(candidates):
        try:
            parsed = phonenumbers.parse("+1" + number)
        except phonenumbers.NumberParseException:
            # One unparseable candidate must not abort the whole scrape.
            continue
        if phonenumbers.is_valid_number(parsed):
            valid.add(number)
    return sorted(valid)
Scrape for U.S. phone numbers.