scrapetools.link_scraper

from urllib.parse import urlparse, urlunparse

from bs4 import BeautifulSoup


class LinkScraper:
    def __init__(self, html_src: str, page_url: str):
        self.soup = BeautifulSoup(html_src, features="html.parser")
        self.parsed_url = urlparse(page_url)
        self.page_links: list[str] = []
        self.img_links: list[str] = []
        self.script_links: list[str] = []

    def format_relative_links(self, links: list[str]) -> list[str]:
        """Parses a list of links and constructs a full url
        according to self.parsed_url for any link that is missing
        a scheme or netloc.

        Full urls are returned unedited other than stripping any
        leading or trailing forward slashes. Links containing an
        '@' or a space are discarded."""
        formatted_links = []
        for link in links:
            # Remove surrounding whitespace and stray quote/escape characters.
            link = link.strip().replace('"', "").replace("\\", "").replace("'", "")
            # Skip mailto-style links and links containing spaces.
            if any(ch in link for ch in "@ "):
                continue
            parsed = list(urlparse(link))
            # Fill in a missing scheme and/or netloc from the page's url.
            if parsed[0] == "":
                parsed[0] = self.parsed_url.scheme
            if parsed[1] == "":
                parsed[1] = self.parsed_url.netloc
            formatted_links.append(urlunparse(parsed).strip("/"))
        return formatted_links

    def remove_duplicates(self, obj: list) -> list:
        """Removes duplicate members."""
        return list(set(obj))

    def process_links(self, links: list[str]) -> list[str]:
        """Formats relative links, removes duplicates, and sorts in alphabetical order."""
        return sorted(self.remove_duplicates(self.format_relative_links(links)))

    def find_all(self, tag_name: str, attribute_name: str) -> list[str]:
        """Finds the attribute_name value of every tag_name tag in the page.

        Filters out urls that contain a '#' fragment."""
        return [
            tag.get(attribute_name)
            for tag in self.soup(tag_name, recursive=True)
            if tag.get(attribute_name) is not None
            and "#" not in tag.get(attribute_name)
        ]

    def filter_same_site(self, links: list[str]) -> list[str]:
        """Filters out links whose netloc doesn't match
        self.parsed_url.netloc, ignoring a leading 'www.'."""

        def unwww(netloc: str) -> str:
            # removeprefix() drops only a literal leading "www.".
            # The previous strip("www.") stripped any run of the characters
            # 'w' and '.' and would mangle hosts such as "w3.org".
            return netloc.removeprefix("www.")

        return [
            link
            for link in links
            if unwww(urlparse(link).netloc) == unwww(self.parsed_url.netloc)
        ]

    def scrape_page_links(self):
        """Scrape links according to tags and attributes."""
        links = []
        for tag, attribute in [
            ("a", "href"),
            ("link", "href"),
            ("source", "src"),
            ("div", "src"),
            ("div", "data-src"),
            ("div", "data-url"),
            ("div", "href"),
        ]:
            links.extend(self.find_all(tag, attribute))
        self.page_links = self.process_links(links)

    def scrape_img_links(self):
        """Scrape links from the src and data-src attributes of <img> tags."""
        self.img_links = self.process_links(
            self.find_all("img", "src") + self.find_all("img", "data-src")
        )

    def scrape_script_links(self):
        """Scrape links from the src attribute of <script> tags."""
        self.script_links = self.process_links(self.find_all("script", "src"))

    def scrape_page(self):
        """Scrape all link types."""
        for scrape in [
            self.scrape_page_links,
            self.scrape_img_links,
            self.scrape_script_links,
        ]:
            scrape()
        self.merge_image_links_from_non_img_tags()

    def merge_image_links_from_non_img_tags(self):
        """Finds links in self.script_links and self.page_links
        that contain one of these image file extensions and adds
        them to self.img_links."""
        formats = [
            ".jpg",
            ".jpeg",
            ".png",
            ".svg",
            ".bmp",
            ".tiff",
            ".pdf",
            ".eps",
            ".gif",
            ".jfif",
            ".webp",
            ".heif",
            ".avif",
            ".bat",
            ".bpg",
        ]
        for link in self.script_links + self.page_links:
            # Substring check rather than endswith() so an extension followed
            # by a query string still matches.
            if any(ext in link for ext in formats):
                self.img_links.append(link)
        self.img_links = sorted(self.remove_duplicates(self.img_links))

    def get_links(
        self,
        link_type: str = "all",
        same_site_only: bool = False,
        excluded_links: list[str] | None = None,
    ) -> list[str]:
        """Returns a list of urls found on the page.

        :param link_type: Can be 'all', 'page', 'img', or 'script'.

        :param same_site_only: Excludes external urls if True.

        :param excluded_links: A list of urls to filter out of the results.
        Useful for excluding duplicates when recursively scraping a website.
        Can also be used with link_type='all' to get two link types in one call, e.g.

        links = scraper.get_links(link_type="all", excluded_links=scraper.script_links)

        will return page links and img links."""
        match link_type:
            case "all":
                links = self.remove_duplicates(
                    self.page_links + self.img_links + self.script_links
                )
            case "page":
                links = self.page_links
            case "img":
                links = self.img_links
            case "script":
                links = self.script_links
            case _:
                raise ValueError(
                    f"link_type must be 'all', 'page', 'img', or 'script', not {link_type!r}"
                )
        if same_site_only:
            links = self.filter_same_site(links)
        if excluded_links:
            links = [link for link in links if link not in excluded_links]
        return sorted(links)
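
A minimal usage sketch, not part of the module itself: the html here is fetched
with requests (any HTTP client works), and the url is hypothetical.

    import requests

    from scrapetools.link_scraper import LinkScraper

    url = "https://example.com"  # hypothetical page to scrape
    scraper = LinkScraper(requests.get(url).text, url)
    scraper.scrape_page()

    all_links = scraper.get_links()
    internal = scraper.get_links(link_type="page", same_site_only=True)
    # The trick from the get_links docstring: excluding script_links from
    # link_type="all" returns page links and img links in one call.
    page_and_img = scraper.get_links(excluded_links=scraper.script_links)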
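The get_links docstring suggests passing excluded_links when recursively
scraping a website. One way that might look, as a sketch only: it again assumes
requests, and it has no error handling, rate limiting, or robots.txt support.

    import requests

    from scrapetools.link_scraper import LinkScraper

    def crawl(start_url: str, max_pages: int = 50) -> set[str]:
        """Breadth-first crawl of same-site page links using LinkScraper."""
        seen: set[str] = set()
        # Strip trailing '/' to match the url normalization that
        # format_relative_links applies to scraped links.
        queue = [start_url.strip("/")]
        while queue and len(seen) < max_pages:
            url = queue.pop(0)
            if url in seen:
                continue
            seen.add(url)
            scraper = LinkScraper(requests.get(url).text, url)
            scraper.scrape_page()
            # excluded_links filters out urls already visited, per the
            # docstring's note on recursive scraping.
            queue.extend(
                scraper.get_links(
                    link_type="page",
                    same_site_only=True,
                    excluded_links=list(seen),
                )
            )
        return seen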