Day 40 - Building a Web Scraper with Link Crawling

We will combine the requests library, BeautifulSoup, and regular expressions to build a web crawler that walks every page of a single website, collecting internal links and email addresses as it goes.
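
Before the full crawler, it helps to see the core fetch-and-parse step on its own. This is a throwaway sketch, and http://example.com is just a placeholder URL:

import requests
from bs4 import BeautifulSoup

# Download one page and list every link it contains
response = requests.get("http://example.com", timeout=10)
soup = BeautifulSoup(response.text, "html.parser")
for a_tag in soup.find_all("a", href=True):
    print(a_tag["href"])

The crawler below repeats exactly this step in a loop, feeding each newly discovered link back into a queue of pages to visit.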

Starter Code

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse

# Matches a typical email address: local part, @, domain, and a top-level domain of two or more letters
strRegExEmail = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
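
To check what the pattern actually matches, here is a quick test with re.findall (the sample string is my own, not part of the lesson files):

strSample = "Write to sales@example.com or support@example.org for details."
print(re.findall(strRegExEmail, strSample))
# ['sales@example.com', 'support@example.org']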

Finished Code

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse

# Matches a typical email address: local part, @, domain, and a top-level domain of two or more letters
strRegExEmail = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

def crawl(strStartUrl):
    # Remember the starting domain so the crawl never leaves this site
    urlParsed = urlparse(strStartUrl)
    strDomain = urlParsed.netloc

    setVisited = set()         # pages already fetched
    setEmails = set()          # every email address found
    setAllLinks = set()        # every internal link discovered
    liToVisit = [strStartUrl]  # queue of pages still to crawl

    # Identify the crawler honestly; some servers reject the default requests User-Agent
    diHeaders = {
        "User-Agent": "Mozilla/5.0 (compatible; WebCrawler/1.0; +http://example.com/info)"
    }

    while liToVisit:
        # Take the next URL from the front of the queue; a collections.deque
        # would make this pop cheaper on large sites
        strCurrentURL = liToVisit.pop(0)

        # Never fetch the same page twice
        if strCurrentURL in setVisited:
            continue

        print(f"Crawling: {strCurrentURL}")
        setVisited.add(strCurrentURL)

        try:
            response = requests.get(strCurrentURL, headers=diHeaders, timeout=10)

            # Skip anything that did not load cleanly
            if response.status_code != 200:
                continue
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {strCurrentURL}: {e}")
            continue

        # Parse the HTML so we can walk its anchor tags
        soup = BeautifulSoup(response.text, "html.parser")

        for a_tag in soup.find_all("a", href=True):
            strHref = a_tag["href"]

            # mailto: links carry an address directly; strip any ?subject=... suffix
            if strHref.startswith("mailto:"):
                strEmail = strHref.split("mailto:")[1].split('?')[0]
                if strEmail:
                    setEmails.add(strEmail)
                continue

            # Resolve relative links against the current page, then drop any #fragment
            # so page.html and page.html#top are not treated as two different pages
            strFullURL = urljoin(strCurrentURL, strHref).split('#')[0]
            strParsedLink = urlparse(strFullURL)

            # Only follow http(s) links that stay on the starting domain
            if strParsedLink.scheme in ['http', 'https'] and strParsedLink.netloc == strDomain:
                setAllLinks.add(strFullURL)
                if strFullURL not in setVisited and strFullURL not in liToVisit:
                    liToVisit.append(strFullURL)

        # Also scan the page's visible text for addresses the regex can match
        strPageText = soup.get_text()
        setEmails.update(re.findall(strRegExEmail, strPageText))

    # The queue is empty; report everything we collected
    print("\nCrawler Report:")

    print("Emails:")
    for strEmail in setEmails:
        print(strEmail)

    print("\nLinks Found:")
    for strURL in setAllLinks:
        print(strURL)

    print("\nVisited URLs:")
    for strURL in setVisited:
        print(strURL)

crawl("http://www.bmoseley.com/test.html")