We will combine the requests library, BeautifulSoup, and regular expressions to create a web crawler that can extract links, email addresses, and other information from an entire web site.
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
# Regular expression used to pull email addresses out of page text
strRegExEmail = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
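
Before using the pattern in the crawler, it can be exercised on its own with re.findall. This is just a quick sanity check; the sample string and the addresses in it are made up purely for illustration:

liSample = re.findall(strRegExEmail,
                      "Contact sales@example.com or support@example.org for details.")
print(liSample)   # ['sales@example.com', 'support@example.org']
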
def crawl(strStartUrl):
    # Restrict the crawl to the domain of the starting URL
    urlParsed = urlparse(strStartUrl)
    strDomain = urlParsed.netloc

    setVisited = set()         # URLs already crawled
    setEmails = set()          # email addresses found
    setAllLinks = set()        # in-domain links discovered
    liToVisit = [strStartUrl]  # queue of URLs waiting to be crawled

    # Identify the crawler to the server with a descriptive User-Agent
    diHeaders = {
        "User-Agent": "Mozilla/5.0 (compatible; WebCrawler/1.0; +http://example.com/info)"
    }

    while liToVisit:
        strCurrentURL = liToVisit.pop(0)
        if strCurrentURL in setVisited:
            continue
        print(f"Crawling: {strCurrentURL}")
        setVisited.add(strCurrentURL)

        # Fetch the page; skip it on any HTTP error or network problem
        try:
            response = requests.get(strCurrentURL, headers=diHeaders, timeout=10)
            if response.status_code != 200:
                continue
        except Exception as e:
            print(f"Error in {strCurrentURL}: {e}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        # Examine every anchor tag on the page
        for a_tag in soup.find_all("a", href=True):
            strHref = a_tag["href"]

            # mailto: links give us an email address directly
            if strHref.startswith("mailto:"):
                strEmail = strHref.split("mailto:")[1].split('?')[0]
                if strEmail:
                    setEmails.add(strEmail)
                continue

            # Resolve relative links, then keep only http/https URLs on the same domain
            strFullURL = urljoin(strCurrentURL, strHref)
            strParsedLink = urlparse(strFullURL)
            if strParsedLink.scheme in ['http', 'https'] and strParsedLink.netloc == strDomain:
                if strFullURL not in setAllLinks:
                    setAllLinks.add(strFullURL)
                if strFullURL not in setVisited and strFullURL not in liToVisit:
                    liToVisit.append(strFullURL)

        # Search the visible page text for email addresses
        strPageText = soup.get_text()
        liFoundEmails = re.findall(strRegExEmail, strPageText)
        for strEmail in liFoundEmails:
            setEmails.add(strEmail)

    # Report everything the crawl found
    print("Crawler Report:")
    print("Emails:")
    for strEmail in setEmails:
        print(strEmail)
    print("\nLinks Found:")
    for strURL in setAllLinks:
        print(strURL)
    print("\nVisited URLs:")
    for strURL in setVisited:
        print(strURL)
crawl("http://www.bmoseley.com/test.html")