We will take our extraction exercises further, pulling email addresses from an actual web page on a live web server.
import requests
from bs4 import BeautifulSoup
import re
# Target page, plus a browser-like User-Agent header so the server treats the request as an ordinary visit
strUrl = "https://www.bmoseley.com/testemail.html"
diHeader = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"}
# Download the page and parse the returned HTML
response = requests.get(strUrl, headers=diHeader)
strHTML = response.text
soup = BeautifulSoup(strHTML, "html.parser")
setEmails = set()
# First pass: collect addresses exposed through mailto: links
for link in soup.find_all('a', href=True):
    strHref = link['href']
    if strHref.startswith('mailto:'):
        strEmail = strHref[7:]  # drop the leading "mailto:" prefix
        setEmails.add(strEmail.strip())
# Second pass: scan the visible page text for anything shaped like an email address
strText = soup.get_text()
liFound = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", strText)
for strEmail in liFound:
    setEmails.add(strEmail.strip())
# Print each unique address found
for strEmail in setEmails:
    print(strEmail)
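Because the page is fetched from a live server, the request can time out or come back with an error status. As a minimal sketch of how the download step could be hardened (the 10-second timeout and the raise_for_status() check are assumptions added here, not part of the example above), the same call can be wrapped like this:
import requests
strUrl = "https://www.bmoseley.com/testemail.html"
diHeader = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"}
strHTML = ""
try:
    # Give up if the server does not answer within 10 seconds (an assumed value)
    response = requests.get(strUrl, headers=diHeader, timeout=10)
    # Raise an exception on 4xx/5xx responses instead of parsing an error page
    response.raise_for_status()
    strHTML = response.text
except requests.RequestException as e:
    print("Request failed:", e)
If the request fails, strHTML stays empty, the extraction loops simply find nothing, and the script finishes cleanly instead of crashing.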