We will take our extraction exercises further, pulling email addresses from an actual web page on a live web server.
import requests
from bs4 import BeautifulSoup
import re
# Target page, plus a browser-like User-Agent header so the server treats the request as an ordinary visit
strUrl = "https://www.bmoseley.com/testemail.html"
diHeader = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"}
# Download the page and parse the returned HTML
response = requests.get(strUrl, headers=diHeader)
strHTML = response.text
soup = BeautifulSoup(strHTML, "html.parser")
setEmails = set()
# First pass: collect addresses exposed through mailto: links
for link in soup.find_all('a', href=True):
    strHref = link['href']
    if strHref.startswith('mailto:'):
        strEmail = strHref[7:]  # drop the leading "mailto:" prefix
        setEmails.add(strEmail.strip())
# Second pass: scan the visible page text for anything shaped like an email address
strText = soup.get_text()
liFound = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", strText)
for strEmail in liFound:
    setEmails.add(strEmail.strip())
# Print each unique address found
for strEmail in setEmails:
    print(strEmail)
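Because the page is fetched from a live server, the request can time out or come back with an error status. As a minimal sketch of how the download step could be hardened (the 10-second timeout and the raise_for_status() check are assumptions added here, not part of the example above), the same call can be wrapped like this:
import requests
strUrl = "https://www.bmoseley.com/testemail.html"
diHeader = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"}
strHTML = ""
try:
    # Give up if the server does not answer within 10 seconds (an assumed value)
    response = requests.get(strUrl, headers=diHeader, timeout=10)
    # Raise an exception on 4xx/5xx responses instead of parsing an error page
    response.raise_for_status()
    strHTML = response.text
except requests.RequestException as e:
    print("Request failed:", e)
If the request fails, strHTML stays empty, the extraction loops simply find nothing, and the script finishes cleanly instead of crashing.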