Day 39 - Using BeautifulSoup to Parse a Web Page

We will use the requests library to open a web page, and BeautifulSoup to extract parts of the page itself.

Starter Code

import requests
from bs4 import BeautifulSoup

# Address of the page this lesson works with.
strUrl = "https://www.bmoseley.com/test.html"

# HTTP request headers; the User-Agent value is how the script identifies
# itself to the web server.
diHeaders = {
    "User-Agent": "Mozilla/5.0 (compatible; BeautifulSoupDemo/1.0; +http://example.com/info)",
}

Finished Code

import requests
from bs4 import BeautifulSoup

# Address of the page this lesson works with.
strUrl = "https://www.bmoseley.com/test.html"

# HTTP request headers; the User-Agent value is how the script identifies
# itself to the web server.
diHeaders = {
    "User-Agent": "Mozilla/5.0 (compatible; BeautifulSoupDemo/1.0; +http://example.com/info)",
}

# Download the page. requests has no default timeout, so one is given
# explicitly to keep the script from hanging on an unresponsive server.
try:
    response = requests.get(strUrl, headers=diHeaders, timeout=10)
except requests.RequestException:
    # Network-level failures (DNS errors, refused connections, timeouts)
    # raise an exception instead of returning a response object.
    print("There was an error!")
    raise SystemExit(1)

# Any status other than 200 is treated as a failure. SystemExit(1) is used
# instead of exit(): exit() is a helper intended for the interactive shell,
# and a non-zero status lets a calling shell see that the script failed.
if response.status_code != 200:
    print("There was an error!")
    raise SystemExit(1)

# Body of the response, decoded to text.
strHtml = response.text

# Uncomment to inspect the raw HTML before it is parsed.
# print(strHtml)

# Build the parse tree using the "html.parser" parser.
soup = BeautifulSoup(strHtml, "html.parser")

print("Page title:")
# Tag.string is None when <title> is empty or contains nested tags, which
# would print the literal text "None". get_text() always returns a str, so a
# missing title and an empty title both fall through to the placeholder.
strTitle = soup.title.get_text() if soup.title else ""
print(strTitle if strTitle else "No title.")

print("\nPage links:")
# Print the href of every <a> tag; anchors with a missing or empty href are
# skipped.
for anchor in soup.find_all("a"):
    strHref = anchor.get("href")
    if not strHref:
        continue
    print(strHref)

print("\nParagraphs:")
# Collect the text of every <p> tag, then print the ones that are not empty.
liParagraphText = [paragraph.get_text() for paragraph in soup.find_all("p")]
for strText in liParagraphText:
    if not strText:
        continue
    print(strText)

print("\nDemo element:")
# select_one() takes a CSS selector and returns None when nothing matches,
# in which case only the heading above is printed.
demoTag = soup.select_one("#demo")
if demoTag is not None:
    print(demoTag.get_text())

print("\nInfo class:")
# select() returns every element matching the CSS selector; when nothing
# matches, the result is empty and the loop body never runs.
for infoTag in soup.select(".info"):
    print(infoTag.get_text())

print("\nImages:")
# For each <img> tag, show its source and alt text. A missing src prints as
# None; a missing alt falls back to the placeholder string.
for image in soup.find_all("img"):
    print("Image Src: ", image.get("src"))
    print("Alt Text: ", image.get("alt", "No alt text"))