Web Scraping Assignment 2
Table of Contents
Beginning
The goal of this exercise is to crawl through a set of anchor links to get a particular name stored in the nth anchor tag. The assignment specifically says to use urllib but, if you go to the documentation for urllib.request, it tells you to use requests, which, if you go to its documentation, says that it's in maintenance mode while work is being done on Requests III… anyway, I like using Requests-HTML so I'll use that and urllib side-by-side.
Imports
Python
import re
import urllib
import urllib.request
PyPi
from bs4 import BeautifulSoup
from requests_html import HTMLSession
Setup
The URL
# Every page in the assignment's name graph lives at
# "known_by_<Name>.html" under this base, and links to more such pages.
BASE_URL = "http://py4e-data.dr-chuck.net/known_by_"

# Starting page for the worked sample exercise.
SAMPLE_URL = BASE_URL + "Fikret.html"

# Starting page for the graded assignment.
ASSIGNMENT_URL = BASE_URL + "Abdalroof.html"
Middle
The Sample Exercise
The Easy Way
# Crawl the sample chain with requests-html: start at SAMPLE_URL and, on
# each page, follow the anchor at position 3 (index 2), four hops total.
session = HTMLSession()
response = session.get(SAMPLE_URL)
assert response.ok

# Extract the starting name from the URL.  The dot before "html" is
# escaped (and the pattern is a raw string): the original ".html" treated
# the dot as a wildcard, so it could match an arbitrary character there.
expression = re.compile(r"_(?P<name>[^_.]+)\.html")
print(f"Name: {expression.search(SAMPLE_URL).group('name')}")

for hop in range(4):
    links = response.html.find("a")
    link_element = links[2]
    print(f"Name: {link_element.text}")
    # Follow the chosen link to the next page in the chain.
    response = session.get(link_element.attrs["href"])

# link_element still holds the last link followed — that's the answer.
print(f"Final Answer: {link_element.text}")
Name: Fikret Name: Montgomery Name: Mhairade Name: Butchi Name: Anayah Final Answer: Anayah
The Slightly Less Easy Way
# Same sample crawl as above, but done with urllib + BeautifulSoup
# instead of requests-html: four hops, third anchor on every page.
response = urllib.request.urlopen(SAMPLE_URL)
print(f"Name: {expression.search(SAMPLE_URL).group('name')}")

for _ in range(4):
    page = BeautifulSoup(response.read(), "html.parser")
    anchors = page.find_all("a")
    link_element = anchors[2]
    print(f"Name: {link_element.text}")
    # Open the next page in the chain.
    response = urllib.request.urlopen(link_element["href"])

# The last link followed holds the final name.
print(f"\nFinal Answer: {link_element.text}")
Name: Fikret Name: Montgomery Name: Mhairade Name: Butchi Name: Anayah Final Answer: Anayah
The Real One
The Easy Way
# The real assignment with requests-html: start at ASSIGNMENT_URL and,
# on each page, follow the anchor at position 18 (index 17), seven hops.
session = HTMLSession()
response = session.get(ASSIGNMENT_URL)
assert response.ok

# Re-build the name-extracting pattern with the literal dot escaped
# (the original ".html" let the dot match any character) and as a raw
# string so the backslash survives.
expression = re.compile(r"_(?P<name>[^_.]+)\.html")
print(f"Name: {expression.search(ASSIGNMENT_URL).group('name')}")

for hop in range(7):
    links = response.html.find("a")
    link_element = links[17]
    print(f"Name: {link_element.text}")
    # Follow the chosen link to the next page in the chain.
    response = session.get(link_element.attrs["href"])

# link_element still holds the last link followed — that's the answer.
print(f"Final Answer: {link_element.text}")
Name: Abdalroof Name: Billi Name: Jayse Name: Amaarah Name: Cesar Name: Rosheen Name: Mohamed Name: Kiara Final Answer: Kiara
The Assignment Way
# The assignment crawl done "the assignment way" with urllib +
# BeautifulSoup: seven hops, grabbing the 18th anchor on each page.
HOPS = 7
FIND_AT_INDEX = 17  # the 18th anchor tag, zero-indexed

response = urllib.request.urlopen(ASSIGNMENT_URL)
print(f"Name: {expression.search(ASSIGNMENT_URL).group('name')}")

for _ in range(HOPS):
    document = BeautifulSoup(response.read(), "html.parser")
    link_element = document.find_all("a")[FIND_AT_INDEX]
    print(f"Name: {link_element.text}")
    # Open the next page in the chain.
    response = urllib.request.urlopen(link_element["href"])

# The last link followed holds the final name.
print(f"\nFinal Answer: {link_element.text}")
Name: Abdalroof Name: Billi Name: Jayse Name: Amaarah Name: Cesar Name: Rosheen Name: Mohamed Name: Kiara Final Answer: Kiara