Table of Contents
30 Sep 2022
with a local HTML file
For webpages that return an error when requested programmatically, first download the page from the browser as "Web Page, Complete".
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
### Global Variables
# Extracts every hyperlink target from a locally saved HTML page, drops the
# ones matching a keyword blacklist, strips one trailing slash, and collects
# the result in `links`.
file = '/path/to/file/page-saved.html'  # page saved from the browser
v = True  # verbose: print intermediate values while extracting

# Parser is either 'html.parser' (standard library) or 'lxml'
# (requires `pip install lxml`); change the second argument to switch.
# The context manager closes the file handle as soon as parsing is done.
with open(file) as html_file:
    soup = BeautifulSoup(html_file, 'lxml')

links = []

# Adjust blacklist as needed - any link containing one of these keywords
# is filtered out.
blacklist = [
    '',  # add the domain of the website to remove all internal links
]
# An empty string is a substring of every link, so an unfilled placeholder
# entry would filter out everything; only non-empty keywords are applied.
active_blacklist = [keyword for keyword in blacklist if keyword]

# href=True limits to 'a' tags with href in them
all_href_tags = soup.find_all('a', href=True)

if v:
    print(all_href_tags)  # for verification
    print(f"\nlen all_href_tags: {len(all_href_tags)}\n")

for tag in all_href_tags:
    if v:
        print(f"--- {type(tag)}")
        print(f"--- {tag=}")
        print(f"--- {tag.attrs=}")
    # href is guaranteed to be present because of href=True above.
    link = tag.attrs['href']
    if v:
        print(f"--- {link=}")
    if not any(keyword in link for keyword in active_blacklist):
        # Drop a single trailing slash from the stored link.
        if link.endswith('/'):
            link = link[:-1]
        links.append(link)

if v:
    for link in links:
        print(link)

print(f"\n{len(links)} links found.\n")