30 Sep 2022
with a local HTML file
For webpages that return an error when requested programmatically, first download the page from the browser as "Web Page, Complete", then:
from bs4 import BeautifulSoup
### Global Variables
file = '/path/to/file/page-saved.html'
v = True # verbose
# parsers: 'html.parser' (built-in) or 'lxml' (requires: pip install lxml)
# soup = BeautifulSoup(open(file), 'html.parser')
with open(file) as f:
    soup = BeautifulSoup(f, 'lxml')
links = []
# adjust blacklist as needed - keywords to filter URLs out
blacklist = [
'domain.com', # add the domain of the website to remove all internal links
'javascript',
'twitter',
'facebook',
'youtube',
'linkedin',
'tiktok',
]
# href=True restricts results to <a> tags that actually have an href attribute
all_href_tags = soup.find_all('a', href=True)
if v:
    print(all_href_tags)  # for verification
    print(f"\nlen all_href_tags: {len(all_href_tags)}\n")
for tag in all_href_tags:
    if v:
        print(f"--- {type(tag)}")
        print(f"--- {tag=}")
        print(f"--- {tag.attrs=}")
    # href is guaranteed to exist thanks to href=True above
    link = tag.attrs['href']
    if v:
        print(f"{link=}")
    # keep only links that contain no blacklisted keyword
    if not any(ele in link for ele in blacklist):
        # normalise by dropping a trailing slash
        if link.endswith('/'):
            link = link[:-1]
        links.append(link)
    if v:
        print()
print(f"\nlinks:\n")
for link in links:
print(f"{link}")
print(f"\n{len(links)} links found.\n")