Get all links in an HTML page

Table of Contents

30 Sep 2022

with a local HTML file

For webpages that return an error when requested programmatically, first download the page from your browser as "Web Page, Complete", then run:

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re

### Global Variables

file = '/path/to/file/page-saved.html'

v = True # verbose

# Parsers to use are either 'html.parser' (built in) or 'lxml'
# (which must be installed first: pip install lxml).
# The file is opened in a `with` block so the handle is closed as soon as
# BeautifulSoup has finished reading it.
with open(file) as html_file:
    # soup = BeautifulSoup(html_file, 'html.parser')
    soup = BeautifulSoup(html_file, 'lxml')

links = []

# adjust blacklist as needed - keywords to filter URLs out
blacklist = [
    '', # add the domain of the website to remove all internal links
]

# An empty string is a substring of every URL, so an unfilled placeholder
# entry would filter out *all* links. Ignore empty keywords when matching.
active_blacklist = [keyword for keyword in blacklist if keyword]

# href=True limits to 'a' tags with href in them
all_href_tags = soup.find_all('a', href=True)

if v:
    print(all_href_tags) # for verification
    print(f"\nlen all_href_tags: {len(all_href_tags)}\n")

for tag in all_href_tags:
    if v:
        print(f"--- {type(tag)}")
        print(f"--- {tag=}")
        print(f"--- {tag.attrs=}")
    link = tag.attrs['href']
    if v:
        print(f"--- {link=}")
    # Keep the link only if it contains none of the blacklisted keywords.
    if not any(keyword in link for keyword in active_blacklist):
        # Drop a single trailing slash before storing the link.
        if link.endswith('/'):
            link = link[:-1]
        links.append(link)
    if v:
        print() # blank line between tags in verbose output

for link in links:
    print(link)
print(f"\n{len(links)} links found.\n")