Get all links in an HTML page

30 Sep 2022

with a local HTML file

For webpages that return an error when requested programmatically, first save the page from the browser ("Web Page, Complete"), then:

from bs4 import BeautifulSoup

### Global Variables

file = '/path/to/file/page-saved.html'

v = True # verbose

# parser: either the built-in 'html.parser' or 'lxml' (requires: pip install lxml)
with open(file) as fh:
    soup = BeautifulSoup(fh, 'lxml') # or: BeautifulSoup(fh, 'html.parser')

links = []

# adjust the blacklist as needed: any URL containing one of these keywords is skipped
blacklist = [
    'domain.com', # add the site's own domain here to drop all internal links
    'javascript',
    'twitter',
    'facebook',
    'youtube',
    'linkedin',
    'tiktok',
]

# href=True limits the results to 'a' tags that actually have an href attribute
all_href_tags = soup.find_all('a', href=True)

if v:
    print(all_href_tags) # for verification
    print(f"\nlen all_href_tags: {len(all_href_tags)}\n")

for tag in all_href_tags:
    if v:
        print(f"--- {type(tag)}")
        print(f"--- {tag=}")
        print(f"--- {tag.attrs=}")
    link = tag['href']
    if v:
        print(f"{link=}")
    if not any(ele in link for ele in blacklist):
        if link.endswith('/'):
            link = link[:-1] # drop the trailing slash to normalize the URL
        links.append(link)
    if v:
        print()

print(f"\nlinks:\n")
for link in links:
    print(f"{link}")
print(f"\n{len(links)} links found.\n")
