30 Sep 2022
With a local HTML file
For webpages that return an error when requested programmatically, first download the page from the browser as "Web Page, Complete", then:
from bs4 import BeautifulSoup
### Global Variables
file = '/path/to/file/page-saved.html'
v = True # verbose
# parser can be either 'html.parser' (built-in) or 'lxml' (requires `pip install lxml`)
# soup = BeautifulSoup(open(file), 'html.parser')
with open(file) as fp:
    soup = BeautifulSoup(fp, 'lxml')
links = []
# adjust the blacklist as needed - any URL containing one of these keywords is dropped
blacklist = [
    'domain.com', # the site's own domain, to drop absolute internal links (relative links won't match)
    'javascript',
    'twitter',
    'facebook',
    'youtube',
    'linkedin',
    'tiktok',
]
# href=True limits to 'a' tags with href in them
all_href_tags = soup.find_all('a', href=True)
if v:
    print(all_href_tags) # for verification
    print(f"\nlen all_href_tags: {len(all_href_tags)}\n")
for tag in all_href_tags:
    if v:
        print(f"--- {type(tag)}")
        print(f"--- {tag=}")
        print(f"--- {tag.attrs=}")
    # href is guaranteed to be present thanks to href=True above
    link = tag.attrs['href']
    if v:
        print(f"{link=}")
    if not any(ele in link for ele in blacklist):
        if link.endswith('/'):
            link = link[:-1] # normalize: drop the trailing slash
        links.append(link)
    if v:
        print()
print(f"\nlinks:\n")
for link in links:
    print(f"{link}")
print(f"\n{len(links)} links found.\n")