How to get a website's metadata with Python

15 Sep 2022

from selenium.webdriver import Chrome, ChromeOptions
from bs4 import BeautifulSoup

# Shared Chrome options used when a browser is launched below:
# '--headless' runs Chrome without opening a visible window.
chrome_options = ChromeOptions()
chrome_options.add_argument('--headless')

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def metadata_from_url(url, driver_path='/Users/xxxx/path/to/local/driver'):
    """Return the ``<meta property=...>`` entries of the page at *url*.

    The page is loaded in a Chrome instance configured by the module-level
    ``chrome_options`` and its source is parsed with BeautifulSoup.

    Args:
        url: Address of the page to load.
        driver_path: Filesystem path to a local ChromeDriver executable.
            The driver has to be downloaded first; for Chrome, see
            https://sites.google.com/chromium.org/driver/downloads

    Returns:
        A list of single-entry dicts, one per ``<meta>`` tag that has a
        non-empty ``property`` attribute, mapping that property to the
        tag's ``content`` attribute (``None`` when ``content`` is absent).
    """
    service = Service(driver_path)
    web = Chrome(service=service, options=chrome_options)
    try:
        web.get(url)
        page_html = web.page_source
    finally:
        # Always shut the browser down, even when navigation fails;
        # otherwise the launched browser session is never cleaned up.
        web.quit()
    soup = BeautifulSoup(page_html, features='html.parser')
    return [
        {tag.get('property'): tag.get('content')}
        for tag in soup.find_all('meta')
        if tag.get('property')
    ]

# Fetch the metadata for the target page; the call yields a list of
# one-entry dicts ({property: content}).
result = metadata_from_url('https://url-to-get-metadata-from.com')

# First dump the whole list in one go, padded by blank lines ...
print(f"\n{result=}\n")

# ... then print each property/content pair on its own line.
for entry in result:
    print(entry)

ex. with

links

social