Fetch website metadata from URL

04 Sep 2022

Requires downloading ChromeDriver from:

ChromeDriver - WebDriver for Chrome - Downloads

google.com

https://sites.google.com/chromium.org/driver/downloads

Important: ensure that the ChromeDriver version matches the version of the Chrome browser installed locally.

from selenium.webdriver import Chrome, ChromeOptions
from bs4 import BeautifulSoup

# Module-level Chrome options used by metadata_from_url(): start the browser
# with the '--headless' flag so no browser window is opened.
chrome_options = ChromeOptions()
chrome_options.add_argument('--headless')

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def metadata_from_url(url, driver_path='/path/to/chromedriver'):
    """Return the ``<meta property=...>`` tags of the page at *url*.

    The page is loaded in Chrome (configured by the module-level
    ``chrome_options``) and the browser's page source is parsed with
    BeautifulSoup.

    Args:
        url: Address of the page to load.
        driver_path: Filesystem path of the ChromeDriver executable.
            Defaults to the placeholder path; point it at the real binary.

    Returns:
        A list of single-entry dicts, one per ``<meta>`` tag that has a
        non-empty ``property`` attribute, each mapping that property to the
        tag's ``content`` attribute (``None`` when the tag has no
        ``content``). Tags appear in document order and duplicate
        properties are kept.
    """
    web = Chrome(service=Service(driver_path), options=chrome_options)
    try:
        web.get(url)
        html = web.page_source
    finally:
        # Always shut the browser down, even when navigation fails;
        # otherwise the Chrome/chromedriver processes are left running.
        web.quit()

    soup = BeautifulSoup(html, features='html.parser')
    return [
        {tag.get('property'): tag.get('content')}
        for tag in soup.find_all('meta')
        if tag.get('property')
    ]

# Example usage: the URL below is a placeholder, replace it with a real page.
# Note that this call runs (and launches Chrome) whenever the file is executed.
result = metadata_from_url('https://www.website/url/') # list of dicts

links

social