19 Apr 2025
Branching my simple converter script out into a full-fledged project.
The goal is to build a powerful document-to-Markdown converter that can handle various input formats and turn them into the cleanest Markdown possible, testing several approaches along the way.
Possibly leveraging AI for post-processing.
This project matters because it is the foundation for feeding clean data into AI workflows.
GitHub
Project code here:

Libraries
Here are some libraries to test.
Microsoft's MarkItDown
Currently implemented.
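Basic usage is a one-liner; here's a minimal sketch of how I'm calling it, following the package's documented basic API (the file path is a placeholder):

```python
from markitdown import MarkItDown

converter = MarkItDown()
# Convert a local document (PDF, DOCX, PPTX, XLSX, HTML, ...) to Markdown text
result = converter.convert("/path/to/input/document.pdf")
print(result.text_content)
```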

QuivrHQ/MegaParse

Pandoc - CLI document converter
https://pandoc.org/

Pandoc is a universal document converter that translates files between markup formats, including Markdown, HTML, LaTeX, and more. It also supports automatic citations and bibliographies, plus customization through templates and filters.
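Since Pandoc is a CLI tool, the easiest way to test it alongside the Python libraries is probably to shell out to it; a minimal sketch (paths are placeholders, and pandoc must be installed and on PATH):

```python
import subprocess

# Convert an HTML file to GitHub-Flavored Markdown with pandoc.
# --wrap=none avoids hard-wrapping lines in the output.
subprocess.run(
    ["pandoc", "input.html", "-f", "html", "-t", "gfm", "--wrap=none", "-o", "output.md"],
    check=True,
)
```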
ppt2desc: Convert PowerPoint files into semantically rich text using vision language models

MinerU: A High-Quality PDF-to-Markdown/JSON Converter Worth Checking Out

markpdfdown: A high-quality PDF to Markdown tool based on large language model visual recognition.
27 Jul 2025

E2M API: converts everything to Markdown (an LLM-friendly format).

Google LangExtract
It does not extract directly to Markdown, but it might be worth exploring for trustworthy, higher-quality structured output in JSONL, which could later be converted to Markdown.
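If the extraction lands in JSONL, the later conversion to Markdown is the easy part; a rough sketch, assuming a hypothetical record schema with `title` and `text` fields (the actual LangExtract output schema would need checking):

```python
import json

def jsonl_to_markdown(jsonl_path: str, md_path: str) -> None:
    # Each input line is one JSON record; 'title' and 'text' are assumed
    # field names, not LangExtract's real schema.
    with open(jsonl_path, encoding="utf-8") as src, \
         open(md_path, "w", encoding="utf-8") as dst:
        for line in src:
            if not line.strip():
                continue
            record = json.loads(line)
            dst.write(f"## {record.get('title', 'Untitled')}\n\n")
            dst.write(f"{record.get('text', '')}\n\n")
```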

Dots OCR - Multilingual Document Text Extraction
A state-of-the-art image/pdf-to-markdown vision language model for intelligent document processing.

# Code
website2md.py
14 Jan 2026
In order to feed a prospect's entire website into an LLM, I need to download the text content of the full site into a single Markdown (.md) file.
This version of the script works, but it downloads each web page to its own .md file (in a folder structure mirroring the website). I still need to add logic to concatenate everything into a single file while demoting the Markdown headings to keep the hierarchy; a rough sketch of that step follows the script below.
```python
#!/usr/bin/env python3
####################
# Download local copy of a website
import asyncio
import sys
import os
import re
from urllib.parse import urljoin, urlparse
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# MAIN
START_URL = "https://website-to-crawl.com"
OUT_DIR = "/path/to/output/folder"
MAX_PAGES = 5000  # safety limit

count = 0
count_total = 0


def clean_filename(url: str) -> str:
    parsed = urlparse(url)
    path = parsed.path.strip("/")
    if not path:
        path = "index"
    path = re.sub(r"[^\w\-\/]", "_", path)
    return path.rstrip("/") + ".md"


def extract_links(html: str, base_url: str, domain: str) -> set[str]:
    soup = BeautifulSoup(html, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith(("mailto:", "tel:", "#", "javascript:")):
            continue
        abs_url = urljoin(base_url, href)
        parsed = urlparse(abs_url)
        if parsed.netloc == domain:
            links.add(parsed.scheme + "://" + parsed.netloc + parsed.path)
    return links


def extract_main_content(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    # Remove junk tags
    for tag in soup(["script", "style", "noscript", "svg", "iframe"]):
        tag.decompose()
    # Remove header/footer/nav elements
    for tag in soup.find_all(["header", "footer", "nav"]):
        tag.decompose()
    # Remove common header/footer classes and IDs
    for selector in [
        {"id": re.compile(r"(header|footer|nav|menu|sidebar|cookie|banner)", re.I)},
        {"class_": re.compile(r"(header|footer|nav|menu|sidebar|cookie|banner|top-bar|bottom-bar)", re.I)},
        {"role": re.compile(r"(banner|navigation|contentinfo)", re.I)},
    ]:
        for tag in soup.find_all(**selector):
            tag.decompose()
    # Remove elements with common footer/header data attributes
    for tag in soup.find_all(attrs={"data-section": re.compile(r"(header|footer)", re.I)}):
        tag.decompose()
    # Prefer semantic main content containers
    main_content = (
        soup.find("main") or
        soup.find("article") or
        soup.find(id=re.compile(r"(main|content|primary)", re.I)) or
        soup.find(class_=re.compile(r"(main-content|page-content|entry-content|post-content)", re.I)) or
        soup.find(role="main") or
        soup.body
    )
    if not main_content:
        return ""
    # Final cleanup: remove any remaining nav-like elements inside main
    for tag in main_content.find_all(class_=re.compile(r"(breadcrumb|pagination|share|social)", re.I)):
        tag.decompose()
    return str(main_content)


async def crawl():
    global count, count_total
    os.makedirs(OUT_DIR, exist_ok=True)
    visited = set()
    to_visit = [START_URL]
    domain = urlparse(START_URL).netloc
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        while to_visit and len(visited) < MAX_PAGES:
            url = to_visit.pop(0)
            if url in visited:
                continue
            count_total += 1
            print(f"→ Fetching #{count_total}: {url}")
            visited.add(url)
            try:
                await page.goto(url, wait_until="networkidle", timeout=60000)
                html = await page.content()
            except Exception as e:
                print(f"  ! Failed: {e}")
                continue
            main_html = extract_main_content(html)
            markdown = md(main_html, heading_style="ATX", strip=["a"])  # Optional: strip links
            # Clean up excessive whitespace
            markdown = re.sub(r"\n{3,}", "\n\n", markdown).strip()
            outfile = os.path.join(OUT_DIR, clean_filename(url))
            os.makedirs(os.path.dirname(outfile), exist_ok=True)
            with open(outfile, "w", encoding="utf-8") as f:
                f.write(f"<!-- Source: {url} -->\n\n{markdown}")
            count += 1
            new_links = extract_links(html, url, domain)
            for link in new_links:
                if link not in visited:
                    to_visit.append(link)
        await browser.close()
    print(f"\n✅ Done! Saved {count}/{count_total} pages to {OUT_DIR}")


asyncio.run(crawl())
```
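Rough sketch of the missing concatenation step: walk the output folder, demote each page's headings by one level, and append everything to a single file under a heading derived from the page's relative path. Untested; paths are placeholders and the heading scheme may need tuning.

```python
#!/usr/bin/env python3
# Merge the per-page .md files produced by website2md.py into one file.
import os
import re

IN_DIR = "/path/to/output/folder"   # same as OUT_DIR in website2md.py
MERGED_FILE = "/path/to/website.md"

def demote_headings(markdown: str, levels: int = 1) -> str:
    # "# Title" -> "## Title", capped at h6, so each page's headings nest
    # under the per-page heading added below.
    return re.sub(
        r"^(#{1,6})(?=\s)",
        lambda m: "#" * min(6, len(m.group(1)) + levels),
        markdown,
        flags=re.M,
    )

with open(MERGED_FILE, "w", encoding="utf-8") as out:
    for root, _dirs, files in os.walk(IN_DIR):
        for name in sorted(files):
            if not name.endswith(".md"):
                continue
            path = os.path.join(root, name)
            rel = os.path.relpath(path, IN_DIR)
            with open(path, encoding="utf-8") as f:
                content = f.read()
            # One top-level heading per page, then the demoted page content.
            out.write(f"# {rel[:-3]}\n\n")
            out.write(demote_headings(content) + "\n\n")

print(f"Merged into {MERGED_FILE}")
```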