Markdownee

In the AI age, everything should have its .md version.

19 Apr 2025

Branching my simple converter script out into a full-fledged project.

The goal is to create a powerful Markdown parser that can handle various formats and convert them into the cleanest Markdown possible, testing several approaches along the way and possibly leveraging AI for post-processing.

This project matters because it is the foundation for feeding clean data into AI workflows.

GitHub

Project code here:

Libraries

Here are some libraries to test.

Microsoft's MarkItDown

Currently implemented.
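
For reference, the MarkItDown integration looks roughly like this (a minimal sketch based on the library's basic convert() API; the input and output file names are placeholders):

from markitdown import MarkItDown

converter = MarkItDown()
result = converter.convert("document.pdf")  # placeholder input file

with open("document.md", "w", encoding="utf-8") as f:
    f.write(result.text_content)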

QuivrHQ/MegaParse

Pandoc - CLI document converter

https://pandoc.org/

Pandoc is a universal document converter that can translate files between many markup formats, including Markdown, HTML, LaTeX, and more. It also supports automatic citations and bibliographies, plus customization through templates and filters.
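
It can also be driven from Python via subprocess (a minimal sketch; assumes pandoc is on the PATH, and the file names are placeholders):

import subprocess

# Convert a .docx file to GitHub-flavored Markdown without hard line wrapping
subprocess.run(
    ["pandoc", "report.docx", "-t", "gfm", "--wrap=none", "-o", "report.md"],
    check=True,
)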

ppt2desc: Convert PowerPoint files into semantically rich text using vision language models

MinerU: A High-Quality PDF-to-Markdown/JSON Converter Worth Checking Out

markpdfdown: A high-quality PDF to Markdown tool based on large language model visual recognition.

27 Jul 2025

E2M API, converting everything to Markdown (LLM-friendly format).

Google LangExtract

It does not extract directly to Markdown, but might be worth exploring for trustworthy, higher-quality structured outputs in JSONL, which could later be converted to Markdown.
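
As a rough idea of that later step, a JSONL file of extractions could be flattened into Markdown like this (a sketch only; the extraction_class and extraction_text field names are assumptions, not a confirmed LangExtract schema):

import json


def jsonl_to_markdown(jsonl_path: str, md_path: str) -> None:
    # Turn each JSONL record into a small Markdown section.
    # Field names are hypothetical; adapt them to the actual output schema.
    sections = []
    with open(jsonl_path, encoding="utf-8") as f:
        for raw in f:
            record = json.loads(raw)
            sections.append(f"## {record.get('extraction_class', 'item')}")
            sections.append("")
            sections.append(str(record.get("extraction_text", "")))
            sections.append("")
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n".join(sections).strip() + "\n")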

Dots OCR - Multilingual Document Text Extraction

A state-of-the-art image/PDF-to-Markdown vision language model for intelligent document processing.

# Code

website2md.py

14 Jan 2026

To feed a prospect's entire website into an LLM, I need to download the text content of the full site into a single Markdown (.md) file.

This version of the script works, but it downloads each web page to its own .md file (in a folder structure mirroring the website). I still need to add logic to concatenate everything into a single file while shifting the Markdown heading hierarchy down accordingly; a possible shape for that step is sketched after the script.

#!/usr/bin/env python3

####################
# Download local copy of a website

import asyncio
import os
import re
from urllib.parse import urljoin, urlparse

from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# MAIN

START_URL = "https://website-to-crawl.com"
OUT_DIR = "/path/to/output/folder"
MAX_PAGES = 5000  # safety limit

count = 0
count_total = 0


# Map a page URL to a relative .md path that mirrors the site structure
def clean_filename(url: str) -> str:
    parsed = urlparse(url)
    path = parsed.path.strip("/")
    if not path:
        path = "index"
    path = re.sub(r"[^\w\-\/]", "_", path)
    return path.rstrip("/") + ".md"


# Collect same-domain links from a page, ignoring anchors and non-HTTP schemes
def extract_links(html: str, base_url: str, domain: str) -> set[str]:
    soup = BeautifulSoup(html, "html.parser")
    links = set()

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith(("mailto:", "tel:", "#", "javascript:")):
            continue

        abs_url = urljoin(base_url, href)
        parsed = urlparse(abs_url)

        if parsed.netloc == domain:
            links.add(parsed.scheme + "://" + parsed.netloc + parsed.path)

    return links


# Strip scripts, navigation, and other boilerplate; return the main content as HTML
def extract_main_content(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")

    # Remove junk tags
    for tag in soup(["script", "style", "noscript", "svg", "iframe"]):
        tag.decompose()

    # Remove header/footer/nav elements
    for tag in soup.find_all(["header", "footer", "nav"]):
        tag.decompose()

    # Remove common header/footer classes and IDs
    for selector in [
        {"id": re.compile(r"(header|footer|nav|menu|sidebar|cookie|banner)", re.I)},
        {"class_": re.compile(r"(header|footer|nav|menu|sidebar|cookie|banner|top-bar|bottom-bar)", re.I)},
        {"role": re.compile(r"(banner|navigation|contentinfo)", re.I)},
    ]:
        for tag in soup.find_all(**selector):
            tag.decompose()

    # Remove elements with common footer/header data attributes
    for tag in soup.find_all(attrs={"data-section": re.compile(r"(header|footer)", re.I)}):
        tag.decompose()

    # Prefer semantic main content containers
    main_content = (
        soup.find("main") or
        soup.find("article") or
        soup.find(id=re.compile(r"(main|content|primary)", re.I)) or
        soup.find(class_=re.compile(r"(main-content|page-content|entry-content|post-content)", re.I)) or
        soup.find(role="main") or
        soup.body
    )

    if not main_content:
        return ""

    # Final cleanup: remove any remaining nav-like elements inside main
    for tag in main_content.find_all(class_=re.compile(r"(breadcrumb|pagination|share|social)", re.I)):
        tag.decompose()

    return str(main_content)


async def crawl():
    global count, count_total

    os.makedirs(OUT_DIR, exist_ok=True)

    visited = set()
    to_visit = [START_URL]
    domain = urlparse(START_URL).netloc

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        while to_visit and len(visited) < MAX_PAGES:
            url = to_visit.pop(0)
            if url in visited:
                continue

            count_total += 1

            print(f"→ Fetching #{count_total}: {url}")
            visited.add(url)

            try:
                await page.goto(url, wait_until="networkidle", timeout=60000)
                html = await page.content()
            except Exception as e:
                print(f"  ! Failed: {e}")
                continue

            main_html = extract_main_content(html)
            markdown = md(main_html, heading_style="ATX", strip=["a"])  # Optional: strip links

            # Clean up excessive whitespace
            markdown = re.sub(r"\n{3,}", "\n\n", markdown).strip()

            outfile = os.path.join(OUT_DIR, clean_filename(url))
            os.makedirs(os.path.dirname(outfile), exist_ok=True)

            with open(outfile, "w", encoding="utf-8") as f:
                f.write(f"<!-- Source: {url} -->\n\n{markdown}")

            count += 1

            new_links = extract_links(html, url, domain)
            for link in new_links:
                if link not in visited and link not in to_visit:
                    to_visit.append(link)

        await browser.close()

    print(f"\n✅ Done! Saved {count}/{count_total} pages to {OUT_DIR}")


asyncio.run(crawl())
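
The missing concatenation step could look roughly like this (a sketch under assumptions: merge_markdown and the site_title/merged_path arguments are names I'm introducing, the merged file should live outside OUT_DIR so it isn't picked up on a re-run, and heading demotion simply prefixes one extra #):

import os
import re


def merge_markdown(out_dir: str, merged_path: str, site_title: str) -> None:
    # Merge the per-page .md files into one document, demoting each page's
    # headings by one level so they sit under a single top-level site heading.
    parts = [f"# {site_title}\n"]
    for root, _dirs, files in os.walk(out_dir):
        for name in sorted(files):
            if not name.endswith(".md"):
                continue
            path = os.path.join(root, name)
            with open(path, encoding="utf-8") as f:
                text = f.read()
            # "# Title" -> "## Title", "## Title" -> "### Title", etc. (h6 stays h6)
            text = re.sub(r"^(#{1,5}) ", r"#\1 ", text, flags=re.MULTILINE)
            rel = os.path.relpath(path, out_dir)
            parts.append(f"\n## {rel}\n\n{text.strip()}\n")
    with open(merged_path, "w", encoding="utf-8") as f:
        f.write("\n".join(parts))


merge_markdown("/path/to/output/folder", "/path/to/website.md", "Website name")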
