WhisperX

Code

28 Mar 2025

CLI

Arguments

whisperx --help

Transcribe

whisperx "/Users/nic/dl/yt/pharma-demo/Mayo Clinic Q&A podcast: A vaccine milestone.mp4" \
  --model large-v2 \
  --align_model WAV2VEC2_ASR_LARGE_LV60K_960H \
  --batch_size 16 \
  --threads 8 \
  --compute_type int8 \
  --max_line_width 40 \
  --max_line_count 1 \
  --language en \
  --task transcribe \
  --verbose True \
  --print_progress True

Translate (only TO English)

--language is not required (the language is auto-detected by default, though detection adds inference time). When set, it must be the source language of the audio (e.g. fr below); the translate task always outputs English.

whisperx "/Users/nic/dl/yt/pharma-demo/Mayo Clinic Q&A podcast: A vaccine milestone.mp4" \
  --model large-v2 \
  --align_model WAV2VEC2_ASR_LARGE_LV60K_960H \
  --batch_size 16 \
  --threads 8 \
  --compute_type int8 \
  --max_line_width 40 \
  --max_line_count 1 \
  --language fr \
  --task translate

Arguments

All

--audio # Positional arg: audio file(s) to transcribe
--model # Name of the Whisper model to use
--model_cache_only # If True, only use cached models (no downloading)
--model_dir # Directory to save model files; defaults to ~/.cache/whisper
--device # Device to use for PyTorch inference (cpu or cuda)
--device_index # Device index for multi-GPU setups
--batch_size # Preferred batch size for inference
--compute_type # Precision for computation: float16, float32, int8
--output_dir, -o # Directory to save the outputs
--output_format, -f # Format of the output file: srt, vtt, txt, etc.
--verbose # Whether to print progress and debug messages
--task # transcribe (same language) or translate (to English)
--language # Language spoken in the audio, or None for detection
--align_model # Name of phoneme-level ASR model used for alignment
--interpolate_method # How to timestamp non-aligned words: nearest, linear, or ignore
--no_align # If set, do not perform phoneme alignment
--return_char_alignments # If set, return character-level alignments in output JSON
--vad_method # VAD method to use: pyannote or silero
--vad_onset # VAD onset threshold, lower if speech is missed
--vad_offset # VAD offset threshold, lower if speech is missed
--chunk_size # Chunk size (in seconds) for merging VAD segments
--diarize # If set, apply diarization for speaker labels
--min_speakers # Minimum number of speakers to expect
--max_speakers # Maximum number of speakers to expect
--temperature # Temperature to use for sampling (0 = deterministic)
--best_of # Number of candidates when sampling with non-zero temperature
--beam_size # Number of beams in beam search (used when temperature = 0)
--patience # Beam search patience; 1.0 = standard beam search
--length_penalty # Token length penalty (alpha) during decoding
--suppress_tokens # Token IDs to suppress during sampling; '-1' suppresses specials
--suppress_numerals # If set, suppress numerals and currency symbols
--initial_prompt # Optional text prompt for the first decoding window
--condition_on_previous_text # If True, use previous text as prompt for the next window
--fp16 # If True, perform inference in float16 (default: True)
--temperature_increment_on_fallback # Increment temperature when fallback decoding is triggered
--compression_ratio_threshold # If gzip compression ratio > this, treat decoding as failed
--logprob_threshold # If avg log prob < this, treat decoding as failed
--no_speech_threshold # If <|nospeech|> prob > this AND decoding failed, mark as silence
--max_line_width # Max characters per line in output (not valid with --no_align)
--max_line_count # Max lines per subtitle segment (not valid with --no_align)
--highlight_words # If set, underline words as spoken in srt/vtt (not valid with --no_align)
--segment_resolution # Resolution of segments: sentence or chunk (not valid with --no_align)
--threads # Number of CPU threads for inference
--hf_token # Hugging Face access token for gated models (e.g. PyAnnote)
--print_progress # If True, show progress in transcribe() and align()

Alphabetically

--align_model # Name of phoneme-level ASR model used for alignment
--audio # Positional arg: audio file(s) to transcribe
--batch_size # Batch size for inference
--beam_size # Number of beams in beam search (when temperature=0)
--best_of # Number of candidates when sampling (temperature>0)
--chunk_size # Size for merging VAD segments (in seconds)
--compression_ratio_threshold # If gzip compression ratio > this, decoding is considered failed
--compute_type # Precision type for computation (float16, float32, int8)
--condition_on_previous_text # If True, use prior segment's output as prompt for next
--device # Inference device (cpu or cuda)
--device_index # Device index (for multi-GPU setups)
--diarize # If set, perform speaker diarization
--fp16 # Whether to perform inference in fp16
--hf_token # Hugging Face token to access gated models
--highlight_words # Underline each word as it's spoken in srt/vtt
--initial_prompt # Optional text prompt for the first segment
--interpolate_method # How to assign timestamps to non-aligned words
--language # Language spoken in the audio (or None for detection)
--length_penalty # Token length penalty during decoding
--logprob_threshold # If avg log prob < this, decoding is considered failed
--max_line_count # Max number of lines in a subtitle segment
--max_line_width # Max characters per line in subtitle output
--max_speakers # Max number of speakers for diarization
--min_speakers # Min number of speakers for diarization
--model # Name of Whisper model to use
--model_cache_only # If True, only use cached models (no downloading)
--model_dir # Directory to cache/save model files
--no_align # Skip phoneme alignment
--no_speech_threshold # Threshold to consider a segment as silence
--output_dir, -o # Directory to save outputs
--output_format, -f # Output format(s): srt, vtt, txt, etc.
--patience # Patience in beam search (higher = more thorough)
--print_progress # If True, show progress during transcribe/align
--return_char_alignments # Include character-level alignments in output
--segment_resolution # Resolution of segments (sentence or chunk)
--suppress_numerals # Suppress numbers and currency symbols in output
--suppress_tokens # List of token IDs to suppress during sampling
--task # transcribe (same-language) or translate (to English)
--temperature # Sampling temperature (0 = deterministic)
--temperature_increment_on_fallback # Temp increment if decoding fails
--threads # Number of threads used for CPU inference
--vad_method # VAD (voice activity detection) method
--vad_offset # Offset threshold for VAD
--vad_onset # Onset threshold for VAD
--verbose # If True, print debug and progress info

beam_size vs best_of

Here’s a deeper dive into beam_size and best_of in Whisper:

--beam_size

•   Used when beam search decoding is enabled.
•   Beam search explores multiple possible transcription paths.
•   beam_size defines how many paths to keep track of at each step.
•   Higher values = better accuracy (it tries more combinations), but slower and more memory-intensive.

Example:
• beam_size=1 → greedy decoding (only best guess at each step).
• beam_size=5 → keeps 5 best paths at each step to choose from.

--best_of

•   Used only when sampling with a non-zero temperature (i.e., when beam search is not active).
•   For each segment, Whisper samples multiple decoding paths (best_of candidates) and picks the best one based on score.
•   Improves quality over single-sample decoding, but is still faster than beam search.

Example:
• best_of=5 → generate 5 different possible transcriptions, keep the highest scoring.

Key Difference
• Use beam_size for structured, accurate decoding (slower, more powerful).
• Use best_of for slightly better greedy decoding (faster, lighter).

Note: only one of the two applies at a given decoding pass: beam_size when temperature is 0, best_of when sampling at temperature > 0 (e.g., during fallback).
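
A minimal sketch of how the two options divide the work, assuming the temperature-fallback behavior of the reference openai/whisper implementation (decode_options_for is an illustrative helper, not a real API):

def decode_options_for(temperature: float, beam_size: int = 5, best_of: int = 5) -> dict:
    if temperature == 0.0:
        return {"beam_size": beam_size}  # deterministic beam search
    return {"temperature": temperature, "best_of": best_of}  # stochastic sampling

# temperature 0, then increments of temperature_increment_on_fallback (0.2)
for t in [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]:
    print(t, decode_options_for(t))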

compute_type

--compute_type controls the precision and performance of model inference. Here’s the breakdown:

float16
• 16-bit floating point (half-precision).
• Fastest on supported GPUs (like NVIDIA with Tensor Cores).
• Slightly less accurate than float32, but usually good enough.
• Best choice for speed on GPU.

float32
• 32-bit floating point (full-precision).
• Most accurate, but slowest and uses the most memory.
• Use on CPU or when debugging precision issues.

int8
• 8-bit integer (quantized).
• Very fast and low memory, but lowest accuracy.
• Great for running on CPU or low-resource devices.
• May degrade transcription quality slightly.

Best choice?
• On GPU: use float16 (best speed vs. quality).
• On CPU: try int8 (speed/memory), fall back to float32 for better accuracy.
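
As a rough rule of thumb in code (a minimal sketch; assumes PyTorch is installed and that CUDA availability is the only deciding factor; pick_compute_type is my own helper name):

import torch

def pick_compute_type() -> str:
    # Heuristic from the notes above: float16 on CUDA GPUs, int8 on CPU.
    # Switch to float32 manually if int8 degrades quality too much.
    return "float16" if torch.cuda.is_available() else "int8"

print(pick_compute_type())  # e.g. "int8" on a CPU-only Mac Studio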

VAD

vad_method

VAD_METHOD: "pyannote" # or "silero"
• Voice Activity Detection (VAD) method to use.
• Determines how the audio is split into segments for processing.
• "pyannote" is a deep learning-based method, while "silero" is a lightweight model.
• Choose based on your needs:
- "pyannote" for complex audio with overlapping speech.
- "silero" for simpler, real-time applications.

pyannote:
• Uses a deep neural network for speech segmentation.
• Offers fine-grained, robust detection even in complex, overlapping speech environments.
• Typically requires more computational resources and may run slower.

silero:
• A lightweight VAD model optimized for speed and low resource use.
• Well-suited for real-time applications with simpler speech patterns.
• May be less precise in challenging audio conditions compared to pyannote.

Benchmarking:
- pyannote: 0m 47s video converted in 1m 2s (ratio: 0.76x).
- silero:

VAD_ONSET: "0.500"
• Speech-probability threshold (not a time value) above which the VAD marks the start of a speech segment.
• 0.500 is the WhisperX default. Lower it if the beginning of speech is being missed; raising it makes onset detection stricter.

VAD_OFFSET: "0.363"
• Speech-probability threshold below which the VAD marks the end of a speech segment.
• 0.363 is the default. Lower it if trailing words are clipped. See the sketch below for how the two thresholds interact.
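
A minimal sketch of the onset/offset hysteresis these two thresholds implement, assuming per-frame speech probabilities as input (illustrative only; the real pyannote binarization adds more logic, e.g. minimum durations):

def binarize(probs, onset=0.500, offset=0.363):
    """Turn per-frame speech probabilities into (start, end) frame spans."""
    segments, start, in_speech = [], None, False
    for i, p in enumerate(probs):
        if not in_speech and p >= onset:
            in_speech, start = True, i   # speech starts: prob rose above onset
        elif in_speech and p < offset:
            segments.append((start, i))  # speech ends: prob fell below offset
            in_speech = False
    if in_speech:
        segments.append((start, len(probs)))
    return segments

print(binarize([0.1, 0.6, 0.7, 0.4, 0.2, 0.8, 0.9, 0.3]))  # [(1, 4), (5, 7)]

Because the offset threshold sits below the onset threshold, a segment that has started stays open through brief dips in speech probability (the 0.4 frame above), instead of flickering on and off.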

chunk_size

CHUNK_SIZE: "30"
• Maximum length (in seconds) of the merged VAD segments fed to the model in one pass.
• 30 seconds is a balanced default for processing time vs. accuracy. Smaller chunks can yield more precise boundaries, while larger chunks can speed up processing.

segment_resolution

--segment_resolution", type=str, default="sentence", choices=["sentence", "chunk"], help="(not possible with --no_align) the maximum number of characters in a line before breaking the line"

Configuration benchmarks

Speed

TEST VIDEO (47s play time) - processing time with WhisperX based on various configurations

All using align_model WAV2VEC2_ASR_LARGE_LV60K_960H

  • batch_size 16, threads 8, compute_type int8, model large-v2 = 47s
  • batch_size 16, threads 16, compute_type int8, model large-v2 = 57s
  • batch_size 16, threads 8, compute_type int8, model large-v3 = 39s
  • batch_size 16, threads 8, compute_type int8, model large-v3 length_penalty 1.2 = 39s
  • batch_size 32, threads 8, compute_type int8, model large-v3 = 38s
  • batch_size 32, threads 16, compute_type int8, model large-v3 = 1m 3s
  • batch_size 16, threads 8, compute_type float32, model large-v3 = 1m 1s
  • batch_size 16, threads 8, compute_type int8, model large-v3 = 40s
  • batch_size 16, threads 8, compute_type int8, model large-v2 = 39s
  • batch_size 16, threads 8, compute_type int8, model turbo = 25s
  • batch_size 16, threads 8, compute_type int8, model turbo, language en = 22s (then re-tested on a long file with this configuration; see below)

2.07x (0m 47s converted in 0m 23s) with model: turbo, align_model: WAV2VEC2_ASR_LARGE_LV60K_960H, batch_size: 16, threads: 8, compute_type: int8
2.3x (0m 47s converted in 0m 20s) with model: turbo, align_model: WAV2VEC2_ASR_LARGE_LV60K_960H, batch_size: 16, threads: 8, compute_type: int8, language: en

3.95x (24m 50s converted in 6m 17s) with model: turbo, align_model: WAV2VEC2_ASR_LARGE_LV60K_960H, batch_size: 16, threads: 8, compute_type: int8, language: en
3.38x (24m 50s converted in 7m 21s) with model: medium.en, align_model: WAV2VEC2_ASR_LARGE_LV60K_960H, batch_size: 16, threads: 8, compute_type: int8, language: en

Conclusion: turbo is faster than medium.en.
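
For reference, the ratios above are simply audio duration divided by processing time (a trivial helper; the function name is mine):

def speed_ratio(audio_s: float, processing_s: float) -> float:
    return audio_s / processing_s

print(f"{speed_ratio(24 * 60 + 50, 6 * 60 + 17):.2f}x")  # 3.95x for the turbo run above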

Code

29 Mar 2025

After a lot of trial and error, I finally got it to work the way I wanted.

FINAL SOLUTION:
- generate only a .json output with word-level timestamps from whisperx
- post-process the .json output to create a new .srt file with a more reasonable number of words per segment
- create a clean .txt version of the SRT file (that can be used for translation)

def generate_en_srt(mp4_path):

    import json
    import os
    import subprocess

    output_dir = os.path.dirname(mp4_path)

    # WhisperX Configuration
    LANGUAGE = "en"
    VERBOSE = "False"
    MODEL = "turbo"
    MODEL_CACHE_ONLY = "False"
    MODEL_DIR = None
    DEVICE = "cpu"  # Changed from "cuda" to "cpu". Only "cpu" is available on my Mac Studio.
    DEVICE_INDEX = "0"
    ALIGN_MODEL = "WAV2VEC2_ASR_LARGE_LV60K_960H"
    BATCH_SIZE = "16"
    COMPUTE_TYPE = "int8"
    MAX_LINE_WIDTH = "45"
    MAX_LINE_COUNT = "1"
    TASK = "transcribe"
    INTERPOLATE_METHOD = "nearest"
    # NO_ALIGN = "False"
    # RETURN_CHAR_ALIGNMENTS = "False"
    VAD_METHOD = "pyannote" # or "silero" / # pyannote provides robust, precise segmentation for challenging audio (at higher computational cost) while silero is lighter and faster but may be less accurate in complex scenarios.
    VAD_ONSET = "0.500"
    VAD_OFFSET = "0.363"
    CHUNK_SIZE = "30"
    # DIARIZE = "False"
    # MIN_SPEAKERS = None
    # MAX_SPEAKERS = None
    TEMPERATURE = "0"
    BEST_OF = "5"
    BEAM_SIZE = "5"
    PATIENCE = "1.0"
    LENGTH_PENALTY = "1.0"
    SUPPRESS_TOKENS = "-1"
    SUPPRESS_NUMERALS = "False"
    # INITIAL_PROMPT = None
    # CONDITION_ON_PREVIOUS_TEXT = "False"
    # FP16 = "True"
    TEMPERATURE_INCREMENT_ON_FALLBACK = "0.2"
    COMPRESSION_RATIO_THRESHOLD = "2.4"
    LOGPROB_THRESHOLD = "-1.0"
    NO_SPEECH_THRESHOLD = "0.6"
    # HIGHLIGHT_WORDS = "False"
    SEGMENT_RESOLUTION = "chunk"
    THREADS = "8"
    # HF_TOKEN = None
    OUTPUT_FORMAT = "json" # "all" or "json" or "srt" / Output json-only and do post-processing to create a new .srt file with a more reasonable number of words per segment. Or output all formats to get also the clean .txt and delete unneeded files in post-processing.
    PRINT_PROGRESS = "False"

    cmd = [
        "whisperx",
        mp4_path,
        "--verbose", VERBOSE,
        "--model", MODEL,
        "--device", DEVICE,
        "--device_index", DEVICE_INDEX,
        # "--align_model", ALIGN_MODEL, # 250329-1534 removing to try to fix the overlapping segments issue
        "--batch_size", BATCH_SIZE,
        "--compute_type", COMPUTE_TYPE,
        "--max_line_width", MAX_LINE_WIDTH,
        "--max_line_count", MAX_LINE_COUNT,
        "--language", LANGUAGE,
        "--task", TASK,
        # "--interpolate_method", INTERPOLATE_METHOD, # 250329-1534 removing to try to fix the overlapping segments issue
        # "--no_align", NO_ALIGN,
        # "--return_char_alignments", RETURN_CHAR_ALIGNMENTS,
        # "--vad_method", VAD_METHOD, # 250329-1534 removing to try to fix the overlapping segments issue
        # "--vad_onset", VAD_ONSET, # 250329-1534 removing to try to fix the overlapping segments issue
        # "--vad_offset", VAD_OFFSET, # 250329-1534 removing to try to fix the overlapping segments issue
        # "--chunk_size", CHUNK_SIZE, # 250329-1534 removing to try to fix the overlapping segments issue
        # "--diarize", DIARIZE,
        # "--min_speakers", MIN_SPEAKERS,
        # "--max_speakers", MAX_SPEAKERS,
        # "--temperature", TEMPERATURE,
        # "--best_of", BEST_OF,
        # "--beam_size", BEAM_SIZE,
        # "--patience", PATIENCE,
        # "--length_penalty", LENGTH_PENALTY,
        # "--suppress_tokens", SUPPRESS_TOKENS,
        # "--initial_prompt", INITIAL_PROMPT,
        # "--condition_on_previous_text", CONDITION_ON_PREVIOUS_TEXT,
        # "--fp16", FP16,
        # "--temperature_increment_on_fallback", TEMPERATURE_INCREMENT_ON_FALLBACK,
        # "--compression_ratio_threshold", COMPRESSION_RATIO_THRESHOLD,
        # "--logprob_threshold", LOGPROB_THRESHOLD,
        # "--no_speech_threshold", NO_SPEECH_THRESHOLD,
        # "--highlight_words", HIGHLIGHT_WORDS,
        # "--segment_resolution", SEGMENT_RESOLUTION,
        # "--threads", THREADS,
        # "--hf_token", HF_TOKEN,
        # "--print_progress", PRINT_PROGRESS,
        "--output_dir", output_dir,
        "--output_format", OUTPUT_FORMAT
    ]

    # Add boolean flags without values
    # if MODEL_CACHE_ONLY.lower() == "true":
    #     cmd.append("--model_cache_only")
    # if VERBOSE.lower() == "true":
    #     cmd.append("--verbose")
    # if SUPPRESS_NUMERALS.lower() == "true":
    #     cmd.append("--suppress_numerals")

    # print(f"\n🔊 Generating 🇬🇧 English SRT for: {os.path.basename(mp4_path)}\n")
    subprocess.run(cmd, check=True)

    # Determine the output SRT path
    base_name = os.path.basename(mp4_path).rsplit(".", 1)[0]
    srt_path = os.path.join(output_dir, f"{base_name}.srt")
    json_path = os.path.join(output_dir, f"{base_name}.json")


    # Post-processing to create a new .srt file with a more reasonable number of words per segment using the json output from whisperx
    with open(json_path, "r") as f:
        data = json.load(f)

        # data["segments"] has a list of segments, each with "words" that have individual timestamps.
        new_segments = []
        max_words_per_segment = 10

        for seg in data["segments"]:
            words = seg["words"]
            current_chunk = []
            for word_info in words:
                current_chunk.append(word_info)
                if len(current_chunk) >= max_words_per_segment:
                    # finalize chunk
                    start_ts = current_chunk[0]["start"]
                    end_ts = current_chunk[-1]["end"]
                    text = " ".join([w["word"] for w in current_chunk])
                    new_segments.append((start_ts, end_ts, text))
                    current_chunk = []

            # leftover words in this segment
            if current_chunk:
                start_ts = current_chunk[0]["start"]
                end_ts = current_chunk[-1]["end"]
                text = " ".join([w["word"] for w in current_chunk])
                new_segments.append((start_ts, end_ts, text))

        # now write new_segments to SRT format:
        def srt_time(sec):
            """Convert float seconds to SRT time format (HH:MM:SS,mmm)"""
            hours = int(sec // 3600)
            minutes = int((sec % 3600) // 60)
            seconds = int(sec % 60)
            milliseconds = int((sec * 1000) % 1000)
            return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

        with open(srt_path, "w") as srt:
            for i, (start, end, text) in enumerate(new_segments, start=1):
                srt.write(f"{i}\n")
                srt.write(f"{srt_time(start)} --> {srt_time(end)}\n")
                srt.write(text.strip() + "\n\n")

        print(f"\n✅ Generated SRT file: {srt_path}\n")


    # Create a clean .txt version of the SRT file
    txt_path = os.path.join(output_dir, f"{base_name}.txt")

    try:
        with open(srt_path, "r") as srt_file, open(txt_path, "w") as txt_file:
            for line in srt_file:
                line = line.strip()
                # Skip empty lines, timestamp lines (containing '-->'), and pure-number subtitle index lines
                if line and not line.isdigit() and '-->' not in line:
                    txt_file.write(line + "\n")

        print(f"✅ Generated clean TXT file: {txt_path}")
    except Exception as e:
        print(f"❌ Error creating TXT file: {str(e)}")



    # Delete the .tsv and .vtt files created in the same folder
    base_path = os.path.join(output_dir, base_name)
    tsv_path = f"{base_path}.tsv"
    vtt_path = f"{base_path}.vtt"

    if os.path.exists(tsv_path):
        try:
            os.remove(tsv_path)
            # print(f"🗑️ Deleted TSV file: {tsv_path}")
        except Exception as e:
            print(f"❌ Error deleting TSV file: {str(e)}")

    if os.path.exists(vtt_path):
        try:
            os.remove(vtt_path)
            # print(f"🗑️ Deleted VTT file: {vtt_path}")
        except Exception as e:
            print(f"❌ Error deleting VTT file: {str(e)}")

    return srt_path
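
Example usage (assumes whisperx is on PATH and the file exists; path reused from the commands above):

if __name__ == "__main__":
    srt = generate_en_srt("/Users/nic/dl/yt/pharma-demo/Mayo Clinic Q&A podcast: A vaccine milestone.mp4")
    print(f"Done: {srt}")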
