Code
28 Mar 2025
CLI
Arguments
whisperx -h
Transcribe
whisperx "/Users/nic/dl/yt/pharma-demo/Mayo Clinic Q&A podcast: A vaccine milestone.mp4" \
--model large-v2 \
--align_model WAV2VEC2_ASR_LARGE_LV60K_960H \
--batch_size 16 \
--threads 8 \
--compute_type int8 \
--max_line_width 40 \
--max_line_count 1 \
--language en \
--task transcribe \
--verbose True \
--print_progress True
Translate (only TO English)
--language
is not required (the language is auto-detected by default, though detection adds inference time). When set, set it to the source language of the audio being translated TO English.
whisperx "/Users/nic/dl/yt/pharma-demo/Mayo Clinic Q&A podcast: A vaccine milestone.mp4" \
--model large-v2 \
--align_model WAV2VEC2_ASR_LARGE_LV60K_960H \
--batch_size 16 \
--threads 8 \
--compute_type int8 \
--max_line_width 40 \
--max_line_count 1 \
--language en \
--task translate
Arguments
All
--audio # Positional arg: audio file(s) to transcribe
--model # Name of the Whisper model to use
--model_cache_only # If True, only use cached models (no downloading)
--model_dir # Directory to save model files; defaults to ~/.cache/whisper
--device # Device to use for PyTorch inference (cpu or cuda)
--device_index # Device index for multi-GPU setups
--batch_size # Preferred batch size for inference
--compute_type # Precision for computation: float16, float32, int8
--output_dir, -o # Directory to save the outputs
--output_format, -f # Format of the output file: srt, vtt, txt, etc.
--verbose # Whether to print progress and debug messages
--task # transcribe (same language) or translate (to English)
--language # Language spoken in the audio, or None for detection
--align_model # Name of phoneme-level ASR model used for alignment
--interpolate_method # How to timestamp non-aligned words: nearest, linear, or ignore
--no_align # If set, do not perform phoneme alignment
--return_char_alignments # If set, return character-level alignments in output JSON
--vad_method # VAD method to use: pyannote or silero
--vad_onset # VAD onset threshold, lower if speech is missed
--vad_offset # VAD offset threshold, lower if speech is missed
--chunk_size # Chunk size (in seconds) for merging VAD segments
--diarize # If set, apply diarization for speaker labels
--min_speakers # Minimum number of speakers to expect
--max_speakers # Maximum number of speakers to expect
--temperature # Temperature to use for sampling (0 = deterministic)
--best_of # Number of candidates when sampling with non-zero temperature
--beam_size # Number of beams in beam search (used when temperature = 0)
--patience # Beam search patience; 1.0 = standard beam search
--length_penalty # Token length penalty (alpha) during decoding
--suppress_tokens # Token IDs to suppress during sampling; '-1' suppresses specials
--suppress_numerals # If set, suppress numerals and currency symbols
--initial_prompt # Optional text prompt for the first decoding window
--condition_on_previous_text # If True, use previous text as prompt for the next window
--fp16 # If True, perform inference in float16 (default: True)
--temperature_increment_on_fallback # Increment temperature when fallback decoding is triggered
--compression_ratio_threshold # If gzip compression ratio > this, treat decoding as failed
--logprob_threshold # If avg log prob < this, treat decoding as failed
--no_speech_threshold # If <|nospeech|> prob > this AND decoding failed, mark as silence
--max_line_width # Max characters per line in output (not valid with --no_align)
--max_line_count # Max lines per subtitle segment (not valid with --no_align)
--highlight_words # If set, underline words as spoken in srt/vtt (not valid with --no_align)
--segment_resolution # Resolution of segments: sentence or chunk (not valid with --no_align)
--threads # Number of CPU threads for inference
--hf_token # Hugging Face access token for gated models (e.g. PyAnnote)
--print_progress # If True, show progress in transcribe() and align()
Alphabetically
--align_model # Name of phoneme-level ASR model used for alignment
--audio # Positional arg: audio file(s) to transcribe
--batch_size # Batch size for inference
--beam_size # Number of beams in beam search (when temperature=0)
--best_of # Number of candidates when sampling (temperature>0)
--chunk_size # Size for merging VAD segments (in seconds)
--compression_ratio_threshold # If gzip compression ratio > this, decoding is considered failed
--compute_type # Precision type for computation (float16, float32, int8)
--condition_on_previous_text # If True, use prior segment's output as prompt for next
--device # Inference device (cpu or cuda)
--device_index # Device index (for multi-GPU setups)
--diarize # If set, perform speaker diarization
--fp16 # Whether to perform inference in fp16
--hf_token # Hugging Face token to access gated models
--highlight_words # Underline each word as it's spoken in srt/vtt
--initial_prompt # Optional text prompt for the first segment
--interpolate_method # How to assign timestamps to non-aligned words
--language # Language spoken in the audio (or None for detection)
--length_penalty # Token length penalty during decoding
--logprob_threshold # If avg log prob < this, decoding is considered failed
--max_line_count # Max number of lines in a subtitle segment
--max_line_width # Max characters per line in subtitle output
--max_speakers # Max number of speakers for diarization
--min_speakers # Min number of speakers for diarization
--model # Name of Whisper model to use
--model_cache_only # If True, only use cached models (no downloading)
--model_dir # Directory to cache/save model files
--no_align # Skip phoneme alignment
--no_speech_threshold # Threshold to consider a segment as silence
--output_dir, -o # Directory to save outputs
--output_format, -f # Output format(s): srt, vtt, txt, etc.
--patience # Patience in beam search (higher = more thorough)
--print_progress # If True, show progress during transcribe/align
--return_char_alignments # Include character-level alignments in output
--segment_resolution # Resolution of segments (sentence or chunk)
--suppress_numerals # Suppress numbers and currency symbols in output
--suppress_tokens # List of token IDs to suppress during sampling
--task # transcribe (same-language) or translate (to English)
--temperature # Sampling temperature (0 = deterministic)
--temperature_increment_on_fallback # Temp increment if decoding fails
--threads # Number of threads used for CPU inference
--vad_method # VAD (voice activity detection) method
--vad_offset # Offset threshold for VAD
--vad_onset # Onset threshold for VAD
--verbose # If True, print debug and progress info
beam_size vs best_of
Here’s a deeper dive into beam_size and best_of in Whisper:
--beam_size
• Used when beam search decoding is enabled.
• Beam search explores multiple possible transcription paths.
• beam_size defines how many paths to keep track of at each step.
• Higher values = better accuracy (it tries more combinations), but slower and more memory-intensive.
Example:
• beam_size=1 → greedy decoding (only best guess at each step).
• beam_size=5 → keeps 5 best paths at each step to choose from.
--best_of
• Used only when beam search is not used, i.e. when sampling with a non-zero temperature.
• For each segment, Whisper samples multiple decoding paths (best_of candidates), and picks the best one based on score.
• Improves quality over pure greedy decoding, but still faster than beam search.
Example:
• best_of=5 → generate 5 different possible transcriptions, keep the highest scoring.
Key Difference
• Use beam_size for structured, accurate decoding (slower, more powerful).
• Use best_of for slightly better greedy decoding (faster, lighter).
Note: only one of them is in effect at a time: beam_size applies when temperature is 0, best_of when sampling with a non-zero temperature.
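As a quick illustration (hypothetical file name; flag values as discussed above):
# beam search: deterministic decoding at temperature 0
whisperx input.mp4 --temperature 0 --beam_size 5
# sampling: generate 5 candidates at non-zero temperature, keep the best
whisperx input.mp4 --temperature 0.2 --best_of 5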
compute_type
--compute_type controls the precision and performance of model inference. Here’s the breakdown:
float16
• 16-bit floating point (half-precision).
• Fastest on supported GPUs (like NVIDIA with Tensor Cores).
• Slightly less accurate than float32, but usually good enough.
• Best choice for speed on GPU.
float32
• 32-bit floating point (full-precision).
• Most accurate, but slowest and uses the most memory.
• Use on CPU or when debugging precision issues.
int8
• 8-bit integer (quantized).
• Very fast and low memory, but lowest accuracy.
• Great for running on CPU or low-resource devices.
• May degrade transcription quality slightly.
Best choice?
• On GPU: use float16 (best speed vs. quality).
• On CPU: try int8 (speed/memory), fall back to float32 for better accuracy.
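For example (hypothetical file name; flags from the argument list above):
# GPU: fastest with minimal quality loss
whisperx input.mp4 --device cuda --compute_type float16
# CPU: int8 for speed and memory, float32 if accuracy matters more
whisperx input.mp4 --device cpu --compute_type int8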
VAD
vad_method
VAD_METHOD: "pyannote" # or "silero"
• Voice Activity Detection (VAD) method to use.
• Determines how the audio is split into segments for processing.
• "pyannote" is a deep learning-based method, while "silero" is a lightweight model.
• Choose based on your needs:
- "pyannote" for complex audio with overlapping speech.
- "silero" for simpler, real-time applications.
pyannote:
• Uses a deep neural network for speech segmentation.
• Offers fine-grained, robust detection even in complex, overlapping speech environments.
• Typically requires more computational resources and may run slower.
silero:
• A lightweight VAD model optimized for speed and low resource use.
• Well-suited for real-time applications with simpler speech patterns.
• May be less precise in challenging audio conditions compared to pyannote.
Benchmarking:
- pyannote: 0m 47s video converted in 1m 2s (ratio: 0.76x).
- silero:
VAD_ONSET: "0.500"
• Speech-activation threshold (a probability between 0 and 1, not a duration): a segment starts once the detected speech probability rises above this value.
• 0.500 is the WhisperX default. Lower it if the beginnings of utterances are being missed; higher values make segment starts more conservative.
VAD_OFFSET: "0.363"
• Speech-deactivation threshold (also a probability): a segment ends once the speech probability falls below this value.
• 0.363 is the WhisperX default. Lower it if trailing words are being clipped or segments seem too short.
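If speech is being clipped, both thresholds can be lowered from the CLI (illustrative values, hypothetical file name):
whisperx input.mp4 --vad_method pyannote --vad_onset 0.4 --vad_offset 0.3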
chunk_size
CHUNK_SIZE: "30"
• Specifies the length (in seconds) of audio processed at once.
• 30 seconds is a balanced default for processing time vs. accuracy. Smaller chunks can yield more precise boundaries, while larger chunks can speed up processing.
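For example, to trade some speed for tighter segment boundaries (hypothetical file name, illustrative value):
whisperx input.mp4 --chunk_size 15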
segment_resolution
--segment_resolution", type=str, default="sentence", choices=["sentence", "chunk"], help="(not possible with --no_align) the maximum number of characters in a line before breaking the line"
Configuration benchmarks
Speed
TEST VIDEO (47s play time): WhisperX processing times under various configurations.
All using align_model WAV2VEC2_ASR_LARGE_LV60K_960H
- batch_size 16, threads 8, compute_type int8, model large-v2 = 47s
- batch_size 16, threads 16, compute_type int8, model large-v2 = 57s
- batch_size 16, threads 8, compute_type int8, model large-v3 = 39s
- batch_size 16, threads 8, compute_type int8, model large-v3 length_penalty 1.2 = 39s
- batch_size 32, threads 8, compute_type int8, model large-v3 = 38s
- batch_size 32, threads 16, compute_type int8, model large-v3 = 1m 3s
- batch_size 16, threads 8, compute_type float32, model large-v3 = 1m 1s
- batch_size 16, threads 8, compute_type int8, model large-v3 = 40s
- batch_size 16, threads 8, compute_type int8, model large-v2 = 39s
- batch_size 16, threads 8, compute_type int8, model turbo = 25s
- batch_size 16, threads 8, compute_type int8, model turbo, language en = 22s (a longer file was then tested with this configuration; see below)
2.07x (0m 47s converted in 0m 23s) with model: turbo, align_model: WAV2VEC2_ASR_LARGE_LV60K_960H, batch_size: 16, threads: 8, compute_type: int8
2.3x (0m 47s converted in 0m 20s) with model: turbo, align_model: WAV2VEC2_ASR_LARGE_LV60K_960H, batch_size: 16, threads: 8, compute_type: int8, language: en
3.95x (24m 50s converted in 6m 17s) with model: turbo, align_model: WAV2VEC2_ASR_LARGE_LV60K_960H, batch_size: 16, threads: 8, compute_type: int8, language: en
3.38x (24m 50s converted in 7m 21s) with model: medium.en, align_model: WAV2VEC2_ASR_LARGE_LV60K_960H, batch_size: 16, threads: 8, compute_type: int8, language: en
Conclusion: turbo is faster than medium.en.
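These ratios can be reproduced with the shell's time builtin (hypothetical file name; play time divided by wall-clock time gives the speed ratio):
time whisperx input.mp4 --model turbo --align_model WAV2VEC2_ASR_LARGE_LV60K_960H --batch_size 16 --threads 8 --compute_type int8 --language en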
Code
29 Mar 2025
After a lot of trial and error, I finally got it to work the way I wanted.
FINAL SOLUTION:
- generate only a .json output with word-level timestamps from whisperx
- post-process the .json output to create a new .srt file with a more reasonable number of words per segment
- create a clean .txt version of the SRT file (that can be used for translation)
def generate_en_srt(mp4_path):
    import os
    import subprocess
    import json  # needed for the post-processing step below

    output_dir = os.path.dirname(mp4_path)

    # WhisperX Configuration
    LANGUAGE = "en"
    VERBOSE = "False"
    MODEL = "turbo"
    MODEL_CACHE_ONLY = "False"
    MODEL_DIR = None
    DEVICE = "cpu"  # Changed from "cuda" to "cpu". Only "cpu" is available on my Mac Studio.
    DEVICE_INDEX = "0"
    ALIGN_MODEL = "WAV2VEC2_ASR_LARGE_LV60K_960H"
    BATCH_SIZE = "16"
    COMPUTE_TYPE = "int8"
    MAX_LINE_WIDTH = "45"
    MAX_LINE_COUNT = "1"
    TASK = "transcribe"
    INTERPOLATE_METHOD = "nearest"
    # NO_ALIGN = "False"
    # RETURN_CHAR_ALIGNMENTS = "False"
    VAD_METHOD = "pyannote"  # or "silero": pyannote provides robust, precise segmentation for challenging audio (at higher computational cost), while silero is lighter and faster but may be less accurate in complex scenarios.
    VAD_ONSET = "0.500"
    VAD_OFFSET = "0.363"
    CHUNK_SIZE = "30"
    # DIARIZE = "False"
    # MIN_SPEAKERS = None
    # MAX_SPEAKERS = None
    TEMPERATURE = "0"
    BEST_OF = "5"
    BEAM_SIZE = "5"
    PATIENCE = "1.0"
    LENGTH_PENALTY = "1.0"
    SUPPRESS_TOKENS = "-1"
    SUPPRESS_NUMERALS = "False"
    # INITIAL_PROMPT = None
    # CONDITION_ON_PREVIOUS_TEXT = "False"
    # FP16 = "True"
    TEMPERATURE_INCREMENT_ON_FALLBACK = "0.2"
    COMPRESSION_RATIO_THRESHOLD = "2.4"
    LOGPROB_THRESHOLD = "-1.0"
    NO_SPEECH_THRESHOLD = "0.6"
    # HIGHLIGHT_WORDS = "False"
    SEGMENT_RESOLUTION = "chunk"
    THREADS = "8"
    # HF_TOKEN = None
    OUTPUT_FORMAT = "json"  # "all", "json" or "srt". Output json only and post-process it into a new .srt file with a more reasonable number of words per segment. Or output all formats to also get the clean .txt, and delete the unneeded files in post-processing.
    PRINT_PROGRESS = "False"

    cmd = [
        "whisperx",
        mp4_path,
        "--verbose", VERBOSE,
        "--model", MODEL,
        "--device", DEVICE,
        "--device_index", DEVICE_INDEX,
        # "--align_model", ALIGN_MODEL,  # 250329-1534 removed to try to fix the overlapping segments issue
        "--batch_size", BATCH_SIZE,
        "--compute_type", COMPUTE_TYPE,
        "--max_line_width", MAX_LINE_WIDTH,
        "--max_line_count", MAX_LINE_COUNT,
        "--language", LANGUAGE,
        "--task", TASK,
        # "--interpolate_method", INTERPOLATE_METHOD,  # 250329-1534 removed to try to fix the overlapping segments issue
        # "--no_align", NO_ALIGN,
        # "--return_char_alignments", RETURN_CHAR_ALIGNMENTS,
        # "--vad_method", VAD_METHOD,  # 250329-1534 removed to try to fix the overlapping segments issue
        # "--vad_onset", VAD_ONSET,  # 250329-1534 removed to try to fix the overlapping segments issue
        # "--vad_offset", VAD_OFFSET,  # 250329-1534 removed to try to fix the overlapping segments issue
        # "--chunk_size", CHUNK_SIZE,  # 250329-1534 removed to try to fix the overlapping segments issue
        # "--diarize", DIARIZE,
        # "--min_speakers", MIN_SPEAKERS,
        # "--max_speakers", MAX_SPEAKERS,
        # "--temperature", TEMPERATURE,
        # "--best_of", BEST_OF,
        # "--beam_size", BEAM_SIZE,
        # "--patience", PATIENCE,
        # "--length_penalty", LENGTH_PENALTY,
        # "--suppress_tokens", SUPPRESS_TOKENS,
        # "--initial_prompt", INITIAL_PROMPT,
        # "--condition_on_previous_text", CONDITION_ON_PREVIOUS_TEXT,
        # "--fp16", FP16,
        # "--temperature_increment_on_fallback", TEMPERATURE_INCREMENT_ON_FALLBACK,
        # "--compression_ratio_threshold", COMPRESSION_RATIO_THRESHOLD,
        # "--logprob_threshold", LOGPROB_THRESHOLD,
        # "--no_speech_threshold", NO_SPEECH_THRESHOLD,
        # "--highlight_words", HIGHLIGHT_WORDS,
        # "--segment_resolution", SEGMENT_RESOLUTION,
        # "--threads", THREADS,
        # "--hf_token", HF_TOKEN,
        # "--print_progress", PRINT_PROGRESS,
        "--output_dir", output_dir,
        "--output_format", OUTPUT_FORMAT,
    ]

    # Add boolean flags without values
    # if MODEL_CACHE_ONLY.lower() == "true":
    #     cmd.append("--model_cache_only")
    # if VERBOSE.lower() == "true":
    #     cmd.append("--verbose")
    # if SUPPRESS_NUMERALS.lower() == "true":
    #     cmd.append("--suppress_numerals")

    # print(f"\n🔊 Generating 🇬🇧 English SRT for: {os.path.basename(mp4_path)}\n")
    subprocess.run(cmd, check=True)

    # Determine the output SRT path
    base_name = os.path.basename(mp4_path).rsplit(".", 1)[0]
    srt_path = os.path.join(output_dir, f"{base_name}.srt")
    json_path = os.path.join(output_dir, f"{base_name}.json")

    # Post-process the whisperx .json output into a new .srt file with a more
    # reasonable number of words per segment.
    with open(json_path, "r") as f:
        data = json.load(f)

    # data["segments"] is a list of segments, each with "words" carrying individual timestamps.
    new_segments = []
    max_words_per_segment = 10
    for seg in data["segments"]:
        words = seg["words"]  # assumes each word dict has "start"/"end"; alignment can omit them for some tokens
        current_chunk = []
        for word_info in words:
            current_chunk.append(word_info)
            if len(current_chunk) >= max_words_per_segment:
                # finalize chunk
                start_ts = current_chunk[0]["start"]
                end_ts = current_chunk[-1]["end"]
                text = " ".join([w["word"] for w in current_chunk])
                new_segments.append((start_ts, end_ts, text))
                current_chunk = []
        # leftover words in this segment
        if current_chunk:
            start_ts = current_chunk[0]["start"]
            end_ts = current_chunk[-1]["end"]
            text = " ".join([w["word"] for w in current_chunk])
            new_segments.append((start_ts, end_ts, text))

    # now write new_segments to SRT format:
    def srt_time(sec):
        """Convert float seconds to SRT time format (HH:MM:SS,mmm)."""
        total_ms = int(round(sec * 1000))  # round once to avoid float truncation mismatches
        hours, rem = divmod(total_ms, 3_600_000)
        minutes, rem = divmod(rem, 60_000)
        seconds, milliseconds = divmod(rem, 1000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

    with open(srt_path, "w") as srt:
        for i, (start, end, text) in enumerate(new_segments, start=1):
            srt.write(f"{i}\n")
            srt.write(f"{srt_time(start)} --> {srt_time(end)}\n")
            srt.write(text.strip() + "\n\n")
    print(f"\n✅ Generated SRT file: {srt_path}\n")

    # Create a clean .txt version of the SRT file
    txt_path = os.path.join(output_dir, f"{base_name}.txt")
    try:
        with open(srt_path, "r") as srt_file, open(txt_path, "w") as txt_file:
            for line in srt_file:
                line = line.strip()
                # Skip empty lines, timestamp lines (containing '-->'), and subtitle
                # index lines (all digits; a startswith-a-digit test would also drop
                # subtitle text that happens to begin with a number).
                if line and not line.isdigit() and "-->" not in line:
                    txt_file.write(line + "\n")
        print(f"✅ Generated clean TXT file: {txt_path}")
    except Exception as e:
        print(f"❌ Error creating TXT file: {str(e)}")

    # Delete the .tsv and .vtt files created in the same folder
    base_path = os.path.join(output_dir, base_name)
    tsv_path = f"{base_path}.tsv"
    vtt_path = f"{base_path}.vtt"
    if os.path.exists(tsv_path):
        try:
            os.remove(tsv_path)
            # print(f"🗑️ Deleted TSV file: {tsv_path}")
        except Exception as e:
            print(f"❌ Error deleting TSV file: {str(e)}")
    if os.path.exists(vtt_path):
        try:
            os.remove(vtt_path)
            # print(f"🗑️ Deleted VTT file: {vtt_path}")
        except Exception as e:
            print(f"❌ Error deleting VTT file: {str(e)}")

    return srt_path
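Usage from the shell (hypothetical module name; assumes the function is saved in generate_en_srt.py and the path is a placeholder):
python -c 'from generate_en_srt import generate_en_srt; generate_en_srt("/path/to/video.mp4")'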