From a2390711a3e50b789e78397818a71a2513e5c673 Mon Sep 17 00:00:00 2001
From: joao <thairahub@icloud.com>
Date: Thu, 11 Jun 2026 16:38:27 -0300
Subject: [PATCH] feat(captions): whisper-small + spoken-language selector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tiny with language autodetect was unreliable on non-English audio — a
misdetected language degrades the whole transcript. The fork's Electron
subtitle script (whisper-small, forced language) transcribes the same
clips accurately, so bring the in-app stack up to that configuration:

- bundle/load Xenova/whisper-small instead of whisper-tiny
- thread a Whisper language name from the auto-captions dialog through
  the worker to the transcriber ({ language, task: 'transcribe' }),
  skipping the detection pass; defaults to the app UI locale
- language selector (native-name labels) in the captions dialog

Installer grows by roughly the model-size delta (~200 MB). Dev fetches
the model from the HF CDN on first use as before.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 scripts/fetch-caption-model.mjs             |  4 +-
 src/components/video-editor/VideoEditor.tsx | 58 ++++++++++++++++++++-
 src/i18n/locales/ar/editor.json             |  3 +-
 src/i18n/locales/en/editor.json             |  3 +-
 src/i18n/locales/es/editor.json             |  3 +-
 src/i18n/locales/fr/editor.json             |  3 +-
 src/i18n/locales/it/editor.json             |  3 +-
 src/i18n/locales/ja-JP/editor.json          |  3 +-
 src/i18n/locales/ko-KR/editor.json          |  3 +-
 src/i18n/locales/pt-BR/editor.json          |  3 +-
 src/i18n/locales/ru/editor.json             |  3 +-
 src/i18n/locales/tr/editor.json             |  3 +-
 src/i18n/locales/vi/editor.json             |  3 +-
 src/i18n/locales/zh-CN/editor.json          |  3 +-
 src/i18n/locales/zh-TW/editor.json          |  3 +-
 src/lib/captioning/transcribe.ts            |  5 ++
 src/lib/captioning/transcribe.worker.ts     | 11 ++--
 src/lib/captioning/transcribeCore.ts        |  9 +++-
 18 files changed, 106 insertions(+), 20 deletions(-)
diff --git a/scripts/fetch-caption-model.mjs b/scripts/fetch-caption-model.mjs
index f1d0a1f50..7a9bc49ba 100644
--- a/scripts/fetch-caption-model.mjs
+++ b/scripts/fetch-caption-model.mjs
@@ -17,7 +17,9 @@ import { fileURLToPath } from "node:url";
 
 const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..");
 const OUT = path.join(ROOT, "caption-assets");
-const MODEL_ID = "Xenova/whisper-tiny";
+// whisper-small: tiny's transcription quality (esp. with language autodetect on
+// non-English audio) was too unreliable to ship as the only captioning path.
+const MODEL_ID = "Xenova/whisper-small";
 const HF_BASE = `https://huggingface.co/${MODEL_ID}/resolve/main`;
 
 // Small config/tokenizer/preprocessor files plus the quantized ONNX the ASR pipeline loads by
diff --git a/src/components/video-editor/VideoEditor.tsx b/src/components/video-editor/VideoEditor.tsx
index 2bb8557e0..984d5cdb5 100644
--- a/src/components/video-editor/VideoEditor.tsx
+++ b/src/components/video-editor/VideoEditor.tsx
@@ -198,6 +198,45 @@ function getVideoDurationMs(sourcePath: string): Promise<number> {
 
 const CAPTION_WORD_CHOICES = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] as const;
 
+// Values are Whisper language names (transformers.js); labels are native names so
+// they need no translation. Forcing the language skips Whisper's detection pass.
+const CAPTION_LANGUAGES = [
+	{ value: "english", label: "English" },
+	{ value: "portuguese", label: "Português" },
+	{ value: "spanish", label: "Español" },
+	{ value: "french", label: "Français" },
+	{ value: "italian", label: "Italiano" },
+	{ value: "german", label: "Deutsch" },
+	{ value: "japanese", label: "日本語" },
+	{ value: "korean", label: "한국어" },
+	{ value: "russian", label: "Русский" },
+	{ value: "turkish", label: "Türkçe" },
+	{ value: "vietnamese", label: "Tiếng Việt" },
+	{ value: "chinese", label: "中文" },
+	{ value: "arabic", label: "العربية" },
+	{ value: "hindi", label: "हिन्दी" },
+] as const;
+
+const LOCALE_TO_CAPTION_LANGUAGE: Record<string, string> = {
+	en: "english",
+	"pt-BR": "portuguese",
+	es: "spanish",
+	fr: "french",
+	it: "italian",
+	"ja-JP": "japanese",
+	"ko-KR": "korean",
+	ru: "russian",
+	tr: "turkish",
+	vi: "vietnamese",
+	"zh-CN": "chinese",
+	"zh-TW": "chinese",
+	ar: "arabic",
+};
+
+function captionLanguageForLocale(locale: string): string {
+	return LOCALE_TO_CAPTION_LANGUAGE[locale] ?? "english";
+}
+
 export default function VideoEditor() {
 	const {
 		state: editorState,
@@ -353,6 +392,7 @@ export default function VideoEditor() {
 	const effectiveShowCursor = showCursor && hasEditableCursorRecording;
 	const showCursorSettings = hasEditableCursorRecording;
 	const { locale, setLocale, t: rawT } = useI18n();
+	const [captionLanguage, setCaptionLanguage] = useState(() => captionLanguageForLocale(locale));
 	const t = useScopedT("editor");
 	const ts = useScopedT("settings");
 	const availableLocales = getAvailableLocales();
@@ -2606,6 +2646,7 @@ export default function VideoEditor() {
 				const trimRegionsForTranscribe = shiftTrimRegionsMsForCaptionBuffer(trimRegions, trimMs);
 
 				const transcribeOptions = {
+					language: captionLanguage,
 					onStatus: (phase: "model" | "transcribe") => {
 						if (phase === "model") {
 							toast.loading(t("autoCaptions.loadingModel"), {
@@ -2700,7 +2741,7 @@ export default function VideoEditor() {
 				setIsAutoCaptioning(false);
 			}
 		},
-		[videoPath, trimRegions, pushState, t],
+		[videoPath, trimRegions, pushState, t, captionLanguage],
 	);
 
 	const handleSaveDiagnostic = useCallback(async () => {
@@ -2780,6 +2821,21 @@ export default function VideoEditor() {
 						<DialogDescription>{t("autoCaptions.dialogDescription")}</DialogDescription>
 					</DialogHeader>
 					<div className="grid gap-4 py-2">
+						<div className="grid gap-2">
+							<Label htmlFor="caption-language">{t("autoCaptions.language")}</Label>
+							<Select value={captionLanguage} onValueChange={setCaptionLanguage}>
+								<SelectTrigger id="caption-language" className="h-9">
+									<SelectValue />
+								</SelectTrigger>
+								<SelectContent>
+									{CAPTION_LANGUAGES.map((lang) => (
+										<SelectItem key={lang.value} value={lang.value}>
+											{lang.label}
+										</SelectItem>
+									))}
+								</SelectContent>
+							</Select>
+						</div>
 						<div className="grid gap-2">
 							<Label htmlFor="caption-min-words">{t("autoCaptions.minWords")}</Label>
 							<Select
diff --git a/src/i18n/locales/ar/editor.json b/src/i18n/locales/ar/editor.json
index 859405d8f..4450f99e2 100644
--- a/src/i18n/locales/ar/editor.json
+++ b/src/i18n/locales/ar/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "لم يتم الكشف عن أي كلام.",
 		"noAudio": "لا يحتوي هذا الفيديو على صوت صالح للنسخ.",
 		"failed": "تعذّر توليد التسميات.",
-		"truncated": "تم نسخ الدقائق الأولى فقط: {{minutes}} دقيقة."
+		"truncated": "تم نسخ الدقائق الأولى فقط: {{minutes}} دقيقة.",
+		"language": "اللغة المنطوقة"
 	},
 	"emptyState": {
 		"title": "لا يوجد مشروع مفتوح",
diff --git a/src/i18n/locales/en/editor.json b/src/i18n/locales/en/editor.json
index 086588fb2..b772afdb9 100644
--- a/src/i18n/locales/en/editor.json
+++ b/src/i18n/locales/en/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "No speech was detected.",
 		"noAudio": "This video has no usable audio to transcribe.",
 		"failed": "Could not generate captions.",
-		"truncated": "Only the first {{minutes}} minutes were transcribed."
+		"truncated": "Only the first {{minutes}} minutes were transcribed.",
+		"language": "Spoken language"
 	},
 	"emptyState": {
 		"title": "No project open",
diff --git a/src/i18n/locales/es/editor.json b/src/i18n/locales/es/editor.json
index 336abce22..cd6695d62 100644
--- a/src/i18n/locales/es/editor.json
+++ b/src/i18n/locales/es/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "No se detectó voz.",
 		"noAudio": "Este video no tiene audio utilizable para transcribir.",
 		"failed": "No se pudieron generar los subtítulos.",
-		"truncated": "Solo se transcribieron los primeros {{minutes}} minutos."
+		"truncated": "Solo se transcribieron los primeros {{minutes}} minutos.",
+		"language": "Idioma hablado"
 	},
 	"emptyState": {
 		"title": "No hay proyecto abierto",
diff --git a/src/i18n/locales/fr/editor.json b/src/i18n/locales/fr/editor.json
index 227c32329..99e4f864d 100644
--- a/src/i18n/locales/fr/editor.json
+++ b/src/i18n/locales/fr/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "Aucune parole n'a été détectée.",
 		"noAudio": "Cette vidéo ne contient pas d'audio exploitable pour la transcription.",
 		"failed": "Impossible de générer les sous-titres.",
-		"truncated": "Seules les {{minutes}} premières minutes ont été transcrites."
+		"truncated": "Seules les {{minutes}} premières minutes ont été transcrites.",
+		"language": "Langue parlée"
 	},
 	"emptyState": {
 		"title": "Aucun projet ouvert",
diff --git a/src/i18n/locales/it/editor.json b/src/i18n/locales/it/editor.json
index c15afd4ba..66dda40db 100644
--- a/src/i18n/locales/it/editor.json
+++ b/src/i18n/locales/it/editor.json
@@ -60,7 +60,8 @@
 		"noneHeard": "Nessun parlato rilevato.",
 		"noAudio": "Questo video non contiene audio utilizzabile per la trascrizione.",
 		"failed": "Impossibile generare i sottotitoli.",
-		"truncated": "Sono stati trascritti solo i primi {{minutes}} minuti."
+		"truncated": "Sono stati trascritti solo i primi {{minutes}} minuti.",
+		"language": "Lingua parlata"
 	},
 	"loadingEditor": "Loading editor...",
 	"emptyState": {
diff --git a/src/i18n/locales/ja-JP/editor.json b/src/i18n/locales/ja-JP/editor.json
index 81b03caec..da5871c0b 100644
--- a/src/i18n/locales/ja-JP/editor.json
+++ b/src/i18n/locales/ja-JP/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "音声が検出されませんでした。",
 		"noAudio": "この動画には書き起こしに使える音声がありません。",
 		"failed": "キャプションを生成できませんでした。",
-		"truncated": "最初の {{minutes}} 分のみが書き起こされました。"
+		"truncated": "最初の {{minutes}} 分のみが書き起こされました。",
+		"language": "音声の言語"
 	},
 	"emptyState": {
 		"title": "プロジェクトが開かれていません",
diff --git a/src/i18n/locales/ko-KR/editor.json b/src/i18n/locales/ko-KR/editor.json
index c15b1872e..41271f18d 100644
--- a/src/i18n/locales/ko-KR/editor.json
+++ b/src/i18n/locales/ko-KR/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "음성이 감지되지 않았습니다.",
 		"noAudio": "이 동영상에는 전사에 사용할 수 있는 음성이 없습니다.",
 		"failed": "자막을 생성할 수 없습니다.",
-		"truncated": "처음 {{minutes}}분만 전사되었습니다."
+		"truncated": "처음 {{minutes}}분만 전사되었습니다.",
+		"language": "음성 언어"
 	},
 	"emptyState": {
 		"title": "열린 프로젝트 없음",
diff --git a/src/i18n/locales/pt-BR/editor.json b/src/i18n/locales/pt-BR/editor.json
index d9121be5d..d5f11e48d 100644
--- a/src/i18n/locales/pt-BR/editor.json
+++ b/src/i18n/locales/pt-BR/editor.json
@@ -60,7 +60,8 @@
 		"noneHeard": "Nenhuma fala foi detectada.",
 		"noAudio": "Este vídeo não tem áudio utilizável para transcrição.",
 		"failed": "Não foi possível gerar as legendas.",
-		"truncated": "Apenas os primeiros {{minutes}} minutos foram transcritos."
+		"truncated": "Apenas os primeiros {{minutes}} minutos foram transcritos.",
+		"language": "Idioma falado"
 	},
 	"loadingEditor": "Loading editor...",
 	"emptyState": {
diff --git a/src/i18n/locales/ru/editor.json b/src/i18n/locales/ru/editor.json
index 9351b1ab7..06ca125b6 100644
--- a/src/i18n/locales/ru/editor.json
+++ b/src/i18n/locales/ru/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "Речь не обнаружена.",
 		"noAudio": "В этом видео нет звука, пригодного для расшифровки.",
 		"failed": "Не удалось создать субтитры.",
-		"truncated": "Расшифрованы только первые {{minutes}} мин."
+		"truncated": "Расшифрованы только первые {{minutes}} мин.",
+		"language": "Язык речи"
 	},
 	"emptyState": {
 		"title": "Нет открытых проектов",
diff --git a/src/i18n/locales/tr/editor.json b/src/i18n/locales/tr/editor.json
index f91f65d20..f94741186 100644
--- a/src/i18n/locales/tr/editor.json
+++ b/src/i18n/locales/tr/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "Konuşma algılanmadı.",
 		"noAudio": "Bu videoda yazıya dökülebilecek kullanılabilir bir ses yok.",
 		"failed": "Altyazılar oluşturulamadı.",
-		"truncated": "Yalnızca ilk {{minutes}} dakika yazıya döküldü."
+		"truncated": "Yalnızca ilk {{minutes}} dakika yazıya döküldü.",
+		"language": "Konuşulan dil"
 	},
 	"emptyState": {
 		"title": "Açık proje yok",
diff --git a/src/i18n/locales/vi/editor.json b/src/i18n/locales/vi/editor.json
index 8bad439b5..01c4c77bc 100644
--- a/src/i18n/locales/vi/editor.json
+++ b/src/i18n/locales/vi/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "Không phát hiện thấy lời nói.",
 		"noAudio": "Video này không có âm thanh dùng được để chuyển thành văn bản.",
 		"failed": "Không thể tạo phụ đề.",
-		"truncated": "Chỉ {{minutes}} phút đầu tiên được chuyển thành văn bản."
+		"truncated": "Chỉ {{minutes}} phút đầu tiên được chuyển thành văn bản.",
+		"language": "Ngôn ngữ nói"
 	},
 	"emptyState": {
 		"title": "Không có dự án nào được mở",
diff --git a/src/i18n/locales/zh-CN/editor.json b/src/i18n/locales/zh-CN/editor.json
index d6cf2764a..87186ae40 100644
--- a/src/i18n/locales/zh-CN/editor.json
+++ b/src/i18n/locales/zh-CN/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "未检测到语音。",
 		"noAudio": "此视频没有可用于转写的音频。",
 		"failed": "无法生成字幕。",
-		"truncated": "仅转写了最前 {{minutes}} 分钟。"
+		"truncated": "仅转写了最前 {{minutes}} 分钟。",
+		"language": "语音语言"
 	},
 	"emptyState": {
 		"title": "未打开任何项目",
diff --git a/src/i18n/locales/zh-TW/editor.json b/src/i18n/locales/zh-TW/editor.json
index e7ddfd779..70cc78611 100644
--- a/src/i18n/locales/zh-TW/editor.json
+++ b/src/i18n/locales/zh-TW/editor.json
@@ -61,7 +61,8 @@
 		"noneHeard": "未偵測到語音。",
 		"noAudio": "此影片沒有可用於轉寫的音訊。",
 		"failed": "無法產生字幕。",
-		"truncated": "僅轉寫了最前 {{minutes}} 分鐘。"
+		"truncated": "僅轉寫了最前 {{minutes}} 分鐘。",
+		"language": "語音語言"
 	},
 	"emptyState": {
 		"title": "未開啟任何專案",
diff --git a/src/lib/captioning/transcribe.ts b/src/lib/captioning/transcribe.ts
index a72a89673..28ddf2a68 100644
--- a/src/lib/captioning/transcribe.ts
+++ b/src/lib/captioning/transcribe.ts
@@ -26,6 +26,8 @@ export interface TranscribeWorkerRequest {
 	useLocalModels: boolean;
 	/** Base URL of bundled resources (packaged: resourcesPath file:// URL); used when `useLocalModels`. */
 	assetBaseUrl?: string;
+	/** Whisper language name (e.g. "portuguese"); skips autodetection when set. */
+	language?: string;
 }
 
 /** Messages the transcription worker posts back to the renderer. */
@@ -47,6 +49,8 @@ export function transcribeMono16kToSegments(
 		trimRegions?: TrimRegion[];
 		onStatus?: (phase: "model" | "transcribe") => void;
 		signal?: AbortSignal;
+		/** Whisper language name (e.g. "portuguese"); skips autodetection when set. */
+		language?: string;
 	},
 ): Promise<TranscribeMono16kResult> {
 	if (options?.signal?.aborted) {
@@ -100,6 +104,7 @@ export function transcribeMono16kToSegments(
 			trimRegions: options?.trimRegions ?? [],
 			useLocalModels,
 			assetBaseUrl,
+			language: options?.language,
 		};
 		worker.postMessage(request);
 	});
diff --git a/src/lib/captioning/transcribe.worker.ts b/src/lib/captioning/transcribe.worker.ts
index ab65b2eea..30ad795b3 100644
--- a/src/lib/captioning/transcribe.worker.ts
+++ b/src/lib/captioning/transcribe.worker.ts
@@ -63,18 +63,20 @@ async function loadTranscriber(opts: {
 			// Dev (http://localhost): fetch from the remote CDN, which works there.
 			env.allowLocalModels = false;
 		}
-		// Default tiny weights only: the `output_attentions` revision regresses inference in
-		// some environments (empty chunks, thrown errors) while phrase mode works on this model.
+		// Default weights only: the `output_attentions` revision regresses inference in
+		// some environments (empty chunks, thrown errors) while phrase mode works.
+		// whisper-small over tiny: tiny's accuracy (especially on non-English audio)
+		// was too unreliable; small matches the fork's proven extract-subtitles setup.
 		const transcriber = (await pipeline(
 			"automatic-speech-recognition",
-			"Xenova/whisper-tiny",
+			"Xenova/whisper-small",
 		)) as unknown as TranscriberFn;
 		return transcriber;
 	});
 }
 
 self.onmessage = async (event: MessageEvent<TranscribeWorkerRequest>) => {
-	const { samples, trimRegions, useLocalModels, assetBaseUrl } = event.data;
+	const { samples, trimRegions, useLocalModels, assetBaseUrl, language } = event.data;
 	try {
 		post({ type: "status", phase: "model" });
 		const transcriber = await loadTranscriber({ useLocalModels, assetBaseUrl });
@@ -84,6 +86,7 @@ self.onmessage = async (event: MessageEvent<TranscribeWorkerRequest>) => {
 			transcriber,
 			samples,
 			trimRegions ?? [],
+			language,
 		);
 
 		post({ type: "result", segments, granularity });
diff --git a/src/lib/captioning/transcribeCore.ts b/src/lib/captioning/transcribeCore.ts
index 9834e3654..89dfa2dda 100644
--- a/src/lib/captioning/transcribeCore.ts
+++ b/src/lib/captioning/transcribeCore.ts
@@ -127,15 +127,19 @@ function segmentsFromTranscriberChunks(
 async function runTranscriberOnSlice(
 	transcriber: TranscriberFn,
 	samples: Float32Array,
-	opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase" },
+	opts: { forceFullSequences: boolean; timestampMode: "word" | "phrase"; language?: string },
 ): Promise<unknown> {
 	const durationSec = samples.length / 16_000;
 	// Only chunk long clips; short-audio chunking regressed some Whisper.js runs (empty chunks).
 	const chunking = durationSec > 30 ? { chunk_length_s: 30, stride_length_s: 5 } : {};
+	// Forcing the language skips Whisper's detection pass, which is the dominant
+	// failure mode on non-English audio (a misdetect degrades the whole transcript).
+	const language = opts.language ? { language: opts.language, task: "transcribe" } : {};
 	return transcriber(samples, {
 		return_timestamps: opts.timestampMode === "word" ? "word" : true,
 		force_full_sequences: opts.forceFullSequences,
 		...chunking,
+		...language,
 	});
 }
 
@@ -185,6 +189,7 @@ export async function runTranscription(
 	transcriber: TranscriberFn,
 	samples: Float32Array,
 	trims: TrimRegion[],
+	language?: string,
 ): Promise<TranscribeMono16kResult> {
 	const transcribeOne = async (
 		ignoreTrims: boolean,
@@ -198,6 +203,7 @@ export async function runTranscription(
 				const result = await runTranscriberOnSlice(transcriber, slice, {
 					forceFullSequences,
 					timestampMode,
+					language,
 				});
 				return segmentsFromTranscriberChunks(
 					extractChunksFromAsrResult(result),
@@ -223,6 +229,7 @@ export async function runTranscription(
 				const result = await runTranscriberOnSlice(transcriber, slice, {
 					forceFullSequences,
 					timestampMode,
+					language,
 				});
 				const tOff = offset / 16_000;
 				all.push(