From a2390711a3e50b789e78397818a71a2513e5c673 Mon Sep 17 00:00:00 2001
From: joao
Date: Thu, 11 Jun 2026 16:38:27 -0300
Subject: [PATCH] feat(captions): whisper-small + spoken-language selector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

whisper-tiny with language autodetect was unreliable on non-English audio — a
misdetected language degrades the whole transcript. The fork's Electron
subtitle script (whisper-small, forced language) transcribes the same clips
accurately, so bring the in-app stack up to that configuration:

- bundle/load Xenova/whisper-small instead of whisper-tiny
- thread a Whisper language name from the auto-captions dialog through the
  worker to the transcriber ({ language, task: 'transcribe' }), skipping the
  detection pass; defaults to the app UI locale
- language selector (native-name labels) in the captions dialog

Installer grows by roughly the model-size delta (~200 MB). Dev fetches the
model from the HF CDN on first use as before.

Co-Authored-By: Claude Fable 5 --- scripts/fetch-caption-model.mjs | 4 +- src/components/video-editor/VideoEditor.tsx | 58 ++++++++++++++++++++- src/i18n/locales/ar/editor.json | 3 +- src/i18n/locales/en/editor.json | 3 +- src/i18n/locales/es/editor.json | 3 +- src/i18n/locales/fr/editor.json | 3 +- src/i18n/locales/it/editor.json | 3 +- src/i18n/locales/ja-JP/editor.json | 3 +- src/i18n/locales/ko-KR/editor.json | 3 +- src/i18n/locales/pt-BR/editor.json | 3 +- src/i18n/locales/ru/editor.json | 3 +- src/i18n/locales/tr/editor.json | 3 +- src/i18n/locales/vi/editor.json | 3 +- src/i18n/locales/zh-CN/editor.json | 3 +- src/i18n/locales/zh-TW/editor.json | 3 +- src/lib/captioning/transcribe.ts | 5 ++ src/lib/captioning/transcribe.worker.ts | 11 ++-- src/lib/captioning/transcribeCore.ts | 9 +++- 18 files changed, 106 insertions(+), 20 deletions(-) diff --git a/scripts/fetch-caption-model.mjs b/scripts/fetch-caption-model.mjs index f1d0a1f50..7a9bc49ba 100644 --- a/scripts/fetch-caption-model.mjs +++ b/scripts/fetch-caption-model.mjs @@ -17,7 +17,9 @@ import { fileURLToPath } from "node:url"; const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), ".."); const OUT = path.join(ROOT, "caption-assets"); -const MODEL_ID = "Xenova/whisper-tiny"; +// whisper-small: tiny's transcription quality (esp. with language autodetect on +// non-English audio) was too unreliable to ship as the only captioning path. 
+const MODEL_ID = "Xenova/whisper-small"; const HF_BASE = `https://huggingface.co/${MODEL_ID}/resolve/main`; // Small config/tokenizer/preprocessor files plus the quantized ONNX the ASR pipeline loads by diff --git a/src/components/video-editor/VideoEditor.tsx b/src/components/video-editor/VideoEditor.tsx index 2bb8557e0..984d5cdb5 100644 --- a/src/components/video-editor/VideoEditor.tsx +++ b/src/components/video-editor/VideoEditor.tsx @@ -198,6 +198,45 @@ function getVideoDurationMs(sourcePath: string): Promise { const CAPTION_WORD_CHOICES = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] as const; +// Values are Whisper language names (transformers.js); labels are native names so +// they need no translation. Forcing the language skips Whisper's detection pass. +const CAPTION_LANGUAGES = [ + { value: "english", label: "English" }, + { value: "portuguese", label: "Português" }, + { value: "spanish", label: "Español" }, + { value: "french", label: "Français" }, + { value: "italian", label: "Italiano" }, + { value: "german", label: "Deutsch" }, + { value: "japanese", label: "日本語" }, + { value: "korean", label: "한국어" }, + { value: "russian", label: "Русский" }, + { value: "turkish", label: "Türkçe" }, + { value: "vietnamese", label: "Tiếng Việt" }, + { value: "chinese", label: "中文" }, + { value: "arabic", label: "العربية" }, + { value: "hindi", label: "हिन्दी" }, +] as const; + +const LOCALE_TO_CAPTION_LANGUAGE: Record = { + en: "english", + "pt-BR": "portuguese", + es: "spanish", + fr: "french", + it: "italian", + "ja-JP": "japanese", + "ko-KR": "korean", + ru: "russian", + tr: "turkish", + vi: "vietnamese", + "zh-CN": "chinese", + "zh-TW": "chinese", + ar: "arabic", +}; + +function captionLanguageForLocale(locale: string): string { + return LOCALE_TO_CAPTION_LANGUAGE[locale] ?? 
"english"; +} + export default function VideoEditor() { const { state: editorState, @@ -353,6 +392,7 @@ export default function VideoEditor() { const effectiveShowCursor = showCursor && hasEditableCursorRecording; const showCursorSettings = hasEditableCursorRecording; const { locale, setLocale, t: rawT } = useI18n(); + const [captionLanguage, setCaptionLanguage] = useState(() => captionLanguageForLocale(locale)); const t = useScopedT("editor"); const ts = useScopedT("settings"); const availableLocales = getAvailableLocales(); @@ -2606,6 +2646,7 @@ export default function VideoEditor() { const trimRegionsForTranscribe = shiftTrimRegionsMsForCaptionBuffer(trimRegions, trimMs); const transcribeOptions = { + language: captionLanguage, onStatus: (phase: "model" | "transcribe") => { if (phase === "model") { toast.loading(t("autoCaptions.loadingModel"), { @@ -2700,7 +2741,7 @@ export default function VideoEditor() { setIsAutoCaptioning(false); } }, - [videoPath, trimRegions, pushState, t], + [videoPath, trimRegions, pushState, t, captionLanguage], ); const handleSaveDiagnostic = useCallback(async () => { @@ -2780,6 +2821,21 @@ export default function VideoEditor() { {t("autoCaptions.dialogDescription")}
+
+ + +