From e608c377602d24c7d63af779858b8c377dcc4803 Mon Sep 17 00:00:00 2001 From: Henry Paulino Date: Thu, 23 Apr 2026 13:51:43 +0100 Subject: [PATCH 1/3] fix(ios): move MLXArray copy off audio tap path in snapshot() Reading audioBuffer into a `let` only bumped the copy-on-write refcount, so the next append from the real-time audio tap forced a full duplication of the sample array under bufferLock. Take exclusive ownership of the buffer, release the lock, then construct MLXArray off the audio path. The defer block merges any samples appended during the copy back into audioBuffer so cumulative capture semantics are preserved across calls. --- package/ios/Sources/AudioCaptureManager.swift | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/package/ios/Sources/AudioCaptureManager.swift b/package/ios/Sources/AudioCaptureManager.swift index 9aa601b..9ed4532 100644 --- a/package/ios/Sources/AudioCaptureManager.swift +++ b/package/ios/Sources/AudioCaptureManager.swift @@ -88,10 +88,22 @@ class AudioCaptureManager { } func snapshot() -> MLXArray? { + // Take exclusive ownership of the accumulated buffer so the audio tap + // gets fresh empty storage to append into; the expensive MLXArray copy + // then happens off the audio path. Samples are merged back afterward + // so the buffer keeps accumulating across calls. 
bufferLock.lock() - let samples = audioBuffer + var samples = audioBuffer + audioBuffer.removeAll() bufferLock.unlock() + defer { + bufferLock.lock() + samples.append(contentsOf: audioBuffer) + audioBuffer = samples + bufferLock.unlock() + } + guard samples.count >= 16000 else { return nil } return MLXArray(samples) } From 507aaab3efb5ee4a6ced4898bd131ca5c7fa99c4 Mon Sep 17 00:00:00 2001 From: Henry Paulino Date: Thu, 23 Apr 2026 15:54:00 +0100 Subject: [PATCH 2/3] fix(stt): accumulate streaming transcripts and clear audio buffer per snapshot - Use snapshotAndClear() so each transcribeBuffer call processes only new audio - Append partial results in the example instead of overwriting, and merge with final text on stop --- example/app/(tabs)/stt.tsx | 7 ++++--- package/ios/Sources/HybridSTT.swift | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/example/app/(tabs)/stt.tsx b/example/app/(tabs)/stt.tsx index e9ebe2e..656355f 100644 --- a/example/app/(tabs)/stt.tsx +++ b/example/app/(tabs)/stt.tsx @@ -68,8 +68,8 @@ export default function STTScreen() { try { const text = await STT.transcribeBuffer() if (text) { - streamingRef.current = text - setStreamingText(text) + streamingRef.current = `${streamingRef.current} ${text}`.trim() + setStreamingText(streamingRef.current) } } catch { // buffer too small or not listening, skip @@ -88,7 +88,8 @@ export default function STTScreen() { stopPolling() setStatus('transcribing') const finalText = await STT.stopListening() - setTranscript(finalText || streamingRef.current) + const combined = `${streamingRef.current} ${finalText ?? 
''}`.trim() + setTranscript(combined || streamingRef.current) setStreamingText('') streamingRef.current = '' setStatus('ready') diff --git a/package/ios/Sources/HybridSTT.swift b/package/ios/Sources/HybridSTT.swift index 6d550ac..e99ee19 100644 --- a/package/ios/Sources/HybridSTT.swift +++ b/package/ios/Sources/HybridSTT.swift @@ -132,7 +132,7 @@ class HybridSTT: HybridSTTSpec { guard let manager = captureManager, manager.isCapturing else { throw STTError.notListening } - guard let audio = manager.snapshot() else { + guard let audio = manager.snapshotAndClear() else { return Promise.resolved(withResult: "") } From 60d47e8e8a47dc2606f3a578d6e4b4f25bc409b1 Mon Sep 17 00:00:00 2001 From: Henry Paulino Date: Thu, 23 Apr 2026 21:50:53 +0100 Subject: [PATCH 3/3] feat(stt): switch to Qwen3-ASR and gate silent audio chunks - Add Qwen3-ASR 0.6B 4-bit model and swap HybridSTT to use it - Skip audio chunks below peak amplitude threshold to avoid ASR hallucinations on silence - Simplify example stop flow and surface transcribeBuffer errors --- example/app/(tabs)/stt.tsx | 23 ++++++++----------- package/ios/Sources/AudioCaptureManager.swift | 12 ++++++++++ package/ios/Sources/HybridSTT.swift | 12 +++++----- package/src/models.ts | 14 +++++++++++ 4 files changed, 42 insertions(+), 19 deletions(-) diff --git a/example/app/(tabs)/stt.tsx b/example/app/(tabs)/stt.tsx index 656355f..a716170 100644 --- a/example/app/(tabs)/stt.tsx +++ b/example/app/(tabs)/stt.tsx @@ -12,7 +12,7 @@ import { import { MLXModel, STT } from 'react-native-nitro-mlx' import { SafeAreaView } from 'react-native-safe-area-context' -const MODEL_ID = MLXModel.GLM_ASR_Nano_4bit +const MODEL_ID = MLXModel.Qwen3_ASR_0_6B_4bit type Status = 'idle' | 'loading' | 'ready' | 'listening' | 'transcribing' @@ -71,8 +71,8 @@ export default function STTScreen() { streamingRef.current = `${streamingRef.current} ${text}`.trim() setStreamingText(streamingRef.current) } - } catch { - // 
buffer too small or not listening, skip + } catch (error) { + console.warn('STT transcribeBuffer error:', error) } finally { isTranscribingChunk.current = false } @@ -84,19 +84,16 @@ export default function STTScreen() { const handleToggleListening = useCallback(async () => { if (status === 'listening') { + stopPolling() try { - stopPolling() - setStatus('transcribing') - const finalText = await STT.stopListening() - const combined = `${streamingRef.current} ${finalText ?? ''}`.trim() - setTranscript(combined || streamingRef.current) - setStreamingText('') - streamingRef.current = '' - setStatus('ready') + STT.stop() } catch (error) { - console.error('STT stopListening error:', error) - setStatus('ready') + console.error('STT stop error:', error) } + setTranscript(streamingRef.current) + setStreamingText('') + streamingRef.current = '' + setStatus('ready') } else if (status === 'ready') { setTranscript('') setStreamingText('') diff --git a/package/ios/Sources/AudioCaptureManager.swift b/package/ios/Sources/AudioCaptureManager.swift index 9ed4532..4775f57 100644 --- a/package/ios/Sources/AudioCaptureManager.swift +++ b/package/ios/Sources/AudioCaptureManager.swift @@ -84,6 +84,18 @@ class AudioCaptureManager { bufferLock.unlock() guard samples.count >= 8000 else { return nil } + + // Silence gate: skip chunks whose peak amplitude is near the noise + // floor so the ASR model doesn't hallucinate ("The.", "...") on + // silence. Peak-based because measurement-mode capture disables AGC, + // making RMS of quiet speech close to ambient noise. + var peak: Float = 0 + for s in samples { + let a = s < 0 ? 
-s : s + if a > peak { peak = a } + } + guard peak >= 0.005 else { return nil } + return MLXArray(samples) } diff --git a/package/ios/Sources/HybridSTT.swift b/package/ios/Sources/HybridSTT.swift index e99ee19..e5f35cb 100644 --- a/package/ios/Sources/HybridSTT.swift +++ b/package/ios/Sources/HybridSTT.swift @@ -11,7 +11,7 @@ enum STTError: Error { } class HybridSTT: HybridSTTSpec { - private var model: GLMASRModel? + private var model: Qwen3ASRModel? private var activeTask: Task? private var loadTask: Task? private var captureManager: AudioCaptureManager? @@ -39,7 +39,7 @@ class HybridSTT: HybridSTTSpec { self.model = nil MLX.Memory.clearCache() - let loadedModel = try await GLMASRModel.fromPretrained(modelId) + let loadedModel = try await Qwen3ASRModel.fromPretrained(modelId) try Task.checkCancellation() @@ -62,7 +62,7 @@ class HybridSTT: HybridSTTSpec { return Promise.async { [self] in let task = Task { let mlxAudio = self.arrayBufferToMLXArray(audio) - let output = model.generate(audio: mlxAudio) + let output = model.generate(audio: mlxAudio, language: "English") return output.text } @@ -84,7 +84,7 @@ class HybridSTT: HybridSTTSpec { return Promise.async { [self] in let task = Task { let mlxAudio = self.arrayBufferToMLXArray(audio) - let stream = model.generateStream(audio: mlxAudio) + let stream = model.generateStream(audio: mlxAudio, language: "English") var finalText = "" for try await event in stream { @@ -138,7 +138,7 @@ class HybridSTT: HybridSTTSpec { return Promise.async { [self] in let task = Task { - let output = model.generate(audio: audio) + let output = model.generate(audio: audio, language: "English") return output.text } @@ -164,7 +164,7 @@ class HybridSTT: HybridSTTSpec { return Promise.async { [self] in let task = Task { - let output = model.generate(audio: audio) + let output = model.generate(audio: audio, language: "English") return output.text } diff --git a/package/src/models.ts b/package/src/models.ts index 879e484..f9bcdb5 100644 --- 
a/package/src/models.ts +++ b/package/src/models.ts @@ -7,6 +7,7 @@ export enum ModelFamily { OpenELM = 'OpenELM', PocketTTS = 'PocketTTS', GLMASR = 'GLMASR', + Qwen3ASR = 'Qwen3ASR', } export enum ModelProvider { @@ -86,6 +87,9 @@ export enum MLXModel { // GLM-ASR (GLMASR) - Speech-to-Text GLM_ASR_Nano_4bit = 'mlx-community/GLM-ASR-Nano-2512-4bit', + + // Qwen3-ASR (Alibaba) - Speech-to-Text + Qwen3_ASR_0_6B_4bit = 'mlx-community/Qwen3-ASR-0.6B-4bit', } export const MLXModels: ModelInfo[] = [ @@ -389,4 +393,14 @@ export const MLXModels: ModelInfo[] = [ downloadSize: 600000000, type: 'stt', }, + { + id: MLXModel.Qwen3_ASR_0_6B_4bit, + family: ModelFamily.Qwen3ASR, + provider: ModelProvider.Alibaba, + parameters: '0.6B', + quantization: '4bit', + displayName: 'Qwen3 ASR 0.6B (4-bit)', + downloadSize: 712781278, + type: 'stt', + }, ]