Skip to content

paraformer-large的onnx无法转译出结果 #2830

@wen5280

Description

@wen5280

cpu环境,M2

同一个音频
在 PyTorch 下执行有文字输出,但在 ONNX 下执行输出为空。
PyTorch 代码:

from funasr import AutoModel

# PyTorch reproduction: the same audio file yields recognized text here.

# Model identifier handed to AutoModel.
paramformer_path="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# Audio file under test; the ONNX reproduction uses the same file name.
wavPath="test.wav"
# Build the model on CPU, pinned to revision v2.0.4.
paramformModel = AutoModel(
        model=paramformer_path,
        disable_update= False,
        model_revision="v2.0.4",
        device="cpu"
    )

# Run recognition on the test file. The return value is not captured here;
# the report quotes the resulting text separately.
paramformModel.generate(wavPath,
           language="auto",
           hotword="阿里巴巴",
           batch_size_s=300)

输出结果:res:[{'key': 'A132916050_1260_3300', 'text': '喂喂喂听到吗'}]

onnx代码:

from funasr_onnx import Paraformer

# ONNX reproduction: same model identifier and audio file as the PyTorch run,
# but the reported result is an empty list.
paramformer_path="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# Build the funasr_onnx recognizer with batch size 1 and quantization disabled.
paramformModel = Paraformer(paramformer_path, batch_size=1, quantize=False)
wavPath="test.wav"
# Also tried: paramformModel([wavPath]) and paramformModel(wavPath,hotword="")
paramformModel(wavPath)

输出结果res:[]

导出onnx的代码:

from funasr import AutoModel

# Export script used to produce the ONNX model from the PyTorch checkpoint.

# Model identifier; it is also reused below as the export output directory.
model_path = "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

# 2. Core: build the model on CPU.
model = AutoModel(
    model=model_path,
    device="cpu",
)

# 3. Export to ONNX (no quantization, opset 14).
model.export(
    output_dir=model_path,
    type="onnx",
    quantize=False,
    opset=14
)

已排除音频本身的问题,ONNX 模型可以正常初始化。
以下是 Gemini 给出的解决方案,但实际无法运行:

import os
import json
import numpy as np
import onnxruntime as ort
import librosa
import time
from numpy.lib.stride_tricks import as_strided


class HighPerformanceASR:
    """Thin ONNX Runtime wrapper that runs an exported Paraformer-style model.

    The pipeline is: load audio -> log-mel filterbank -> CMVN / LFR frame
    stacking -> ONNX inference -> greedy (argmax) decoding.

    NOTE(review): the filterbank is computed with librosa, which presumably
    does not match the Kaldi-style front end the model was trained with
    (window, mel scale, waveform scaling), and the LFR step here does no
    edge padding. Recognition quality should be validated against the
    reference front end before relying on the output.
    """

    def __init__(self, model_dir, threads=4):
        """Load the vocabulary, CMVN statistics and the ONNX session.

        Args:
            model_dir: Directory containing ``model.onnx``, ``tokens.json``
                and ``am.mvn``.
            threads: Number of intra-op threads given to ONNX Runtime.

        Raises:
            ValueError: If ``am.mvn`` looks like a Kaldi text file but lacks
                the ``<AddShift>`` / ``<Rescale>`` value rows.
        """
        # 1. Resolve file locations.
        self.onnx_path = os.path.join(model_dir, "model.onnx")
        self.tokens_path = os.path.join(model_dir, "tokens.json")
        self.am_mvn_path = os.path.join(model_dir, "am.mvn")

        # 2. Load the vocabulary once, outside of the timed inference path.
        with open(self.tokens_path, "r", encoding="utf-8") as f:
            self.token_list = json.load(f)

        # 3. Load CMVN statistics (mean and inverse standard deviation).
        self.means, self.istd = self._load_cmvn(self.am_mvn_path)

        # 4. ONNX Runtime tuning.
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.intra_op_num_threads = threads  # threads used inside one operator
        sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

        # On macOS the CoreML provider could be enabled instead if needed:
        # providers = [('CoreMLExecutionProvider', {'MLComputeUnits': 'ALL'}), 'CPUExecutionProvider']
        self.session = ort.InferenceSession(self.onnx_path, sess_options=sess_options,
                                            providers=['CPUExecutionProvider'])
        self.input_names = [i.name for i in self.session.get_inputs()]

    @staticmethod
    def _parse_kaldi_cmvn(text):
        """Extract the ``<AddShift>`` and ``<Rescale>`` vectors from Kaldi nnet text.

        Each marker line is expected to be followed by a row shaped like
        ``<LearnRateCoef> 0 [ v1 v2 ... vN ]``.

        Returns:
            Tuple ``(shift, scale)`` of float64 arrays.

        Raises:
            ValueError: If either vector is missing.
        """
        lines = text.splitlines()
        found = {}
        for idx, line in enumerate(lines[:-1]):
            fields = line.split()
            if not fields or fields[0] not in ("<AddShift>", "<Rescale>"):
                continue
            payload = lines[idx + 1].split()
            if payload and payload[0] == "<LearnRateCoef>":
                # Skip "<LearnRateCoef>", the coefficient and "[", drop the closing "]".
                found[fields[0]] = np.array(
                    [float(v) for v in payload[3:-1]], dtype=np.float64
                )
        if "<AddShift>" not in found or "<Rescale>" not in found:
            raise ValueError("CMVN file is missing <AddShift> or <Rescale> values")
        return found["<AddShift>"], found["<Rescale>"]

    @staticmethod
    def _load_cmvn(path):
        """Read CMVN statistics and return ``(means, istd)`` as float32 arrays.

        Two on-disk formats are accepted:

        * Kaldi nnet text containing ``<AddShift>`` / ``<Rescale>`` rows. The
          shift row stores the *negated* mean, so it is negated here to keep
          ``(x - means) * istd`` valid.
        * A pickled pair ``(means, istd)``, which is what the previous
          implementation expected.
        """
        with open(path, "rb") as f:
            raw = f.read()

        if b"<AddShift>" in raw:
            shift, scale = HighPerformanceASR._parse_kaldi_cmvn(raw.decode("utf-8"))
            return (-shift).astype(np.float32), scale.astype(np.float32)

        # SECURITY: unpickling can execute arbitrary code. This fallback is kept
        # only for backward compatibility and must be used with trusted files.
        import pickle
        cmvn = pickle.loads(raw)
        return cmvn[0].astype(np.float32), cmvn[1].astype(np.float32)

    def _apply_lfr_vectorized(self, fbank, m=7, n=6):
        """Stack ``m`` consecutive frames every ``n`` frames (low frame rate).

        The windows are gathered through a strided view instead of a Python
        loop; the result is then copied so it never aliases ``fbank``.

        Args:
            fbank: 2-D array of shape ``(frames, dims)``.
            m: Number of frames concatenated into one output frame.
            n: Hop, in input frames, between consecutive output frames.

        Returns:
            Array of shape ``(T, m * dims)`` with ``T = (frames - m) // n + 1``,
            or ``None`` when there are too few frames for a single window.
        """
        L, D = fbank.shape
        T = (L - m) // n + 1
        if T <= 0:
            return None

        row_stride, col_stride = fbank.strides
        windows = as_strided(
            fbank,
            shape=(T, m, D),
            strides=(row_stride * n, row_stride, col_stride),
            writeable=False,  # overlapping windows share memory; never write through them
        )
        # Copy so the caller gets an independent, writable array.
        return windows.reshape(T, -1).copy()

    def predict(self, wav_path):
        """Transcribe one audio file.

        Args:
            wav_path: Path of the audio file; it is resampled to 16 kHz on load.

        Returns:
            Tuple ``(text, seconds)``. ``("", 0)`` is returned when the audio
            is too short to form a single LFR frame.
        """
        start_time = time.perf_counter()

        # A. Front end: log-mel filterbank, shape (frames, 80).
        y, sr = librosa.load(wav_path, sr=16000)
        fbank = librosa.feature.melspectrogram(
            y=y, sr=sr, n_mels=80, n_fft=400, hop_length=160, win_length=400, center=False
        ).T
        fbank = np.log(fbank + 1e-6).astype(np.float32)

        # B. CMVN. Statistics matching the raw filterbank width are applied
        #    before stacking (previous behaviour); otherwise they are taken to
        #    describe the stacked features and are applied after LFR.
        cmvn_before_lfr = self.means.shape[-1] == fbank.shape[1]
        if cmvn_before_lfr:
            fbank = (fbank - self.means) * self.istd

        # C. LFR frame stacking.
        feat = self._apply_lfr_vectorized(fbank)
        if feat is None:
            return "", 0

        if not cmvn_before_lfr:
            feat = (feat - self.means) * self.istd

        feat = np.expand_dims(feat, axis=0)
        feat_len = np.array([feat.shape[1]], dtype=np.int32)

        # D. Supply a zero bias embedding when the graph declares that input.
        inputs = {"speech": feat, "speech_lengths": feat_len}
        if "bias_embed" in self.input_names:
            inputs["bias_embed"] = np.zeros((1, 1, 512), dtype=np.float32)

        # E. ONNX inference.
        outputs = self.session.run(None, inputs)

        # F. Greedy decoding: argmax per step, drop ids 0-2, strip "@@" markers.
        token_ids = np.argmax(outputs[0], axis=-1)[0]
        res_text = "".join([self.token_list[tid].replace("@@", "") for tid in token_ids if tid > 2])

        total_time = time.perf_counter() - start_time
        return res_text, total_time

def _run_demo(model_dir, wav_file):
    """Build the engine, run one warm-up pass, then print a timed transcription."""
    engine = HighPerformanceASR(model_dir, threads=4)

    # Warm-up pass; its result is intentionally discarded.
    engine.predict(wav_file)

    # Timed pass.
    text, duration = engine.predict(wav_file)
    print(f"识别结果: {text}")
    print(f"推理耗时: {duration:.4f} 秒")


if __name__ == "__main__":
    MODEL_DIR = "iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
    WAV_FILE = "test.wav"
    _run_demo(MODEL_DIR, WAV_FILE)

Metadata

Metadata

Assignees

No one assigned

    Labels

    question — Further information is requested

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions