From fb21fd2f1a4259a1742fb72c122ba839a906e87f Mon Sep 17 00:00:00 2001 From: jasagiri Date: Sun, 5 Apr 2026 22:42:47 +0900 Subject: [PATCH 1/2] feat: add Apple Silicon (MPS) support for macOS ARM64 Introduce a device abstraction layer (cosyvoice/utils/device.py) that unifies CUDA, MPS, and CPU device management. Replace all hardcoded CUDA-specific code paths in the inference pipeline with device-agnostic alternatives, enabling CosyVoice to run natively on Apple Silicon Macs. Key changes: - Device abstraction: get_device(), get_stream_context(), get_autocast_context(), empty_cache() - model.py: Replace CUDA device init, streams, AMP, and cache clearing across CosyVoiceModel, CosyVoice2Model, CosyVoice3Model - cosyvoice.py: MPS-aware feature gates (TRT/vLLM require CUDA, JIT/fp16 require any GPU) - frontend.py: CoreMLExecutionProvider support for ONNX Runtime - common.py: Guard torch.cuda.manual_seed_all for non-CUDA environments - requirements.txt: Remove CUDA-only index URLs, loosen PyTorch version - setup_macos.sh: One-command setup script for Apple Silicon Co-Authored-By: Claude Opus 4.6 --- README.md | 26 +++++++++++++ cosyvoice/cli/cosyvoice.py | 32 ++++++++++----- cosyvoice/cli/frontend.py | 12 ++++-- cosyvoice/cli/model.py | 29 +++++++------- cosyvoice/utils/common.py | 3 +- cosyvoice/utils/device.py | 80 ++++++++++++++++++++++++++++++++++++++ requirements-cuda.txt | 5 +++ requirements.txt | 6 +-- setup_macos.sh | 51 ++++++++++++++++++++++++ 9 files changed, 211 insertions(+), 33 deletions(-) create mode 100644 cosyvoice/utils/device.py create mode 100644 requirements-cuda.txt create mode 100755 setup_macos.sh diff --git a/README.md b/README.md index 6308ac2cd..5f6173cb0 100644 --- a/README.md +++ b/README.md @@ -108,6 +108,32 @@ sudo yum install sox sox-devel ``` +### macOS Apple Silicon (M1/M2/M3/M4) + +For Apple Silicon Macs, use the dedicated setup script: + +``` sh +git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git +cd CosyVoice +bash setup_macos.sh +``` + +Or manually: + +``` sh +conda create -n cosyvoice -y python=3.10 +conda activate cosyvoice +conda install -c conda-forge pynini==2.1.5 -y +pip install torch torchaudio +pip install -r requirements.txt +``` + +**Apple Silicon notes:** +- Inference runs on MPS (Metal Performance Shaders) — faster than CPU +- TensorRT and vLLM are not available (CUDA-only) +- Training with DeepSpeed/DDP is not supported +- For CUDA environments (Linux), use `pip install -r requirements-cuda.txt` instead + ### Model download We strongly recommend that you download our pretrained `Fun-CosyVoice3-0.5B` `CosyVoice2-0.5B` `CosyVoice-300M` `CosyVoice-300M-SFT` `CosyVoice-300M-Instruct` model and `CosyVoice-ttsfrd` resource. diff --git a/cosyvoice/cli/cosyvoice.py b/cosyvoice/cli/cosyvoice.py index 7ab04a70f..03aa2bdc3 100644 --- a/cosyvoice/cli/cosyvoice.py +++ b/cosyvoice/cli/cosyvoice.py @@ -22,6 +22,7 @@ from cosyvoice.cli.model import CosyVoiceModel, CosyVoice2Model, CosyVoice3Model from cosyvoice.utils.file_utils import logging from cosyvoice.utils.class_utils import get_model_type +from cosyvoice.utils.device import is_cuda, is_gpu_available class CosyVoice: @@ -44,9 +45,12 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, fp16=False, trt_co '{}/spk2info.pt'.format(model_dir), configs['allowed_special']) self.sample_rate = configs['sample_rate'] - if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or fp16 is True): - load_jit, load_trt, fp16 = False, False, False - logging.warning('no cuda device, set load_jit/load_trt/fp16 to False') + if not is_cuda() and load_trt: + load_trt = False + logging.warning('TensorRT requires CUDA, disabling load_trt') + if not is_gpu_available() and (load_jit or fp16): + load_jit, fp16 = False, False + logging.warning('no GPU device, disabling load_jit/fp16') self.model = CosyVoiceModel(configs['llm'], configs['flow'], configs['hift'], fp16) self.model.load('{}/llm.pt'.format(model_dir), '{}/flow.pt'.format(model_dir), @@ -156,9 +160,16 @@ def __init__(self, model_dir, load_jit=False, load_trt=False, load_vllm=False, f '{}/spk2info.pt'.format(model_dir), configs['allowed_special']) self.sample_rate = configs['sample_rate'] - if torch.cuda.is_available() is False and (load_jit is True or load_trt is True or load_vllm is True or fp16 is True): - load_jit, load_trt, load_vllm, fp16 = False, False, False, False - logging.warning('no cuda device, set load_jit/load_trt/load_vllm/fp16 to False') + if not is_cuda(): + if load_trt: + load_trt = False + logging.warning('TensorRT requires CUDA, disabling load_trt') + if load_vllm: + load_vllm = False + logging.warning('vLLM requires CUDA, disabling load_vllm') + if not is_gpu_available() and (load_jit or fp16): + load_jit, fp16 = False, False + logging.warning('no GPU device, disabling load_jit/fp16') self.model = CosyVoice2Model(configs['llm'], configs['flow'], configs['hift'], fp16) self.model.load('{}/llm.pt'.format(model_dir), '{}/flow.pt'.format(model_dir), @@ -206,9 +217,12 @@ def __init__(self, model_dir, load_trt=False, load_vllm=False, fp16=False, trt_c '{}/spk2info.pt'.format(model_dir), configs['allowed_special']) self.sample_rate = configs['sample_rate'] - if torch.cuda.is_available() is False and (load_trt is True or fp16 is True): - load_trt, fp16 = False, False - logging.warning('no cuda device, set load_trt/fp16 to False') + if not is_cuda() and load_trt: + load_trt = False + logging.warning('TensorRT requires CUDA, disabling load_trt') + if not is_gpu_available() and fp16: + fp16 = False + logging.warning('no GPU device, disabling fp16') self.model = CosyVoice3Model(configs['llm'], configs['flow'], configs['hift'], fp16) self.model.load('{}/llm.pt'.format(model_dir), '{}/flow.pt'.format(model_dir), diff --git a/cosyvoice/cli/frontend.py b/cosyvoice/cli/frontend.py index 6d397cc99..f799c5055 100644 --- a/cosyvoice/cli/frontend.py +++ b/cosyvoice/cli/frontend.py @@ -25,6 +25,7 @@ import inflect from cosyvoice.utils.file_utils import logging, load_wav from cosyvoice.utils.frontend_utils import contains_chinese, replace_blank, replace_corner_mark, remove_bracket, spell_out_number, split_paragraph, is_only_punctuation +from cosyvoice.utils.device import get_device class CosyVoiceFrontEnd: @@ -38,14 +39,19 @@ def __init__(self, allowed_special: str = 'all'): self.tokenizer = get_tokenizer() self.feat_extractor = feat_extractor - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.device = get_device() option = onnxruntime.SessionOptions() option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL option.intra_op_num_threads = 1 self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"]) + if torch.cuda.is_available(): + tokenizer_providers = ["CUDAExecutionProvider"] + elif "CoreMLExecutionProvider" in onnxruntime.get_available_providers(): + tokenizer_providers = ["CoreMLExecutionProvider"] + else: + tokenizer_providers = ["CPUExecutionProvider"] self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option, - providers=["CUDAExecutionProvider" if torch.cuda.is_available() else - "CPUExecutionProvider"]) + providers=tokenizer_providers) if os.path.exists(spk2info): self.spk2info = torch.load(spk2info, map_location=self.device, weights_only=True) else: diff --git a/cosyvoice/cli/model.py b/cosyvoice/cli/model.py index 92a15d985..60086d4c0 100644 --- a/cosyvoice/cli/model.py +++ b/cosyvoice/cli/model.py @@ -24,6 +24,7 @@ from cosyvoice.utils.common import fade_in_out from cosyvoice.utils.file_utils import convert_onnx_to_trt, export_cosyvoice2_vllm from cosyvoice.utils.common import TrtContextWrapper +from cosyvoice.utils.device import get_device, get_stream_context, get_autocast_context, empty_cache class CosyVoiceModel: @@ -33,7 +34,7 @@ def __init__(self, flow: torch.nn.Module, hift: torch.nn.Module, fp16: bool = False): - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.device = get_device() self.llm = llm self.flow = flow self.hift = hift @@ -52,7 +53,7 @@ def __init__(self, # rtf and decoding related self.stream_scale_factor = 1 assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf' - self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() + self.llm_context = get_stream_context(self.device) self.lock = threading.Lock() # dict used to store session related variable self.tts_speech_token_dict = {} @@ -100,7 +101,7 @@ def get_trt_kwargs(self): def llm_job(self, text, prompt_text, llm_prompt_speech_token, llm_embedding, uuid): cur_silent_token_num, max_silent_token_num = 0, 5 - with self.llm_context, torch.cuda.amp.autocast(self.fp16 is True and hasattr(self.llm, 'vllm') is False): + with self.llm_context, get_autocast_context(self.fp16 is True and hasattr(self.llm, 'vllm') is False, self.device): if isinstance(text, Generator): assert (self.__class__.__name__ != 'CosyVoiceModel') and not hasattr(self.llm, 'vllm'), 'streaming input text is only implemented for CosyVoice2/3 and do not support vllm!' token_generator = self.llm.inference_bistream(text=text, @@ -133,7 +134,7 @@ def vc_job(self, source_speech_token, uuid): self.llm_end_dict[uuid] = True def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=False, speed=1.0): - with torch.cuda.amp.autocast(self.fp16): + with get_autocast_context(self.fp16, self.device): tts_mel, self.flow_cache_dict[uuid] = self.flow.inference(token=token.to(self.device, dtype=torch.int32), token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), prompt_token=prompt_token.to(self.device), @@ -237,9 +238,7 @@ def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.ze self.mel_overlap_dict.pop(this_uuid) self.hift_cache_dict.pop(this_uuid) self.flow_cache_dict.pop(this_uuid) - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.current_stream().synchronize() + empty_cache(self.device) class CosyVoice2Model(CosyVoiceModel): @@ -249,7 +248,7 @@ def __init__(self, flow: torch.nn.Module, hift: torch.nn.Module, fp16: bool = False): - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.device = get_device() self.llm = llm self.flow = flow self.hift = hift @@ -266,7 +265,7 @@ def __init__(self, # speech fade in out self.speech_window = np.hamming(2 * self.source_cache_len) # rtf and decoding related - self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() + self.llm_context = get_stream_context(self.device) self.lock = threading.Lock() # dict used to store session related variable self.tts_speech_token_dict = {} @@ -290,7 +289,7 @@ def load_vllm(self, model_dir): del self.llm.llm.model.model.layers def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0): - with torch.cuda.amp.autocast(self.fp16): + with get_autocast_context(self.fp16, self.device): tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32), token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), prompt_token=prompt_token.to(self.device), @@ -389,9 +388,7 @@ def tts(self, text=torch.zeros(1, 0, dtype=torch.int32), flow_embedding=torch.ze self.tts_speech_token_dict.pop(this_uuid) self.llm_end_dict.pop(this_uuid) self.hift_cache_dict.pop(this_uuid) - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.current_stream().synchronize() + empty_cache(self.device) class CosyVoice3Model(CosyVoice2Model): @@ -401,7 +398,7 @@ def __init__(self, flow: torch.nn.Module, hift: torch.nn.Module, fp16: bool = False): - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.device = get_device() self.llm = llm self.flow = flow self.hift = hift @@ -413,7 +410,7 @@ def __init__(self, self.stream_scale_factor = 2 assert self.stream_scale_factor >= 1, 'stream_scale_factor should be greater than 1, change it according to your actual rtf' # rtf and decoding related - self.llm_context = torch.cuda.stream(torch.cuda.Stream(self.device)) if torch.cuda.is_available() else nullcontext() + self.llm_context = get_stream_context(self.device) self.lock = threading.Lock() # dict used to store session related variable self.tts_speech_token_dict = {} @@ -423,7 +420,7 @@ def __init__(self, self.silent_tokens = [1, 2, 28, 29, 55, 248, 494, 2241, 2242, 2322, 2323] def token2wav(self, token, prompt_token, prompt_feat, embedding, token_offset, uuid, stream=False, finalize=False, speed=1.0): - with torch.cuda.amp.autocast(self.fp16): + with get_autocast_context(self.fp16, self.device): tts_mel, _ = self.flow.inference(token=token.to(self.device, dtype=torch.int32), token_len=torch.tensor([token.shape[1]], dtype=torch.int32).to(self.device), prompt_token=prompt_token.to(self.device), diff --git a/cosyvoice/utils/common.py b/cosyvoice/utils/common.py index 3f235a62e..7aaa3de6c 100644 --- a/cosyvoice/utils/common.py +++ b/cosyvoice/utils/common.py @@ -182,7 +182,8 @@ def set_all_random_seed(seed): random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: diff --git a/cosyvoice/utils/device.py b/cosyvoice/utils/device.py new file mode 100644 index 000000000..13e76d75c --- /dev/null +++ b/cosyvoice/utils/device.py @@ -0,0 +1,80 @@ +# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unified device management for CUDA, MPS (Apple Silicon), and CPU backends.""" + +import random +from contextlib import nullcontext + +import numpy as np +import torch + + +def get_device() -> torch.device: + """Return the best available device: cuda > mps > cpu.""" + if torch.cuda.is_available(): + return torch.device('cuda') + if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): + return torch.device('mps') + return torch.device('cpu') + + +def is_cuda() -> bool: + return torch.cuda.is_available() + + +def is_mps() -> bool: + return hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() + + +def is_gpu_available() -> bool: + return is_cuda() or is_mps() + + +def get_stream_context(device: torch.device): + """Return a CUDA stream context or nullcontext for non-CUDA devices.""" + if device.type == 'cuda': + return torch.cuda.stream(torch.cuda.Stream(device)) + return nullcontext() + + +def get_autocast_context(enabled: bool, device: torch.device): + """Return the appropriate autocast context for the device.""" + if not enabled: + return nullcontext() + if device.type == 'cuda': + return torch.cuda.amp.autocast(enabled=True) + if device.type == 'mps': + return torch.autocast(device_type='mps', dtype=torch.float16) + return nullcontext() + + +def empty_cache(device: torch.device): + """Clear device cache and synchronize.""" + if device.type == 'cuda': + torch.cuda.empty_cache() + torch.cuda.current_stream().synchronize() + elif device.type == 'mps': + if hasattr(torch.mps, 'empty_cache'): + torch.mps.empty_cache() + if hasattr(torch.mps, 'synchronize'): + torch.mps.synchronize() + + +def set_all_random_seed(seed: int): + """Set random seed across all available backends.""" + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) diff --git a/requirements-cuda.txt b/requirements-cuda.txt new file mode 100644 index 000000000..102ad6f94 --- /dev/null +++ b/requirements-cuda.txt @@ -0,0 +1,5 @@ +# CUDA-specific requirements (Linux with NVIDIA GPU) +# Install with: pip install -r requirements-cuda.txt +--extra-index-url https://download.pytorch.org/whl/cu121 +--extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ +-r requirements.txt diff --git a/requirements.txt b/requirements.txt index 989bccf61..b929b70fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,3 @@ ---extra-index-url https://download.pytorch.org/whl/cu121 ---extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684 conformer==0.3.2 deepspeed==0.15.1; sys_platform == 'linux' diffusers==0.29.0 @@ -33,8 +31,8 @@ tensorboard==2.14.0 tensorrt-cu12==10.13.3.9; sys_platform == 'linux' tensorrt-cu12-bindings==10.13.3.9; sys_platform == 'linux' tensorrt-cu12-libs==10.13.3.9; sys_platform == 'linux' -torch==2.3.1 -torchaudio==2.3.1 +torch>=2.3.1 +torchaudio>=2.3.1 transformers==4.51.3 x-transformers==2.11.24 uvicorn==0.30.0 diff --git a/setup_macos.sh b/setup_macos.sh new file mode 100755 index 000000000..696323548 --- /dev/null +++ b/setup_macos.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Setup script for macOS Apple Silicon (M1/M2/M3/M4) +# Usage: bash setup_macos.sh [env_name] +set -euo pipefail + +echo "=== CosyVoice macOS Apple Silicon Setup ===" + +ARCH=$(uname -m) +if [ "$ARCH" != "arm64" ]; then + echo "Warning: This script is designed for Apple Silicon (arm64), detected: $ARCH" +fi + +if ! command -v conda &> /dev/null; then + echo "Error: conda not found. Please install Miniforge or Miniconda first." + echo " brew install miniforge" + exit 1 +fi + +ENV_NAME="${1:-cosyvoice}" + +echo "Creating conda environment: $ENV_NAME" +conda create -n "$ENV_NAME" python=3.10 -y +eval "$(conda shell.bash hook)" +conda activate "$ENV_NAME" + +echo "Installing pynini via conda-forge..." +conda install -c conda-forge pynini==2.1.5 -y + +echo "Installing PyTorch with MPS support..." +pip install torch torchaudio + +echo "Installing remaining dependencies..." +pip install -r requirements.txt + +if [ -f .gitmodules ]; then + echo "Initializing git submodules..." + git submodule update --init --recursive +fi + +echo "" +echo "=== Setup complete! ===" +echo "Activate with: conda activate $ENV_NAME" +echo "" +echo "Verify MPS device:" +echo " python -c \"import torch; print('MPS available:', torch.backends.mps.is_available())\"" +echo "" +echo "Notes:" +echo " - TensorRT is not available on Apple Silicon (CUDA-only)" +echo " - vLLM is not available on Apple Silicon (CUDA-only)" +echo " - Training (DeepSpeed/DDP) is not supported on MPS" +echo " - Inference uses MPS acceleration (faster than CPU)" From 0cb2c1e3334c177b62685e8dbef687b971b85687 Mon Sep 17 00:00:00 2001 From: jasagiri <172246+jasagiri@users.noreply.github.com> Date: Sat, 16 May 2026 17:25:03 +0900 Subject: [PATCH 2/2] fix(mps): run f0_predictor on CPU for CausalHiFTGenerator on Apple Silicon MPS does not support float64 (Apple Silicon hardware limitation), causing CausalHiFTGenerator.inference to fail on M-series Macs. Following the intent of the existing inline comment, move f0_predictor and its input to CPU for this precision-sensitive step, then bring the result back to the original device. Device move and dtype cast are done as two separate .to() calls: a combined .to(device, dtype) attempts the float64 cast while the tensor is still on MPS, which raises TypeError. This preserves the float64 precision the causal inference path requires, matching the CUDA behavior. Verified on Apple Silicon (MPS) with Fun-CosyVoice3-0.5B streaming zero-shot inference. Reported by @tedbeer in #1869. Co-Authored-By: Claude Opus 4.7 --- cosyvoice/hifigan/generator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cosyvoice/hifigan/generator.py b/cosyvoice/hifigan/generator.py index bbc2a2112..ad991039f 100644 --- a/cosyvoice/hifigan/generator.py +++ b/cosyvoice/hifigan/generator.py @@ -713,8 +713,15 @@ def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0), finali @torch.inference_mode() def inference(self, speech_feat: torch.Tensor, finalize: bool = True) -> torch.Tensor: # mel->f0 NOTE f0_predictor precision is crucial for causal inference, move self.f0_predictor to cpu if necessary - self.f0_predictor.to(torch.float64) - f0 = self.f0_predictor(speech_feat.to(torch.float64), finalize=finalize).to(speech_feat) + # MPS cannot do float64 (Apple Silicon hardware limitation), so run f0_predictor on CPU and move the result back. + # Device move and dtype cast must be two separate steps: a combined .to(device, dtype) attempts the + # float64 cast on the still-MPS tensor before the device move, which MPS rejects. + if speech_feat.device.type == "mps": + self.f0_predictor.to("cpu").to(torch.float64) + f0 = self.f0_predictor(speech_feat.to("cpu").to(torch.float64), finalize=finalize).to(speech_feat) + else: + self.f0_predictor.to(torch.float64) + f0 = self.f0_predictor(speech_feat.to(torch.float64), finalize=finalize).to(speech_feat) # f0->source s = self.f0_upsamp(f0[:, None]).transpose(1, 2) # bs,n,t s, _, _ = self.m_source(s)