def _load_audio_via_ffmpeg(audio_path: str, sample_rate: int) -> np.ndarray:
    """Transcode *audio_path* to WAV with ffmpeg and read the samples back.

    Intended for container formats produced by phones (webm, m4a, ...) that
    the other loaders may not be able to open directly.

    Args:
        audio_path: Path of the input audio file handed to ``ffmpeg -i``.
        sample_rate: Target sample rate passed to ffmpeg via ``-ar``.

    Returns:
        A one-dimensional ``float32`` array. ffmpeg is asked for a single
        channel (``-ac 1``); if the decoded data is still multi-dimensional
        it is averaged across axis 1.

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status. The message is
            the last 500 characters of its stderr when available.
        FileNotFoundError: the ``ffmpeg`` executable could not be started.
        subprocess.TimeoutExpired: ffmpeg ran longer than 120 seconds.
        ImportError: ``soundfile`` is not installed.
    """
    import subprocess
    import tempfile

    import soundfile as sf

    # A private temporary directory replaces tempfile.mktemp(): mktemp() only
    # returns a *name*, so another process could create that path before
    # ffmpeg writes to it. The directory is removed when the block exits,
    # including on error.
    with tempfile.TemporaryDirectory() as scratch_dir:
        wav_path = Path(scratch_dir) / "converted.wav"
        command = [
            "ffmpeg",
            "-nostdin",  # never let ffmpeg consume the service's stdin
            "-y",
            "-i",
            audio_path,
            "-ac",
            "1",
            "-ar",
            str(sample_rate),
            "-f",
            "wav",
            str(wav_path),
        ]
        completed = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            # ffmpeg may echo bytes that are not valid in the locale encoding
            # (e.g. file names); do not let decoding mask the real failure.
            errors="replace",
            timeout=120,
        )
        if completed.returncode != 0:
            raise RuntimeError(
                completed.stderr[-500:] if completed.stderr else "ffmpeg 转码失败"
            )

        # Must be read before the temporary directory is cleaned up.
        samples, _ = sf.read(str(wav_path), dtype="float32", always_2d=False)

    if isinstance(samples, np.ndarray) and samples.ndim > 1:
        samples = samples.mean(axis=1)
    return np.asarray(samples, dtype=np.float32)


def _load_audio_for_chattts(audio_path: str, sample_rate: int = TTS_SAMPLE_RATE) -> np.ndarray:
    """Load an audio file resampled to the rate ChatTTS expects.

    Loaders are tried in order and the first one that succeeds wins:

    1. ``ChatTTS.utils.load_audio``
    2. ``tools.audio.load_audio``
    3. ffmpeg transcoding (``_load_audio_via_ffmpeg``)
    4. ``librosa.load`` as the last resort

    Args:
        audio_path: Path of the audio file to load.
        sample_rate: Target sample rate; defaults to ``TTS_SAMPLE_RATE``.

    Returns:
        Whatever the first successful loader returns for the audio samples.

    Raises:
        RuntimeError: every loader failed. The message lists each loader's
            failure, and the last underlying exception is chained as
            ``__cause__``.
    """

    def _via_chattts_utils():
        from ChatTTS.utils import load_audio

        return load_audio(audio_path, sample_rate)

    def _via_tools_audio():
        from tools.audio import load_audio

        return load_audio(audio_path, sample_rate)

    def _via_ffmpeg():
        return _load_audio_via_ffmpeg(audio_path, sample_rate)

    def _via_librosa():
        import warnings

        import librosa

        # Silence the FutureWarning / "PySoundFile failed" noise librosa
        # emits while falling back between its own decoders.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=FutureWarning)
            warnings.filterwarnings("ignore", message="PySoundFile failed")
            samples, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
        return samples

    fallback_chain = (
        ("ChatTTS.utils", _via_chattts_utils),
        ("tools.audio", _via_tools_audio),
        ("ffmpeg", _via_ffmpeg),
        ("librosa", _via_librosa),
    )

    failures: list[str] = []
    last_error: Optional[BaseException] = None
    for label, loader in fallback_chain:
        try:
            return loader()
        except Exception as exc:
            # Deliberately broad: a missing module and a decode failure are
            # both reasons to move on to the next loader. Nothing is
            # swallowed silently; each failure is reported below.
            failures.append(f"{label}: {exc}")
            last_error = exc

    # Report every attempt. Truncating to the last three entries would always
    # hide the first loader's failure.
    raise RuntimeError(
        "无法读取音频文件,请上传 wav/mp3/m4a 或确认已安装 ffmpeg。\n"
        + "\n".join(failures)
    ) from last_error