From 39e29fe6a909fb61c4e1f2833546fed3310a7b87 Mon Sep 17 00:00:00 2001
From: dekun <dekun@local>
Date: Fri, 12 Jun 2026 16:05:55 +0800
Subject: [PATCH] Load mobile audio via ffmpeg to avoid librosa PySoundFile
 warnings.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 tts_service.py | 69 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 8 deletions(-)

diff --git a/tts_service.py b/tts_service.py
index bef1654..df84e8e 100644
--- a/tts_service.py
+++ b/tts_service.py
@@ -10,6 +10,7 @@ import logging
 import os
 import traceback
 import uuid
+import warnings
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
@@ -179,29 +180,81 @@ def get_chattts_instance():
         return None, _chat_error
 
 
+def _load_audio_via_ffmpeg(audio_path: str, sample_rate: int) -> np.ndarray:
+    """Transcode audio (e.g. phone webm/m4a) to mono WAV via ffmpeg; return float32 samples."""
+    import subprocess
+    import tempfile
+
+    import soundfile as sf
+
+    # A private temp dir replaces the deprecated, race-prone tempfile.mktemp().
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_path = str(Path(tmp_dir) / "audio.wav")
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-i",
+            audio_path,
+            "-ac",
+            "1",
+            "-ar",
+            str(sample_rate),
+            "-f",
+            "wav",
+            tmp_path,
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
+        if result.returncode != 0:
+            raise RuntimeError(result.stderr[-500:] if result.stderr else "ffmpeg 转码失败")
+
+        audio, _ = sf.read(tmp_path, dtype="float32", always_2d=False)
+        # Defensive downmix in case a multi-channel array comes back despite "-ac 1".
+        if isinstance(audio, np.ndarray) and audio.ndim > 1:
+            audio = audio.mean(axis=1)
+        return np.asarray(audio, dtype=np.float32)
+
+
 def _load_audio_for_chattts(audio_path: str, sample_rate: int = TTS_SAMPLE_RATE) -> np.ndarray:
     """
     加载音频并重采样到 ChatTTS 所需采样率。
-    优先使用 ChatTTS 自带工具，回退到 librosa。
+    Loader order: ChatTTS utilities -> ffmpeg transcoding -> librosa as the last resort.
     """
+    errors: list[str] = []  # one message per loader that failed, in attempt order
+
     try:
         from ChatTTS.utils import load_audio
 
         return load_audio(audio_path, sample_rate)
-    except ImportError:
-        pass
+    except Exception as exc:  # any failure, not only ImportError, moves on to the next loader
+        errors.append(f"ChatTTS.utils: {exc}")
 
     try:
         from tools.audio import load_audio
 
         return load_audio(audio_path, sample_rate)
-    except ImportError:
-        pass
+    except Exception as exc:  # same fall-through policy as above
+        errors.append(f"tools.audio: {exc}")
 
-    import librosa
+    try:
+        return _load_audio_via_ffmpeg(audio_path, sample_rate)
+    except Exception as exc:  # covers a missing ffmpeg binary, a timeout, or a transcode error
+        errors.append(f"ffmpeg: {exc}")
 
-    audio, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
-    return audio
+    try:
+        import librosa
+
+        with warnings.catch_warnings():  # filter changes are undone when the block exits
+            warnings.filterwarnings("ignore", category=FutureWarning)
+            warnings.filterwarnings("ignore", message="PySoundFile failed")
+            audio, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
+        return audio
+    except Exception as exc:  # last loader; the collected errors are raised below
+        errors.append(f"librosa: {exc}")
+
+    raise RuntimeError(
+        "无法读取音频文件，请上传 wav/mp3/m4a 或确认已安装 ffmpeg。\n"
+        + "\n".join(errors[-3:])  # only the three most recent failures are reported
+    )
 
 
 def _get_audio_duration_sec(audio: np.ndarray, sample_rate: int) -> float: