Add local GPU preset voices with dropdown selection.

Generate ChatTTS `sample_random_speaker` presets locally, without cloud APIs; choose between the cloned voice and a preset voice in the synthesis UI.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-12 17:28:17 +08:00
parent 8be34a2fd5
commit eb71e28427
7 changed files with 304 additions and 15 deletions
@@ -24,6 +24,7 @@ from config import (
 )
 from llm_service import check_ollama_health, polish_text
 from tts_service import generate_voice, save_fixed_speaker, speaker_is_ready
+from voice_presets import label_to_voice_id, voice_choice_labels
 from whisper_service import transcribe_audio

 # ---------------------------------------------------------------------------
@@ -39,6 +40,12 @@ logging.basicConfig(
 )
 logger = logging.getLogger("trading_studio")

+
+def _default_voice_label() -> str:
+    labels = voice_choice_labels()
+    return labels[0] if labels else "我的锁定音色（声音克隆）"
+
+
 # ---------------------------------------------------------------------------
 # 全局 UI 状态（Gradio State）
 # ---------------------------------------------------------------------------
@@ -117,12 +124,13 @@ def ui_check_ollama() -> str:
 # ---------------------------------------------------------------------------
 # 模块 4：ChatTTS 音频合成
 # ---------------------------------------------------------------------------
-def ui_synthesize(polished_text: str) -> tuple[str | None, str]:
+def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str | None, str]:
    """【TTS 合成】生成最终 wav 配音文件。"""
    if not polished_text or not polished_text.strip():
        return None, "请先完成 Gemma4 润色。"

-    ok, msg, wav_path = generate_voice(polished_text)
+    voice_id = label_to_voice_id(voice_label)
+    ok, msg, wav_path = generate_voice(polished_text, voice_id=voice_id)
    if ok and wav_path:
        return wav_path, f"✅ {msg}"
    return None, f"❌ {msg}"
@@ -135,6 +143,7 @@ def ui_full_pipeline(
    audio_file,
    skip_polish: bool,
    manual_raw: str,
+    voice_label: str,
 ) -> tuple[str, str, str | None, str]:
    """
    串联执行：识别 → 润色（可跳过）→ 合成。
@@ -168,7 +177,8 @@ def ui_full_pipeline(
        logs.append(f"✅ Gemma4 润色完成（{len(polished)} 字）。")

    # Step 3: 合成
-    ok, msg, wav_path = generate_voice(polished)
+    voice_id = label_to_voice_id(voice_label)
+    ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
    if not ok:
        return raw, polished, None, f"❌ 合成失败: {msg}\n" + "\n".join(logs)

@@ -939,11 +949,16 @@ def build_app() -> gr.Blocks:
                        polish_log = gr.Textbox(label="润色日志", lines=2, interactive=False)

                    with gr.Column(scale=1):
-                        gr.Markdown("### Step 3 · ChatTTS 配音合成")
+                        gr.Markdown("### Step 3 · 本地 GPU 配音合成")
                        gr.Markdown(
-                            "> 合成前会自动去掉 **Markdown**（`#`、`**`）、emoji、"
-                            "舞台提示（如前奏/转场）和文末「修改笔记」。"
-                            "也可手动删成纯口语文本再点合成。"
+                            "> 全部在 **本机显卡** 运行，无需微软/讯飞 API。"
+                            "可选「我的锁定音色」或预设男/女声；合成前会自动清洗 Markdown。"
+                        )
+                        tts_voice = gr.Dropdown(
+                            label="配音音色（本地 ChatTTS）",
+                            choices=voice_choice_labels(),
+                            value=_default_voice_label(),
+                            info="预设音色需先在服务器执行 bash scripts/generate_voice_presets.sh",
                        )
                        polished_text = gr.Textbox(
                            label="润色配音稿（可编辑，支持含 Markdown，合成时自动清洗）",
@@ -956,7 +971,11 @@ def build_app() -> gr.Blocks:

                transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
                polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
-                synth_btn.click(ui_synthesize, polished_text, [output_audio, synth_log])
+                synth_btn.click(
+                    ui_synthesize,
+                    [polished_text, tts_voice],
+                    [output_audio, synth_log],
+                )

            # ---- Tab 3: 一键生产 ----
            with gr.Tab("🚀 一键生产"):
@@ -979,6 +998,11 @@ def build_app() -> gr.Blocks:
                    label="跳过 Gemma4 润色（仅测试 TTS）",
                    value=False,
                )
+                pipe_voice = gr.Dropdown(
+                    label="配音音色（本地 ChatTTS）",
+                    choices=voice_choice_labels(),
+                    value=_default_voice_label(),
+                )
                pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg")
                pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False)
                with gr.Row(elem_classes=["pipeline-output-row"]):
@@ -988,7 +1012,7 @@ def build_app() -> gr.Blocks:

                pipeline_btn.click(
                    ui_full_pipeline,
-                    [pipe_audio, skip_polish_cb, pipe_manual],
+                    [pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
                    [pipe_raw, pipe_polished, pipe_output, pipeline_log],
                )