Add local GPU preset voices with dropdown selection.

Generate ChatTTS sample_random_speaker presets without cloud APIs; choose clone or preset in synthesize UI.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
dekun
2026-06-12 17:28:17 +08:00
parent 8be34a2fd5
commit eb71e28427
7 changed files with 304 additions and 15 deletions
+33 -9
View File
@@ -24,6 +24,7 @@ from config import (
)
from llm_service import check_ollama_health, polish_text
from tts_service import generate_voice, save_fixed_speaker, speaker_is_ready
from voice_presets import label_to_voice_id, voice_choice_labels
from whisper_service import transcribe_audio
# ---------------------------------------------------------------------------
@@ -39,6 +40,12 @@ logging.basicConfig(
)
logger = logging.getLogger("trading_studio")
def _default_voice_label() -> str:
    """Return the label that the voice dropdowns preselect.

    Returns:
        The first entry of ``voice_choice_labels()``, or the cloned-voice
        label when that call yields nothing.
    """
    available = voice_choice_labels()
    if not available:
        # No labels available: fall back to the locked/cloned voice entry.
        return "我的锁定音色(声音克隆)"
    return available[0]
# ---------------------------------------------------------------------------
# 全局 UI 状态(Gradio State)
# ---------------------------------------------------------------------------
@@ -117,12 +124,13 @@ def ui_check_ollama() -> str:
# ---------------------------------------------------------------------------
# 模块 4:ChatTTS 音频合成
# ---------------------------------------------------------------------------
def ui_synthesize(polished_text: str) -> tuple[str | None, str]:
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str | None, str]:
"""【TTS 合成】生成最终 wav 配音文件。"""
if not polished_text or not polished_text.strip():
return None, "请先完成 Gemma4 润色。"
ok, msg, wav_path = generate_voice(polished_text)
voice_id = label_to_voice_id(voice_label)
ok, msg, wav_path = generate_voice(polished_text, voice_id=voice_id)
if ok and wav_path:
return wav_path, f"{msg}"
return None, f"{msg}"
@@ -135,6 +143,7 @@ def ui_full_pipeline(
audio_file,
skip_polish: bool,
manual_raw: str,
voice_label: str,
) -> tuple[str, str, str | None, str]:
"""
串联执行:识别 → 润色(可跳过)→ 合成。
@@ -168,7 +177,8 @@ def ui_full_pipeline(
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
# Step 3: 合成
ok, msg, wav_path = generate_voice(polished)
voice_id = label_to_voice_id(voice_label)
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
if not ok:
return raw, polished, None, f"❌ 合成失败: {msg}\n" + "\n".join(logs)
@@ -939,11 +949,16 @@ def build_app() -> gr.Blocks:
polish_log = gr.Textbox(label="润色日志", lines=2, interactive=False)
with gr.Column(scale=1):
gr.Markdown("### Step 3 · ChatTTS 配音合成")
gr.Markdown("### Step 3 · 本地 GPU 配音合成")
gr.Markdown(
"> 合成前会自动去掉 **Markdown**`#`、`**`)、emoji、"
"舞台提示(如前奏/转场)和文末「修改笔记」"
"也可手动删成纯口语文本再点合成。"
"> 全部在 **本机显卡** 运行,无需微软/讯飞 API。"
"可选「我的锁定音色」或预设男/女声;合成前会自动清洗 Markdown"
)
tts_voice = gr.Dropdown(
label="配音音色(本地 ChatTTS",
choices=voice_choice_labels(),
value=_default_voice_label(),
info="预设音色需先在服务器执行 bash scripts/generate_voice_presets.sh",
)
polished_text = gr.Textbox(
label="润色配音稿(可编辑,支持含 Markdown,合成时自动清洗)",
@@ -956,7 +971,11 @@ def build_app() -> gr.Blocks:
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
synth_btn.click(ui_synthesize, polished_text, [output_audio, synth_log])
synth_btn.click(
ui_synthesize,
[polished_text, tts_voice],
[output_audio, synth_log],
)
# ---- Tab 3: 一键生产 ----
with gr.Tab("🚀 一键生产"):
@@ -979,6 +998,11 @@ def build_app() -> gr.Blocks:
label="跳过 Gemma4 润色(仅测试 TTS)",
value=False,
)
pipe_voice = gr.Dropdown(
label="配音音色(本地 ChatTTS",
choices=voice_choice_labels(),
value=_default_voice_label(),
)
pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg")
pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False)
with gr.Row(elem_classes=["pipeline-output-row"]):
@@ -988,7 +1012,7 @@ def build_app() -> gr.Blocks:
pipeline_btn.click(
ui_full_pipeline,
[pipe_audio, skip_polish_cb, pipe_manual],
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
[pipe_raw, pipe_polished, pipe_output, pipeline_log],
)