From eb71e2842794f87f55b930e8087c541e4d12fa00 Mon Sep 17 00:00:00 2001 From: dekun Date: Fri, 12 Jun 2026 17:28:17 +0800 Subject: [PATCH] Add local GPU preset voices with dropdown selection. Generate ChatTTS sample_random_speaker presets without cloud APIs; choose clone or preset in synthesize UI. Co-authored-by: Cursor --- README.md | 18 +++- app.py | 42 +++++++-- scripts/generate_voice_presets.py | 80 +++++++++++++++++ scripts/generate_voice_presets.sh | 16 ++++ tts_service.py | 14 ++- voice_presets.py | 139 ++++++++++++++++++++++++++++++ voices/manifest.json | 10 +++ 7 files changed, 304 insertions(+), 15 deletions(-) create mode 100644 scripts/generate_voice_presets.py create mode 100644 scripts/generate_voice_presets.sh create mode 100644 voice_presets.py create mode 100644 voices/manifest.json diff --git a/README.md b/README.md index 5cfbc4e..7455550 100644 --- a/README.md +++ b/README.md @@ -106,11 +106,25 @@ python app.py ## 使用流程 -### 首次使用:锁定音色 +### 配音音色(全部本地 GPU,无需 API) + +| 方式 | 说明 | +|------|------| +| **我的锁定音色** | 「音色锁定」上传你的人声 → 声音克隆(`speaker_emb.pt`) | +| **预设男/女声** | ChatTTS 内置说话人,合成页下拉选择(类似微软音色列表) | + +首次使用预设音色(服务器执行一次): + +```bash +bash scripts/generate_voice_presets.sh +pm2 restart trading_studio +``` + +### 首次使用:锁定音色(可选,用于克隆自己的声音) 1. 进入 **「音色锁定」** 标签页 2. 上传 10-30 秒干净人声参考(你的碎碎念盲录样本) -3. (可选)填写参考音频的精确转写,提升 zero-shot 还原度 +3. 填写参考音频的精确转写(强烈建议) 4. 
点击 **锁定音色** → 生成 `speaker_emb.pt` ### 日常生产 diff --git a/app.py b/app.py index f7aad87..0c761bb 100644 --- a/app.py +++ b/app.py @@ -24,6 +24,7 @@ from config import ( ) from llm_service import check_ollama_health, polish_text from tts_service import generate_voice, save_fixed_speaker, speaker_is_ready +from voice_presets import label_to_voice_id, voice_choice_labels from whisper_service import transcribe_audio # --------------------------------------------------------------------------- @@ -39,6 +40,12 @@ logging.basicConfig( ) logger = logging.getLogger("trading_studio") + +def _default_voice_label() -> str: + labels = voice_choice_labels() + return labels[0] if labels else "我的锁定音色(声音克隆)" + + # --------------------------------------------------------------------------- # 全局 UI 状态(Gradio State) # --------------------------------------------------------------------------- @@ -117,12 +124,13 @@ def ui_check_ollama() -> str: # --------------------------------------------------------------------------- # 模块 4:ChatTTS 音频合成 # --------------------------------------------------------------------------- -def ui_synthesize(polished_text: str) -> tuple[str | None, str]: +def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str | None, str]: """【TTS 合成】生成最终 wav 配音文件。""" if not polished_text or not polished_text.strip(): return None, "请先完成 Gemma4 润色。" - ok, msg, wav_path = generate_voice(polished_text) + voice_id = label_to_voice_id(voice_label) + ok, msg, wav_path = generate_voice(polished_text, voice_id=voice_id) if ok and wav_path: return wav_path, f"✅ {msg}" return None, f"❌ {msg}" @@ -135,6 +143,7 @@ def ui_full_pipeline( audio_file, skip_polish: bool, manual_raw: str, + voice_label: str, ) -> tuple[str, str, str | None, str]: """ 串联执行:识别 → 润色(可跳过)→ 合成。 @@ -168,7 +177,8 @@ def ui_full_pipeline( logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。") # Step 3: 合成 - ok, msg, wav_path = generate_voice(polished) + voice_id = label_to_voice_id(voice_label) + ok, msg, 
wav_path = generate_voice(polished, voice_id=voice_id) if not ok: return raw, polished, None, f"❌ 合成失败: {msg}\n" + "\n".join(logs) @@ -939,11 +949,16 @@ def build_app() -> gr.Blocks: polish_log = gr.Textbox(label="润色日志", lines=2, interactive=False) with gr.Column(scale=1): - gr.Markdown("### Step 3 · ChatTTS 配音合成") + gr.Markdown("### Step 3 · 本地 GPU 配音合成") gr.Markdown( - "> 合成前会自动去掉 **Markdown**(`#`、`**`)、emoji、" - "舞台提示(如前奏/转场)和文末「修改笔记」。" - "也可手动删成纯口语文本再点合成。" + "> 全部在 **本机显卡** 运行,无需微软/讯飞 API。" + "可选「我的锁定音色」或预设男/女声;合成前会自动清洗 Markdown。" + ) + tts_voice = gr.Dropdown( + label="配音音色(本地 ChatTTS)", + choices=voice_choice_labels(), + value=_default_voice_label(), + info="预设音色需先在服务器执行 bash scripts/generate_voice_presets.sh", ) polished_text = gr.Textbox( label="润色配音稿(可编辑,支持含 Markdown,合成时自动清洗)", @@ -956,7 +971,11 @@ def build_app() -> gr.Blocks: transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log]) polish_btn.click(ui_polish, raw_text, [polished_text, polish_log]) - synth_btn.click(ui_synthesize, polished_text, [output_audio, synth_log]) + synth_btn.click( + ui_synthesize, + [polished_text, tts_voice], + [output_audio, synth_log], + ) # ---- Tab 3: 一键生产 ---- with gr.Tab("🚀 一键生产"): @@ -979,6 +998,11 @@ def build_app() -> gr.Blocks: label="跳过 Gemma4 润色(仅测试 TTS)", value=False, ) + pipe_voice = gr.Dropdown( + label="配音音色(本地 ChatTTS)", + choices=voice_choice_labels(), + value=_default_voice_label(), + ) pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg") pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False) with gr.Row(elem_classes=["pipeline-output-row"]): @@ -988,7 +1012,7 @@ def build_app() -> gr.Blocks: pipeline_btn.click( ui_full_pipeline, - [pipe_audio, skip_polish_cb, pipe_manual], + [pipe_audio, skip_polish_cb, pipe_manual, pipe_voice], [pipe_raw, pipe_polished, pipe_output, pipeline_log], ) diff --git a/scripts/generate_voice_presets.py b/scripts/generate_voice_presets.py new file mode 100644 index 0000000..42ae8e1 --- 
/dev/null
+++ b/scripts/generate_voice_presets.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Generate the local ChatTTS preset speakers (sample_random_speaker, on GPU).
+
+For every preset in voices/manifest.json this script samples a ChatTTS speaker,
+saves it as a .pt payload, and then stamps the manifest with the generation
+time and the model directory.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent.parent
+sys.path.insert(0, str(ROOT))
+
+import torch
+
+from config import CHATTTS_MODEL_DIR
+from tts_service import get_chattts_instance, reset_chattts_instance
+from voice_presets import (
+    DEFAULT_MANIFEST,
+    MANIFEST_PATH,
+    PRESETS_DIR,
+    VOICES_DIR,
+    ensure_manifest,
+)
+
+
+def main() -> None:
+    """Sample one speaker per manifest preset and write the payload files.
+
+    Raises:
+        SystemExit: if ChatTTS cannot be loaded or does not provide
+            ``sample_random_speaker``.
+    """
+    ensure_manifest()
+    PRESETS_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Best effort: reset Whisper before loading ChatTTS (presumably to free
+    # GPU memory -- confirm). A missing module is fine; any other failure is
+    # reported instead of being silently swallowed.
+    try:
+        from whisper_service import reset_whisper_model
+
+        reset_whisper_model()
+    except ImportError:
+        pass
+    except Exception as exc:
+        print(f"[WARN] 重置 Whisper 模型失败(已忽略): {exc}")
+
+    reset_chattts_instance()
+    chat, err = get_chattts_instance()
+    if chat is None:
+        raise SystemExit(f"ChatTTS 加载失败: {err}")
+
+    if not hasattr(chat, "sample_random_speaker"):
+        raise SystemExit("当前 ChatTTS 版本不支持 sample_random_speaker")
+
+    # Iterate the manifest on disk (the file the app reads) so edited entries
+    # are generated too; fall back to the defaults when it lists no presets.
+    manifest = json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
+    presets = manifest.get("presets") or DEFAULT_MANIFEST["presets"]
+    print(f"[INFO] 生成 {len(presets)} 个预设音色 → {PRESETS_DIR}")
+
+    for item in presets:
+        pid = item["id"]
+        label = item.get("label", pid)
+        # Save where the manifest's "file" entry points, since the app looks
+        # the payload up as VOICES_DIR / file; default to presets/<id>.pt.
+        rel = item.get("file")
+        out_path = VOICES_DIR / rel if rel else PRESETS_DIR / f"{pid}.pt"
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+
+        spk_emb = chat.sample_random_speaker()
+        payload = {
+            "version": 1,
+            "preset": True,
+            "id": pid,
+            "label": label,
+            "spk_emb": spk_emb,
+            "spk_smp": None,
+            "txt_smp": "",
+            "created_at": datetime.now().isoformat(),
+            "source": "ChatTTS.sample_random_speaker",
+        }
+        torch.save(payload, out_path)
+        print(f" [OK] {label} → {out_path.name}")
+
+    manifest["generated_at"] = datetime.now().isoformat()
+    manifest["chattts_model"] = str(CHATTTS_MODEL_DIR)
+    MANIFEST_PATH.write_text(
+        json.dumps(manifest, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    print("[OK] 全部预设音色生成完成")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/generate_voice_presets.sh
b/scripts/generate_voice_presets.sh new file mode 100644 index 0000000..ee4100c --- /dev/null +++ b/scripts/generate_voice_presets.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +# 生成本地 GPU 预设音色(ChatTTS 内置说话人,无需 API) +# 用法: bash scripts/generate_voice_presets.sh +set -euo pipefail +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +VENV_PY="${ROOT}/venv/bin/python" + +if [[ ! -x "${VENV_PY}" ]]; then + echo "[ERROR] 未找到 venv,请先 bash deploy.sh deps" + exit 1 +fi + +echo "[INFO] 正在生成 ChatTTS 预设音色(本地 GPU)..." +"${VENV_PY}" "${ROOT}/scripts/generate_voice_presets.py" +echo "[OK] 预设音色已写入 ${ROOT}/voices/presets/" +echo "[OK] 在 Web UI「配音合成」处可从下拉框选择" diff --git a/tts_service.py b/tts_service.py index ed19c90..8a1d81c 100644 --- a/tts_service.py +++ b/tts_service.py @@ -632,12 +632,16 @@ def _concat_wavs( return np.concatenate(segments) -def generate_voice(refined_text: str) -> Tuple[bool, str, Optional[str]]: +def generate_voice( + refined_text: str, + voice_id: str = "custom", +) -> Tuple[bool, str, Optional[str]]: """ - 使用 ChatTTS 将润色后的文稿合成为 wav 配音。 + 使用 ChatTTS(本地 GPU)将润色稿合成为 wav。 Args: refined_text: LLM 润色后的配音稿 + voice_id: ``custom`` 为锁定音色,``preset_*`` 为内置预设(见 voice_presets) Returns: (success, message, output_wav_path_or_none) @@ -662,9 +666,11 @@ def generate_voice(refined_text: str) -> Tuple[bool, str, Optional[str]]: if chat is None: return False, init_err or "ChatTTS 不可用。", None - payload, spk_err = _load_speaker_payload() + from voice_presets import load_voice_payload + + payload, spk_err = load_voice_payload(voice_id) if payload is None: - return False, spk_err or "请先锁定音色。", None + return False, spk_err or "请先选择或生成可用音色。", None try: import ChatTTS diff --git a/voice_presets.py b/voice_presets.py new file mode 100644 index 0000000..d1067f8 --- /dev/null +++ b/voice_presets.py @@ -0,0 +1,139 @@ +""" +本地 GPU 音色库(ChatTTS,无需云端 API) +- custom:用户在「音色锁定」克隆的 speaker_emb.pt +- preset_*:ChatTTS sample_random_speaker 生成的内置说话人(scripts/generate_voice_presets.sh) +""" + +from __future__ import 
annotations + +import json +import logging +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import torch + +from config import BASE_DIR, SPEAKER_EMB_PATH + +logger = logging.getLogger(__name__) + +VOICES_DIR = Path(BASE_DIR) / "voices" +PRESETS_DIR = VOICES_DIR / "presets" +MANIFEST_PATH = VOICES_DIR / "manifest.json" + +CUSTOM_VOICE_ID = "custom" + +# 生成脚本写入的预设元数据(.pt 文件不入 Git) +DEFAULT_MANIFEST = { + "presets": [ + {"id": "preset_01", "label": "预设·沉稳男声", "file": "presets/preset_01.pt"}, + {"id": "preset_02", "label": "预设·青年男声", "file": "presets/preset_02.pt"}, + {"id": "preset_03", "label": "预设·温柔女声", "file": "presets/preset_03.pt"}, + {"id": "preset_04", "label": "预设·活泼女声", "file": "presets/preset_04.pt"}, + {"id": "preset_05", "label": "预设·中性旁白", "file": "presets/preset_05.pt"}, + {"id": "preset_06", "label": "预设·纪录片风", "file": "presets/preset_06.pt"}, + ] +} + + +def ensure_manifest() -> None: + VOICES_DIR.mkdir(parents=True, exist_ok=True) + PRESETS_DIR.mkdir(parents=True, exist_ok=True) + if not MANIFEST_PATH.is_file(): + MANIFEST_PATH.write_text( + json.dumps(DEFAULT_MANIFEST, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + +def _read_manifest() -> Dict[str, Any]: + ensure_manifest() + try: + return json.loads(MANIFEST_PATH.read_text(encoding="utf-8")) + except Exception as exc: + logger.warning("读取 manifest 失败: %s", exc) + return DEFAULT_MANIFEST + + +def list_voice_choices() -> List[Tuple[str, str]]: + """ + 返回 Gradio Dropdown 选项:(显示名, voice_id)。 + 仅列出磁盘上已存在的音色。 + """ + choices: List[Tuple[str, str]] = [] + + if SPEAKER_EMB_PATH.is_file(): + choices.append(("我的锁定音色(声音克隆)", CUSTOM_VOICE_ID)) + + for preset in _read_manifest().get("presets", []): + pid = preset.get("id", "") + label = preset.get("label", pid) + rel = preset.get("file", "") + if pid and rel and (VOICES_DIR / rel).is_file(): + choices.append((label, pid)) + + if not choices: + choices.append( + ( + "(请先在「音色锁定」上传人声,或运行 generate_voice_presets.sh)", + 
CUSTOM_VOICE_ID, + ) + ) + return choices + + +def default_voice_id() -> str: + choices = list_voice_choices() + if not choices: + return CUSTOM_VOICE_ID + for _label, vid in choices: + if vid == CUSTOM_VOICE_ID: + return CUSTOM_VOICE_ID + return choices[0][1] + + +def voice_choice_labels() -> List[str]: + return [c[0] for c in list_voice_choices()] + + +def label_to_voice_id(label: str) -> str: + for lbl, vid in list_voice_choices(): + if lbl == label: + return vid + return default_voice_id() + + +def load_voice_payload(voice_id: str) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + """按 voice_id 加载 ChatTTS 说话人数据。""" + if voice_id == CUSTOM_VOICE_ID or not voice_id: + if not SPEAKER_EMB_PATH.is_file(): + return None, ( + "未找到锁定音色。请在「音色锁定」上传参考人声," + "或选择下方「预设」音色(需先运行 scripts/generate_voice_presets.sh)。" + ) + return _load_payload_file(SPEAKER_EMB_PATH) + + for preset in _read_manifest().get("presets", []): + if preset.get("id") != voice_id: + continue + path = VOICES_DIR / preset.get("file", "") + if not path.is_file(): + return None, ( + f"预设音色「{preset.get('label', voice_id)}」尚未生成。\n" + f"请在服务器执行: bash scripts/generate_voice_presets.sh" + ) + return _load_payload_file(path) + + return None, f"未知音色 ID: {voice_id}" + + +def _load_payload_file(path: Path) -> Tuple[Optional[Dict[str, Any]], Optional[str]]: + try: + payload = torch.load(path, map_location="cpu", weights_only=False) + if isinstance(payload, torch.Tensor): + return {"spk_emb": payload, "spk_smp": None, "txt_smp": ""}, None + if isinstance(payload, dict): + return payload, None + return None, f"音色文件格式无效: {path.name}" + except Exception as exc: + return None, f"读取音色文件失败 ({path.name}): {exc}" diff --git a/voices/manifest.json b/voices/manifest.json new file mode 100644 index 0000000..204eb31 --- /dev/null +++ b/voices/manifest.json @@ -0,0 +1,10 @@ +{ + "presets": [ + {"id": "preset_01", "label": "预设·沉稳男声", "file": "presets/preset_01.pt"}, + {"id": "preset_02", "label": "预设·青年男声", "file": 
"presets/preset_02.pt"}, + {"id": "preset_03", "label": "预设·温柔女声", "file": "presets/preset_03.pt"}, + {"id": "preset_04", "label": "预设·活泼女声", "file": "presets/preset_04.pt"}, + {"id": "preset_05", "label": "预设·中性旁白", "file": "presets/preset_05.pt"}, + {"id": "preset_06", "label": "预设·纪录片风", "file": "presets/preset_06.pt"} + ] +}