Add voice history, default preset voice, and one-click tab

Keep synthesized wav files browsable with playback and download, default to preset steady male voice, show one-click pipeline as the first tab, and reduce post-synthesis UI flicker. Co-authored-by: Cursor <cursoragent@cursor.com>
2026-06-12 18:37:53 +08:00
parent 7c50b13c57
commit bdc63c04df
4 changed files with 269 additions and 119 deletions
@@ -25,7 +25,8 @@ from config import (
 )
 from llm_service import check_ollama_health, polish_text
 from tts_service import generate_voice, save_fixed_speaker, speaker_is_ready
-from voice_presets import label_to_voice_id, voice_choice_labels
+from voice_history import list_voice_history
 from voice_presets import default_voice_label, label_to_voice_id, voice_choice_labels
 from whisper_service import transcribe_audio
 # ---------------------------------------------------------------------------
@@ -42,9 +43,35 @@ logging.basicConfig(
 logger = logging.getLogger("trading_studio")
-def _default_voice_label() -> str:
+# ---------------------------------------------------------------------------
-    labels = voice_choice_labels()
+# 配音历史
-    return labels[0] if labels else "我的锁定音色（声音克隆）"
+# ---------------------------------------------------------------------------
 def ui_history_dropdown(select_path: str | None = None) -> dict:
    """Refresh the history dropdown and decide which entry is selected.

    Args:
        select_path: Optional file path to select (e.g. the file that was
            just synthesized). It is honoured only when it appears among the
            paths returned by ``list_voice_history()``.

    Returns:
        A ``gr.update`` payload carrying the refreshed ``choices`` and the
        selected ``value``: the requested path when it is listed, otherwise
        the first listed path, otherwise ``None`` when there is no history.
    """
    entries = list_voice_history()
    known_paths = [path for _, path in entries]
    if not known_paths:
        # Nothing on record: clear the selection.
        selected = None
    elif select_path and select_path in known_paths:
        selected = select_path
    else:
        # Fall back to the first listed entry.
        selected = known_paths[0]
    return gr.update(choices=entries, value=selected)
 def ui_history_play(filepath: str | None) -> dict:
    """Load the selected history entry into the audio player.

    Args:
        filepath: Path chosen in the history dropdown; may be ``None`` or empty.

    Returns:
        A ``gr.update`` payload whose ``value`` is ``filepath`` when it points
        to an existing regular file, and ``None`` otherwise.
    """
    # Short-circuit keeps Path() from ever being built from None / "".
    playable = bool(filepath) and Path(filepath).is_file()
    return gr.update(value=filepath if playable else None)
 def ui_initial_history() -> tuple[dict, dict]:
    """Populate the history widgets on first page load.

    Returns:
        A pair ``(dropdown_update, player_update)``: the dropdown receives the
        full history list with its first entry selected (``None`` when the
        list is empty), and the player is loaded via ``ui_history_play`` with
        that same path.
    """
    entries = list_voice_history()
    # Each entry is a (label, path) pair; the first one is pre-selected.
    newest = entries[0][1] if entries else None
    dropdown_update = gr.update(choices=entries, value=newest)
    player_update = ui_history_play(newest)
    return dropdown_update, player_update
 # ---------------------------------------------------------------------------
@@ -141,40 +168,42 @@ def _short_synth_log(msg: str, ok: bool) -> str:
    return "✅ 配音完成。请用下方播放器试听、下载。"
-def ui_synth_pending(polished_text: str) -> tuple[str, dict]:
+def ui_synth_pending(polished_text: str) -> str:
-    """点击合成后立即反馈，避免长时间无日志更新被误认为卡死。"""
+    """点击合成后立即更新日志；不触碰播放器，避免波形组件销毁重建导致闪屏。"""
    text = (polished_text or "").strip()
    if not text:
-        return "请先完成 Gemma4 润色。", gr.update(value=None)
+        return "请先完成 Gemma4 润色。"
    est_sec = max(20, len(text) // 10)
    return (
-        f"⏳ 配音合成中（约 {len(text)} 字，预计 {est_sec}–{est_sec + 45} 秒），请勿重复点击…",
+        f"⏳ 配音合成中（约 {len(text)} 字，预计 {est_sec}–{est_sec + 45} 秒），请勿重复点击或刷新页面…"
    )
 def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]:
    """【TTS 合成】生成最终 wav 配音文件。"""
    if not polished_text or not polished_text.strip():
        return (
            "请先完成 Gemma4 润色。",
            gr.update(value=None),
            ui_history_dropdown(),
            gr.update(value=None),
        )
 def ui_synthesize(
    polished_text: str,
    voice_label: str,
    progress: gr.Progress = gr.Progress(),
 ) -> tuple[str, dict]:
    """【TTS 合成】生成最终 wav 配音文件。"""
    if not polished_text or not polished_text.strip():
        return "请先完成 Gemma4 润色。", gr.update(value=None)
    voice_id = label_to_voice_id(voice_label)
-
+    ok, msg, wav_path = generate_voice(polished_text, voice_id=voice_id)
    def _report_segment(seg: int, total: int) -> None:
        progress((seg - 1) / max(total, 1), desc=f"ChatTTS 第 {seg}/{total} 段…")
    ok, msg, wav_path = generate_voice(
        polished_text,
        voice_id=voice_id,
        progress_callback=_report_segment,
    )
    if ok:
-        return _short_synth_log(msg, ok), gr.update(value=wav_path)
+        return (
-    return _short_synth_log(msg, ok), gr.update(value=None)
+            _short_synth_log(msg, ok),
            gr.update(value=wav_path),
            ui_history_dropdown(wav_path),
            gr.update(value=wav_path),
        )
    return (
        _short_synth_log(msg, ok),
        gr.update(value=None),
        ui_history_dropdown(),
        gr.update(value=None),
    )
 # ---------------------------------------------------------------------------
@@ -185,7 +214,7 @@ def ui_full_pipeline(
    skip_polish: bool,
    manual_raw: str,
    voice_label: str,
-) -> tuple[str, str, str | None, str]:
+) -> tuple[str, str, dict, str, dict, dict]:
    """
    串联执行：识别 → 润色（可跳过）→ 合成。
    返回 (raw, polished, 成品播放器更新, log, 历史下拉更新, 历史播放器更新)
@@ -199,10 +228,10 @@ def ui_full_pipeline(
    else:
        path = _save_upload(audio_file)
        if not path:
-            return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。"
+            return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None)
        ok, result = transcribe_audio(path)
        if not ok:
-            return "", "", gr.update(value=None), f"❌ 识别失败: {result}"
+            return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None)
        raw = result
        logs.append(f"✅ Whisper 识别完成（{len(raw)} 字）。")
@@ -213,7 +242,7 @@ def ui_full_pipeline(
    else:
        ok, result = polish_text(raw)
        if not ok:
-            return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs)
+            return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
        polished = result
        logs.append(f"✅ Gemma4 润色完成（{len(polished)} 字）。")
@@ -221,10 +250,17 @@ def ui_full_pipeline(
    voice_id = label_to_voice_id(voice_label)
    ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
    if not ok:
-        return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs)
+        return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
    logs.append(f"✅ {msg}")
-    return raw, polished, gr.update(value=wav_path), "\n".join(logs)
+    return (
        raw,
        polished,
        gr.update(value=wav_path),
        "\n".join(logs),
        ui_history_dropdown(wav_path),
        gr.update(value=wav_path),
    )
 # ---------------------------------------------------------------------------
@@ -918,6 +954,25 @@ gradio-app,
 .gradio-container .waveform-container {
    background: #1a2332 !important;
 }
 /* 成品播放器：去掉 Gradio 默认 focus 白框，减轻合成完成时闪一下 */
 .gradio-container .tts-output-audio,
 .gradio-container .tts-output-audio .audio-container {
    border: 1px solid #374151 !important;
    background: #1a2332 !important;
    contain: strict;
    min-height: 120px;
 }
 .gradio-container .tts-output-audio button,
 .gradio-container .tts-output-audio button:focus,
 .gradio-container .tts-output-audio button:focus-visible {
    outline: none !important;
    box-shadow: none !important;
    border-color: #4b5563 !important;
 }
 .gradio-container .tts-output-audio .wrap,
 .gradio-container .tts-output-audio .controls {
    background: #1a2332 !important;
 }
 .gradio-container .pipeline-step-card textarea {
    contain: layout style;
 }
@@ -1061,38 +1116,48 @@ def build_app() -> gr.Blocks:
        )
        with gr.Tabs():
-            # ---- Tab 1: 音色锁定 ----
+            # ---- Tab 1: 一键生产（默认首页）----
-            with gr.Tab("🎙️ 音色锁定"):
+            with gr.Tab("🚀 一键生产"):
                gr.HTML(MIC_HINT_HTML)
-                gr.HTML(
+                gr.Markdown(
-                    f'<div class="hint-box">'
+                    "上传碎碎念录音，系统自动完成 **识别 → 润色 → 合成** 全流程。"
                    f'上传 <strong>10-30 秒</strong> 干净人声样本，系统将提取 Speaker Embedding '
                    f'并保存至 <span class="file-tag">{SPEAKER_EMB_PATH.name}</span>，'
                    f'后续合成 <strong>100% 还原音色</strong>。'
                    f"</div>"
                )
                with gr.Row():
-                    spk_audio = gr.Audio(
+                    pipe_audio = gr.Audio(
-                        label="参考人声（碎碎念盲录样本）",
+                        label="复盘录音",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
-                    spk_transcript = gr.Textbox(
+                    pipe_manual = gr.Textbox(
-                        label="参考音频精确转写（强烈建议填写，与录音一致，避免合成报错）",
+                        label="或手动输入转写（跳过识别）",
-                        placeholder="示例：今天开了三单，第一单手贱提前平了，第二单…",
+                        lines=4,
-                        info="请尽量与参考音频内容完全一致，可提升音色还原度",
+                        placeholder="若已有转写文本，可直接粘贴，留空则走 Whisper 识别",
                        lines=6,
                        elem_classes=["bright-input"],
                    )
-                lock_btn = gr.Button("🔒 锁定音色", variant="primary")
+                skip_polish_cb = gr.Checkbox(
-                lock_log = gr.Textbox(label="锁定结果", lines=4, interactive=False)
+                    label="跳过 Gemma4 润色（仅测试 TTS）",
-                lock_btn.click(
+                    value=False,
-                    ui_lock_speaker,
+                )
-                    [spk_audio, spk_transcript],
+                pipe_voice = gr.Radio(
-                    [lock_log, speaker_status],
+                    label="配音音色（本地 ChatTTS）",
                    choices=voice_choice_labels(),
                    value=default_voice_label(),
                    elem_classes=["voice-radio"],
                )
                pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg")
                pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False)
                with gr.Row(elem_classes=["pipeline-output-row"]):
                    pipe_raw = gr.Textbox(label="转写原文", lines=6)
                    pipe_polished = gr.Textbox(label="润色稿", lines=6)
                pipe_output = gr.Audio(
                    label="成品配音",
                    type="filepath",
                    interactive=False,
                    show_download_button=True,
                    show_share_button=False,
                    elem_classes=["tts-output-audio"],
                )
-            # ---- Tab 2: 分步操作（纵向三步，避免三栏挤在一起）----
+            # ---- Tab 2: 分步流水线 ----
            with gr.Tab("🔧 分步流水线"):
                gr.HTML(MIC_HINT_HTML)
                with gr.Column(elem_classes=["pipeline-flow"]):
@@ -1126,7 +1191,7 @@ def build_app() -> gr.Blocks:
                            tts_voice = gr.Radio(
                                label="配音音色（本地 ChatTTS）",
                                choices=voice_choice_labels(),
-                                value=_default_voice_label(),
+                                value=default_voice_label(),
                                info="预设音色：bash scripts/generate_voice_presets.sh",
                                elem_classes=["voice-radio"],
                            )
@@ -1137,72 +1202,99 @@ def build_app() -> gr.Blocks:
                        )
                        synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
                        synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
-                        output_audio = gr.Audio(label="成品配音", type="filepath")
+                        output_audio = gr.Audio(
                            label="成品配音",
                            type="filepath",
                            interactive=False,
                            show_download_button=True,
                            show_share_button=False,
                            elem_classes=["tts-output-audio"],
                        )
                transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
                polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
                synth_btn.click(
                    ui_synth_pending,
                    [polished_text],
                    [synth_log, output_audio],
                    queue=True,
                ).then(
                    ui_synthesize,
                    [polished_text, tts_voice],
                    [synth_log, output_audio],
                    queue=True,
                )
-            # ---- Tab 3: 一键生产 ----
+            # ---- Tab 3: 音色锁定 ----
-            with gr.Tab("🚀 一键生产"):
+            with gr.Tab("🎙️ 音色锁定"):
                gr.HTML(MIC_HINT_HTML)
-                gr.Markdown(
+                gr.HTML(
-                    "上传碎碎念录音，系统自动完成 **识别 → 润色 → 合成** 全流程。"
+                    f'<div class="hint-box">'
                    f'上传 <strong>10-30 秒</strong> 干净人声样本，系统将提取 Speaker Embedding '
                    f'并保存至 <span class="file-tag">{SPEAKER_EMB_PATH.name}</span>，'
                    f'后续合成 <strong>100% 还原音色</strong>。'
                    f"</div>"
                )
                with gr.Row():
-                    pipe_audio = gr.Audio(
+                    spk_audio = gr.Audio(
-                        label="复盘录音",
+                        label="参考人声（碎碎念盲录样本）",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
-                    pipe_manual = gr.Textbox(
+                    spk_transcript = gr.Textbox(
-                        label="或手动输入转写（跳过识别）",
+                        label="参考音频精确转写（强烈建议填写，与录音一致，避免合成报错）",
-                        lines=4,
+                        placeholder="示例：今天开了三单，第一单手贱提前平了，第二单…",
-                        placeholder="若已有转写文本，可直接粘贴，留空则走 Whisper 识别",
+                        info="请尽量与参考音频内容完全一致，可提升音色还原度",
                        lines=6,
                        elem_classes=["bright-input"],
                    )
-                skip_polish_cb = gr.Checkbox(
+                lock_btn = gr.Button("🔒 锁定音色", variant="primary")
-                    label="跳过 Gemma4 润色（仅测试 TTS）",
+                lock_log = gr.Textbox(label="锁定结果", lines=4, interactive=False)
-                    value=False,
+                lock_btn.click(
                    ui_lock_speaker,
                    [spk_audio, spk_transcript],
                    [lock_log, speaker_status],
                )
                pipe_voice = gr.Radio(
                    label="配音音色（本地 ChatTTS）",
                    choices=voice_choice_labels(),
                    value=_default_voice_label(),
                    elem_classes=["voice-radio"],
                )
                pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg")
                pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False)
                with gr.Row(elem_classes=["pipeline-output-row"]):
                    pipe_raw = gr.Textbox(label="转写原文", lines=6)
                    pipe_polished = gr.Textbox(label="润色稿", lines=6)
                pipe_output = gr.Audio(label="成品配音", type="filepath")
-                def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> tuple[str, dict]:
+        with gr.Accordion("📂 配音历史（本地保留，可随时试听下载）", open=True):
            with gr.Row():
                history_select = gr.Dropdown(
                    label="历史配音",
                    choices=list_voice_history(),
                    value=None,
                    interactive=True,
                    scale=4,
                )
                history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
            history_player = gr.Audio(
                label="历史试听 / 下载",
                type="filepath",
                interactive=False,
                show_download_button=True,
                show_share_button=False,
                elem_classes=["tts-output-audio"],
            )
        history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
        history_select.change(ui_history_play, history_select, history_player)
        demo.load(ui_initial_history, outputs=[history_select, history_player])
        def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> str:
            if manual_raw and manual_raw.strip():
-                        return "⏳ 全流程运行中（识别/润色/合成），请稍候…", gr.update(value=None)
+                return "⏳ 全流程运行中（识别/润色/合成），请稍候，勿刷新页面…"
            if skip_polish:
-                        return "⏳ 全流程运行中（识别→合成），请稍候…", gr.update(value=None)
+                return "⏳ 全流程运行中（识别→合成），请稍候，勿刷新页面…"
-                    return "⏳ 全流程运行中（识别→润色→合成），请稍候…", gr.update(value=None)
+            return "⏳ 全流程运行中（识别→润色→合成），请稍候，勿刷新页面…"
        pipeline_btn.click(
            ui_pipeline_pending,
            [skip_polish_cb, pipe_manual],
-                    [pipeline_log, pipe_output],
+            [pipeline_log],
            queue=True,
        ).then(
            ui_full_pipeline,
            [pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
-                    [pipe_raw, pipe_polished, pipe_output, pipeline_log],
+            [pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player],
            queue=True,
        )
        synth_btn.click(
            ui_synth_pending,
            [polished_text],
            [synth_log],
            queue=True,
        ).then(
            ui_synthesize,
            [polished_text, tts_voice],
            [synth_log, output_audio, history_select, history_player],
            queue=True,
        )
@@ -635,7 +635,6 @@ def _concat_wavs(
 def generate_voice(
    refined_text: str,
    voice_id: str = "custom",
    progress_callback=None,
 ) -> Tuple[bool, str, Optional[str]]:
    """
    使用 ChatTTS（本地 GPU）将润色稿合成为 wav。
@@ -725,11 +724,6 @@ def generate_voice(
        for idx, chunk in enumerate(chunks, start=1):
            if not chunk or len(chunk) < 2:
                continue
            if progress_callback is not None:
                try:
                    progress_callback(idx, len(chunks))
                except Exception:
                    logger.debug("TTS 进度回调失败", exc_info=True)
            release_cuda_cache()
            chunk_infer = replace(params_infer_code, manual_seed=42 + idx)
            wavs = None
@@ -0,0 +1,51 @@
 """
 本地配音历史：扫描 outputs/ 下已生成的 wav，供 Gradio 下拉试听与下载。
 文件不会被自动删除，重启服务后仍可访问。
 """
 from __future__ import annotations
 import logging
 from datetime import datetime
 from pathlib import Path
 from typing import List, Tuple
 from config import OUTPUT_DIR
 logger = logging.getLogger(__name__)
 # Default cap on the number of history entries returned by list_voice_history().
 HISTORY_MAX_ITEMS = 50
 # Glob pattern (relative to OUTPUT_DIR) matching generated voice-over files.
 VOICEOVER_GLOB = "voiceover_*.wav"

 def list_voice_history(limit: int = HISTORY_MAX_ITEMS) -> List[Tuple[str, str]]:
    """
    Build Gradio Dropdown choices for the wav files found under OUTPUT_DIR.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        A list of ``(label, absolute_path)`` tuples ordered by modification
        time, newest first. The label shows the modification timestamp, the
        file name and the size in MB. An empty list is returned when
        OUTPUT_DIR is not a directory.

    Files that cannot be stat'ed (for example removed between globbing and
    inspection) are skipped with a debug log instead of aborting the listing.
    """
    if not OUTPUT_DIR.is_dir():
        return []
    # Stat each candidate exactly once, *before* sorting: a stat() inside the
    # sort key would raise for a vanished file and break the whole listing.
    stamped: List[Tuple[float, int, Path]] = []
    for path in OUTPUT_DIR.glob(VOICEOVER_GLOB):
        try:
            st = path.stat()
        except OSError:
            logger.debug("跳过不可读历史文件: %s", path)
            continue
        stamped.append((st.st_mtime, st.st_size, path))
    # Newest first; sort on mtime only so Path objects are never compared.
    stamped.sort(key=lambda item: item[0], reverse=True)
    choices: List[Tuple[str, str]] = []
    # Truncate after filtering so unreadable files do not use up the quota.
    for mtime, size, path in stamped[:limit]:
        ts = datetime.fromtimestamp(mtime).strftime("%Y-%m-%d %H:%M")
        size_mb = size / (1024 * 1024)
        label = f"{ts} · {path.name} ({size_mb:.1f} MB)"
        choices.append((label, str(path.resolve())))
    return choices
 def latest_voice_path() -> str | None:
    """Return the path of the first history entry, or ``None`` if there is none.

    Asks ``list_voice_history`` for a single entry and unpacks its path.
    """
    newest = list_voice_history(limit=1)
    if not newest:
        return None
    _label, path = newest[0]
    return path
@@ -22,6 +22,8 @@ PRESETS_DIR = VOICES_DIR / "presets"
 MANIFEST_PATH = VOICES_DIR / "manifest.json"
 CUSTOM_VOICE_ID = "custom"
 DEFAULT_PRESET_VOICE_ID = "preset_01"
 DEFAULT_PRESET_VOICE_LABEL = "预设·沉稳男声"
 # 生成脚本写入的预设元数据（.pt 文件不入 Git）
 DEFAULT_MANIFEST = {
@@ -85,13 +87,24 @@ def list_voice_choices() -> List[Tuple[str, str]]:
 def default_voice_id() -> str:
    choices = list_voice_choices()
    if not choices:
-        return CUSTOM_VOICE_ID
+        return DEFAULT_PRESET_VOICE_ID
    for _label, vid in choices:
-        if vid == CUSTOM_VOICE_ID:
+        if vid == DEFAULT_PRESET_VOICE_ID:
-            return CUSTOM_VOICE_ID
+            return vid
    for _label, vid in choices:
        if vid != CUSTOM_VOICE_ID:
            return vid
    return choices[0][1]
 def default_voice_label() -> str:
    """Return the label the voice selector should show by default.

    Preference order:
      1. the label whose voice id equals ``DEFAULT_PRESET_VOICE_ID``;
      2. the first label returned by ``voice_choice_labels()``;
      3. the constant ``DEFAULT_PRESET_VOICE_LABEL`` when no choices exist.
    """
    preferred = next(
        (
            label
            for label, voice_id in list_voice_choices()
            if voice_id == DEFAULT_PRESET_VOICE_ID
        ),
        None,
    )
    if preferred is not None:
        return preferred
    available = voice_choice_labels()
    if available:
        return available[0]
    return DEFAULT_PRESET_VOICE_LABEL
 def voice_choice_labels() -> List[str]:
    """Return only the display labels of ``list_voice_choices()``, in order."""
    labels: List[str] = []
    for label, *_ in list_voice_choices():
        labels.append(label)
    return labels