From bdc63c04df17c417a01beb8afd9439d655e4fb49 Mon Sep 17 00:00:00 2001 From: dekun Date: Fri, 12 Jun 2026 18:37:53 +0800 Subject: [PATCH] Add voice history, default preset voice, and one-click tab Keep synthesized wav files browsable with playback and download, default to preset steady male voice, show one-click pipeline as the first tab, and reduce post-synthesis UI flicker. Co-authored-by: Cursor --- app.py | 312 ++++++++++++++++++++++++++++++----------------- tts_service.py | 6 - voice_history.py | 51 ++++++++ voice_presets.py | 19 ++- 4 files changed, 269 insertions(+), 119 deletions(-) create mode 100644 voice_history.py diff --git a/app.py b/app.py index 68cbb07..0cf1002 100644 --- a/app.py +++ b/app.py @@ -25,7 +25,8 @@ from config import ( ) from llm_service import check_ollama_health, polish_text from tts_service import generate_voice, save_fixed_speaker, speaker_is_ready -from voice_presets import label_to_voice_id, voice_choice_labels +from voice_history import list_voice_history +from voice_presets import default_voice_label, label_to_voice_id, voice_choice_labels from whisper_service import transcribe_audio # --------------------------------------------------------------------------- @@ -42,9 +43,35 @@ logging.basicConfig( logger = logging.getLogger("trading_studio") -def _default_voice_label() -> str: - labels = voice_choice_labels() - return labels[0] if labels else "我的锁定音色(声音克隆)" +# --------------------------------------------------------------------------- +# 配音历史 +# --------------------------------------------------------------------------- +def ui_history_dropdown(select_path: str | None = None) -> dict: + """刷新历史下拉列表;可选选中指定路径(合成完成后传入新文件)。""" + choices = list_voice_history() + paths = [p for _, p in choices] + if select_path and select_path in paths: + value = select_path + elif paths: + value = paths[0] + else: + value = None + return gr.update(choices=choices, value=value) + + +def ui_history_play(filepath: str | None) -> dict: + """选中历史条目后加载播放器。""" + if filepath and Path(filepath).is_file(): + return gr.update(value=filepath) + return gr.update(value=None) + + +def ui_initial_history() -> tuple[dict, dict]: + """首屏加载历史列表并自动选中最新一条。""" + choices = list_voice_history() + paths = [p for _, p in choices] + latest = paths[0] if paths else None + return gr.update(choices=choices, value=latest), ui_history_play(latest) # --------------------------------------------------------------------------- @@ -141,40 +168,42 @@ def _short_synth_log(msg: str, ok: bool) -> str: return "✅ 配音完成。请用下方播放器试听、下载。" -def ui_synth_pending(polished_text: str) -> tuple[str, dict]: - """点击合成后立即反馈,避免长时间无日志更新被误认为卡死。""" +def ui_synth_pending(polished_text: str) -> str: + """点击合成后立即更新日志;不触碰播放器,避免波形组件销毁重建导致闪屏。""" text = (polished_text or "").strip() if not text: - return "请先完成 Gemma4 润色。", gr.update(value=None) + return "请先完成 Gemma4 润色。" est_sec = max(20, len(text) // 10) return ( - f"⏳ 配音合成中(约 {len(text)} 字,预计 {est_sec}–{est_sec + 45} 秒),请勿重复点击…", - gr.update(value=None), + f"⏳ 配音合成中(约 {len(text)} 字,预计 {est_sec}–{est_sec + 45} 秒),请勿重复点击或刷新页面…" ) -def ui_synthesize( - polished_text: str, - voice_label: str, - progress: gr.Progress = gr.Progress(), -) -> tuple[str, dict]: +def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]: """【TTS 合成】生成最终 wav 配音文件。""" if not polished_text or not polished_text.strip(): - return "请先完成 Gemma4 润色。", gr.update(value=None) + return ( + "请先完成 Gemma4 润色。", + gr.update(value=None), + ui_history_dropdown(), + gr.update(value=None), + ) voice_id = label_to_voice_id(voice_label) - - def _report_segment(seg: int, total: int) -> None: - progress((seg - 1) / max(total, 1), desc=f"ChatTTS 第 {seg}/{total} 段…") - - ok, msg, wav_path = generate_voice( - polished_text, - voice_id=voice_id, - progress_callback=_report_segment, - ) + ok, msg, wav_path = generate_voice(polished_text, voice_id=voice_id) if ok: - return _short_synth_log(msg, ok), gr.update(value=wav_path) - return _short_synth_log(msg, ok), gr.update(value=None) + return ( + _short_synth_log(msg, ok), + gr.update(value=wav_path), + ui_history_dropdown(wav_path), + gr.update(value=wav_path), + ) + return ( + _short_synth_log(msg, ok), + gr.update(value=None), + ui_history_dropdown(), + gr.update(value=None), + ) # --------------------------------------------------------------------------- @@ -185,7 +214,7 @@ def ui_full_pipeline( skip_polish: bool, manual_raw: str, voice_label: str, -) -> tuple[str, str, str | None, str]: +) -> tuple[str, str, dict, str, dict, dict]: """ 串联执行:识别 → 润色(可跳过)→ 合成。 返回 (raw, polished, wav_path, log) @@ -199,10 +228,10 @@ def ui_full_pipeline( else: path = _save_upload(audio_file) if not path: - return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。" + return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None) ok, result = transcribe_audio(path) if not ok: - return "", "", gr.update(value=None), f"❌ 识别失败: {result}" + return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None) raw = result logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。") @@ -213,7 +242,7 @@ def ui_full_pipeline( else: ok, result = polish_text(raw) if not ok: - return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs) + return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None) polished = result logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。") @@ -221,10 +250,17 @@ def ui_full_pipeline( voice_id = label_to_voice_id(voice_label) ok, msg, wav_path = generate_voice(polished, voice_id=voice_id) if not ok: - return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs) + return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None) logs.append(f"✅ {msg}") - return raw, polished, gr.update(value=wav_path), "\n".join(logs) + return ( + raw, + polished, + gr.update(value=wav_path), + "\n".join(logs), + ui_history_dropdown(wav_path), + gr.update(value=wav_path), + ) # --------------------------------------------------------------------------- @@ -918,6 +954,25 @@ gradio-app, .gradio-container .waveform-container { background: #1a2332 !important; } +/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */ +.gradio-container .tts-output-audio, +.gradio-container .tts-output-audio .audio-container { + border: 1px solid #374151 !important; + background: #1a2332 !important; + contain: strict; + min-height: 120px; +} +.gradio-container .tts-output-audio button, +.gradio-container .tts-output-audio button:focus, +.gradio-container .tts-output-audio button:focus-visible { + outline: none !important; + box-shadow: none !important; + border-color: #4b5563 !important; +} +.gradio-container .tts-output-audio .wrap, +.gradio-container .tts-output-audio .controls { + background: #1a2332 !important; +} .gradio-container .pipeline-step-card textarea { contain: layout style; } @@ -1061,38 +1116,48 @@ def build_app() -> gr.Blocks: ) with gr.Tabs(): - # ---- Tab 1: 音色锁定 ---- - with gr.Tab("🎙️ 音色锁定"): + # ---- Tab 1: 一键生产(默认首页)---- + with gr.Tab("🚀 一键生产"): gr.HTML(MIC_HINT_HTML) - gr.HTML( - f'
' - f'上传 10-30 秒 干净人声样本,系统将提取 Speaker Embedding ' - f'并保存至 {SPEAKER_EMB_PATH.name},' - f'后续合成 100% 还原音色。' - f"
" + gr.Markdown( + "上传碎碎念录音,系统自动完成 **识别 → 润色 → 合成** 全流程。" ) with gr.Row(): - spk_audio = gr.Audio( - label="参考人声(碎碎念盲录样本)", + pipe_audio = gr.Audio( + label="复盘录音", type="filepath", sources=["upload", "microphone"], ) - spk_transcript = gr.Textbox( - label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错)", - placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…", - info="请尽量与参考音频内容完全一致,可提升音色还原度", - lines=6, - elem_classes=["bright-input"], + pipe_manual = gr.Textbox( + label="或手动输入转写(跳过识别)", + lines=4, + placeholder="若已有转写文本,可直接粘贴,留空则走 Whisper 识别", ) - lock_btn = gr.Button("🔒 锁定音色", variant="primary") - lock_log = gr.Textbox(label="锁定结果", lines=4, interactive=False) - lock_btn.click( - ui_lock_speaker, - [spk_audio, spk_transcript], - [lock_log, speaker_status], + skip_polish_cb = gr.Checkbox( + label="跳过 Gemma4 润色(仅测试 TTS)", + value=False, + ) + pipe_voice = gr.Radio( + label="配音音色(本地 ChatTTS)", + choices=voice_choice_labels(), + value=default_voice_label(), + elem_classes=["voice-radio"], + ) + pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg") + pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False) + with gr.Row(elem_classes=["pipeline-output-row"]): + pipe_raw = gr.Textbox(label="转写原文", lines=6) + pipe_polished = gr.Textbox(label="润色稿", lines=6) + pipe_output = gr.Audio( + label="成品配音", + type="filepath", + interactive=False, + show_download_button=True, + show_share_button=False, + elem_classes=["tts-output-audio"], ) - # ---- Tab 2: 分步操作(纵向三步,避免三栏挤在一起)---- + # ---- Tab 2: 分步流水线 ---- with gr.Tab("🔧 分步流水线"): gr.HTML(MIC_HINT_HTML) with gr.Column(elem_classes=["pipeline-flow"]): @@ -1126,7 +1191,7 @@ def build_app() -> gr.Blocks: tts_voice = gr.Radio( label="配音音色(本地 ChatTTS)", choices=voice_choice_labels(), - value=_default_voice_label(), + value=default_voice_label(), info="预设音色:bash scripts/generate_voice_presets.sh", elem_classes=["voice-radio"], ) @@ -1137,74 +1202,101 @@ def build_app() -> gr.Blocks: ) synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary") synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False) - output_audio = gr.Audio(label="成品配音", type="filepath") + output_audio = gr.Audio( + label="成品配音", + type="filepath", + interactive=False, + show_download_button=True, + show_share_button=False, + elem_classes=["tts-output-audio"], + ) transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log]) polish_btn.click(ui_polish, raw_text, [polished_text, polish_log]) - synth_btn.click( - ui_synth_pending, - [polished_text], - [synth_log, output_audio], - queue=True, - ).then( - ui_synthesize, - [polished_text, tts_voice], - [synth_log, output_audio], - queue=True, - ) - # ---- Tab 3: 一键生产 ---- - with gr.Tab("🚀 一键生产"): + # ---- Tab 3: 音色锁定 ---- + with gr.Tab("🎙️ 音色锁定"): gr.HTML(MIC_HINT_HTML) - gr.Markdown( - "上传碎碎念录音,系统自动完成 **识别 → 润色 → 合成** 全流程。" + gr.HTML( + f'
' + f'上传 10-30 秒 干净人声样本,系统将提取 Speaker Embedding ' + f'并保存至 {SPEAKER_EMB_PATH.name},' + f'后续合成 100% 还原音色。' + f"
" ) with gr.Row(): - pipe_audio = gr.Audio( - label="复盘录音", + spk_audio = gr.Audio( + label="参考人声(碎碎念盲录样本)", type="filepath", sources=["upload", "microphone"], ) - pipe_manual = gr.Textbox( - label="或手动输入转写(跳过识别)", - lines=4, - placeholder="若已有转写文本,可直接粘贴,留空则走 Whisper 识别", + spk_transcript = gr.Textbox( + label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错)", + placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…", + info="请尽量与参考音频内容完全一致,可提升音色还原度", + lines=6, + elem_classes=["bright-input"], ) - skip_polish_cb = gr.Checkbox( - label="跳过 Gemma4 润色(仅测试 TTS)", - value=False, + lock_btn = gr.Button("🔒 锁定音色", variant="primary") + lock_log = gr.Textbox(label="锁定结果", lines=4, interactive=False) + lock_btn.click( + ui_lock_speaker, + [spk_audio, spk_transcript], + [lock_log, speaker_status], ) - pipe_voice = gr.Radio( - label="配音音色(本地 ChatTTS)", - choices=voice_choice_labels(), - value=_default_voice_label(), - elem_classes=["voice-radio"], - ) - pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg") - pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False) - with gr.Row(elem_classes=["pipeline-output-row"]): - pipe_raw = gr.Textbox(label="转写原文", lines=6) - pipe_polished = gr.Textbox(label="润色稿", lines=6) - pipe_output = gr.Audio(label="成品配音", type="filepath") - def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> tuple[str, dict]: - if manual_raw and manual_raw.strip(): - return "⏳ 全流程运行中(识别/润色/合成),请稍候…", gr.update(value=None) - if skip_polish: - return "⏳ 全流程运行中(识别→合成),请稍候…", gr.update(value=None) - return "⏳ 全流程运行中(识别→润色→合成),请稍候…", gr.update(value=None) - - pipeline_btn.click( - ui_pipeline_pending, - [skip_polish_cb, pipe_manual], - [pipeline_log, pipe_output], - queue=True, - ).then( - ui_full_pipeline, - [pipe_audio, skip_polish_cb, pipe_manual, pipe_voice], - [pipe_raw, pipe_polished, pipe_output, pipeline_log], - queue=True, + with gr.Accordion("📂 配音历史(本地保留,可随时试听下载)", open=True): + with gr.Row(): + history_select = gr.Dropdown( + label="历史配音", + choices=list_voice_history(), + value=None, + interactive=True, + scale=4, ) + history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100) + history_player = gr.Audio( + label="历史试听 / 下载", + type="filepath", + interactive=False, + show_download_button=True, + show_share_button=False, + elem_classes=["tts-output-audio"], + ) + + history_refresh_btn.click(ui_history_dropdown, outputs=[history_select]) + history_select.change(ui_history_play, history_select, history_player) + demo.load(ui_initial_history, outputs=[history_select, history_player]) + + def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> str: + if manual_raw and manual_raw.strip(): + return "⏳ 全流程运行中(识别/润色/合成),请稍候,勿刷新页面…" + if skip_polish: + return "⏳ 全流程运行中(识别→合成),请稍候,勿刷新页面…" + return "⏳ 全流程运行中(识别→润色→合成),请稍候,勿刷新页面…" + + pipeline_btn.click( + ui_pipeline_pending, + [skip_polish_cb, pipe_manual], + [pipeline_log], + queue=True, + ).then( + ui_full_pipeline, + [pipe_audio, skip_polish_cb, pipe_manual, pipe_voice], + [pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player], + queue=True, + ) + synth_btn.click( + ui_synth_pending, + [polished_text], + [synth_log], + queue=True, + ).then( + ui_synthesize, + [polished_text, tts_voice], + [synth_log, output_audio, history_select, history_player], + queue=True, + ) demo.queue(default_concurrency_limit=1) return demo diff --git a/tts_service.py b/tts_service.py index 586906c..8a1d81c 100644 --- a/tts_service.py +++ b/tts_service.py @@ -635,7 +635,6 @@ def _concat_wavs( def generate_voice( refined_text: str, voice_id: str = "custom", - progress_callback=None, ) -> Tuple[bool, str, Optional[str]]: """ 使用 ChatTTS(本地 GPU)将润色稿合成为 wav。 @@ -725,11 +724,6 @@ def generate_voice( for idx, chunk in enumerate(chunks, start=1): if not chunk or len(chunk) < 2: continue - if progress_callback is not None: - try: - progress_callback(idx, len(chunks)) - except Exception: - logger.debug("TTS 进度回调失败", exc_info=True) release_cuda_cache() chunk_infer = replace(params_infer_code, manual_seed=42 + idx) wavs = None diff --git a/voice_history.py b/voice_history.py new file mode 100644 index 0000000..080f734 --- /dev/null +++ b/voice_history.py @@ -0,0 +1,51 @@ +""" +本地配音历史:扫描 outputs/ 下已生成的 wav,供 Gradio 下拉试听与下载。 +文件不会被自动删除,重启服务后仍可访问。 +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from pathlib import Path +from typing import List, Tuple + +from config import OUTPUT_DIR + +logger = logging.getLogger(__name__) + +HISTORY_MAX_ITEMS = 50 +VOICEOVER_GLOB = "voiceover_*.wav" + + +def list_voice_history(limit: int = HISTORY_MAX_ITEMS) -> List[Tuple[str, str]]: + """ + 返回 Gradio Dropdown 选项:(显示名, 文件绝对路径),按时间倒序。 + """ + if not OUTPUT_DIR.is_dir(): + return [] + + files = sorted( + OUTPUT_DIR.glob(VOICEOVER_GLOB), + key=lambda p: p.stat().st_mtime, + reverse=True, + )[:limit] + + choices: List[Tuple[str, str]] = [] + for path in files: + try: + st = path.stat() + except OSError: + logger.debug("跳过不可读历史文件: %s", path) + continue + ts = datetime.fromtimestamp(st.st_mtime).strftime("%Y-%m-%d %H:%M") + size_mb = st.st_size / (1024 * 1024) + label = f"{ts} · {path.name} ({size_mb:.1f} MB)" + choices.append((label, str(path.resolve()))) + return choices + + +def latest_voice_path() -> str | None: + """最新一条配音路径,无历史时返回 None。""" + items = list_voice_history(limit=1) + return items[0][1] if items else None diff --git a/voice_presets.py b/voice_presets.py index d1067f8..33da74e 100644 --- a/voice_presets.py +++ b/voice_presets.py @@ -22,6 +22,8 @@ PRESETS_DIR = VOICES_DIR / "presets" MANIFEST_PATH = VOICES_DIR / "manifest.json" CUSTOM_VOICE_ID = "custom" +DEFAULT_PRESET_VOICE_ID = "preset_01" +DEFAULT_PRESET_VOICE_LABEL = "预设·沉稳男声" # 生成脚本写入的预设元数据(.pt 文件不入 Git) DEFAULT_MANIFEST = { @@ -85,13 +87,24 @@ def list_voice_choices() -> List[Tuple[str, str]]: def default_voice_id() -> str: choices = list_voice_choices() if not choices: - return CUSTOM_VOICE_ID + return DEFAULT_PRESET_VOICE_ID for _label, vid in choices: - if vid == CUSTOM_VOICE_ID: - return CUSTOM_VOICE_ID + if vid == DEFAULT_PRESET_VOICE_ID: + return vid + for _label, vid in choices: + if vid != CUSTOM_VOICE_ID: + return vid return choices[0][1] +def default_voice_label() -> str: + for lbl, vid in list_voice_choices(): + if vid == DEFAULT_PRESET_VOICE_ID: + return lbl + labels = voice_choice_labels() + return labels[0] if labels else DEFAULT_PRESET_VOICE_LABEL + + def voice_choice_labels() -> List[str]: return [c[0] for c in list_voice_choices()]