Add voice history, default preset voice, and one-click tab

Keep synthesized wav files browsable with playback and download, default to the preset steady male voice, show the one-click pipeline as the first tab, and reduce post-synthesis UI flicker.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
dekun
2026-06-12 18:37:53 +08:00
parent 7c50b13c57
commit bdc63c04df
4 changed files with 269 additions and 119 deletions
+194 -102
View File
@@ -25,7 +25,8 @@ from config import (
)
from llm_service import check_ollama_health, polish_text
from tts_service import generate_voice, save_fixed_speaker, speaker_is_ready
from voice_presets import label_to_voice_id, voice_choice_labels
from voice_history import list_voice_history
from voice_presets import default_voice_label, label_to_voice_id, voice_choice_labels
from whisper_service import transcribe_audio
# ---------------------------------------------------------------------------
@@ -42,9 +43,35 @@ logging.basicConfig(
logger = logging.getLogger("trading_studio")
def _default_voice_label() -> str:
labels = voice_choice_labels()
return labels[0] if labels else "我的锁定音色(声音克隆)"
# ---------------------------------------------------------------------------
# 配音历史
# ---------------------------------------------------------------------------
def ui_history_dropdown(select_path: str | None = None) -> dict:
    """Refresh the history dropdown, optionally selecting a specific file.

    Args:
        select_path: Path to select after the refresh (callers pass the
            freshly synthesized file). It is only honoured when it appears
            among the paths returned by ``list_voice_history()``.

    Returns:
        A ``gr.update`` payload carrying the refreshed choices and the
        selected value: ``select_path`` when it is listed, otherwise the
        first listed path, or ``None`` when there is no history at all.
    """
    entries = list_voice_history()
    known_paths = [path for _label, path in entries]

    if not known_paths:
        selected = None
    elif select_path and select_path in known_paths:
        selected = select_path
    else:
        # Fall back to the first entry of the history list.
        selected = known_paths[0]

    return gr.update(choices=entries, value=selected)
def ui_history_play(filepath: str | None) -> dict:
    """Load the selected history entry into the audio player.

    Args:
        filepath: Path chosen in the history dropdown; may be ``None`` or empty.

    Returns:
        A ``gr.update`` payload whose value is ``filepath`` when it points to
        an existing regular file, and ``None`` otherwise.
    """
    playable = filepath if (filepath and Path(filepath).is_file()) else None
    return gr.update(value=playable)
def ui_initial_history() -> tuple[dict, dict]:
    """Populate the history widgets on first page load.

    Returns:
        A pair of ``gr.update`` payloads: the dropdown update (all history
        choices, with the first listed entry selected) and the player update
        produced by ``ui_history_play`` for that same entry. Both values are
        ``None`` when the history is empty.
    """
    entries = list_voice_history()
    newest = entries[0][1] if entries else None
    dropdown_update = gr.update(choices=entries, value=newest)
    player_update = ui_history_play(newest)
    return dropdown_update, player_update
# ---------------------------------------------------------------------------
@@ -141,40 +168,42 @@ def _short_synth_log(msg: str, ok: bool) -> str:
return "✅ 配音完成。请用下方播放器试听、下载。"
def ui_synth_pending(polished_text: str) -> tuple[str, dict]:
"""点击合成后立即反馈,避免长时间无日志更新被误认为卡死"""
def ui_synth_pending(polished_text: str) -> str:
"""点击合成后立即更新日志;不触碰播放器,避免波形组件销毁重建导致闪屏"""
text = (polished_text or "").strip()
if not text:
return "请先完成 Gemma4 润色。", gr.update(value=None)
return "请先完成 Gemma4 润色。"
est_sec = max(20, len(text) // 10)
return (
f"⏳ 配音合成中(约 {len(text)} 字,预计 {est_sec}{est_sec + 45} 秒),请勿重复点击…",
f"⏳ 配音合成中(约 {len(text)} 字,预计 {est_sec}{est_sec + 45} 秒),请勿重复点击或刷新页面"
)
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]:
"""【TTS 合成】生成最终 wav 配音文件。"""
if not polished_text or not polished_text.strip():
return (
"请先完成 Gemma4 润色。",
gr.update(value=None),
ui_history_dropdown(),
gr.update(value=None),
)
def ui_synthesize(
polished_text: str,
voice_label: str,
progress: gr.Progress = gr.Progress(),
) -> tuple[str, dict]:
"""【TTS 合成】生成最终 wav 配音文件。"""
if not polished_text or not polished_text.strip():
return "请先完成 Gemma4 润色。", gr.update(value=None)
voice_id = label_to_voice_id(voice_label)
def _report_segment(seg: int, total: int) -> None:
progress((seg - 1) / max(total, 1), desc=f"ChatTTS 第 {seg}/{total} 段…")
ok, msg, wav_path = generate_voice(
polished_text,
voice_id=voice_id,
progress_callback=_report_segment,
)
ok, msg, wav_path = generate_voice(polished_text, voice_id=voice_id)
if ok:
return _short_synth_log(msg, ok), gr.update(value=wav_path)
return _short_synth_log(msg, ok), gr.update(value=None)
return (
_short_synth_log(msg, ok),
gr.update(value=wav_path),
ui_history_dropdown(wav_path),
gr.update(value=wav_path),
)
return (
_short_synth_log(msg, ok),
gr.update(value=None),
ui_history_dropdown(),
gr.update(value=None),
)
# ---------------------------------------------------------------------------
@@ -185,7 +214,7 @@ def ui_full_pipeline(
skip_polish: bool,
manual_raw: str,
voice_label: str,
) -> tuple[str, str, str | None, str]:
) -> tuple[str, str, dict, str, dict, dict]:
"""
串联执行:识别 → 润色(可跳过)→ 合成。
返回 (raw, polished, wav_path, log)
@@ -199,10 +228,10 @@ def ui_full_pipeline(
else:
path = _save_upload(audio_file)
if not path:
return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。"
return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None)
ok, result = transcribe_audio(path)
if not ok:
return "", "", gr.update(value=None), f"❌ 识别失败: {result}"
return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None)
raw = result
logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
@@ -213,7 +242,7 @@ def ui_full_pipeline(
else:
ok, result = polish_text(raw)
if not ok:
return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs)
return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
polished = result
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
@@ -221,10 +250,17 @@ def ui_full_pipeline(
voice_id = label_to_voice_id(voice_label)
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
if not ok:
return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs)
return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
logs.append(f"{msg}")
return raw, polished, gr.update(value=wav_path), "\n".join(logs)
return (
raw,
polished,
gr.update(value=wav_path),
"\n".join(logs),
ui_history_dropdown(wav_path),
gr.update(value=wav_path),
)
# ---------------------------------------------------------------------------
@@ -918,6 +954,25 @@ gradio-app,
.gradio-container .waveform-container {
background: #1a2332 !important;
}
/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */
.gradio-container .tts-output-audio,
.gradio-container .tts-output-audio .audio-container {
border: 1px solid #374151 !important;
background: #1a2332 !important;
contain: strict;
min-height: 120px;
}
.gradio-container .tts-output-audio button,
.gradio-container .tts-output-audio button:focus,
.gradio-container .tts-output-audio button:focus-visible {
outline: none !important;
box-shadow: none !important;
border-color: #4b5563 !important;
}
.gradio-container .tts-output-audio .wrap,
.gradio-container .tts-output-audio .controls {
background: #1a2332 !important;
}
.gradio-container .pipeline-step-card textarea {
contain: layout style;
}
@@ -1061,38 +1116,48 @@ def build_app() -> gr.Blocks:
)
with gr.Tabs():
# ---- Tab 1: 音色锁定 ----
with gr.Tab("🎙️ 音色锁定"):
# ---- Tab 1: 一键生产(默认首页)----
with gr.Tab("🚀 一键生产"):
gr.HTML(MIC_HINT_HTML)
gr.HTML(
f'<div class="hint-box">'
f'上传 <strong>10-30 秒</strong> 干净人声样本,系统将提取 Speaker Embedding '
f'并保存至 <span class="file-tag">{SPEAKER_EMB_PATH.name}</span>'
f'后续合成 <strong>100% 还原音色</strong>。'
f"</div>"
gr.Markdown(
"上传碎碎念录音,系统自动完成 **识别 → 润色 → 合成** 全流程。"
)
with gr.Row():
spk_audio = gr.Audio(
label="参考人声(碎碎念盲录样本)",
pipe_audio = gr.Audio(
label="复盘录音",
type="filepath",
sources=["upload", "microphone"],
)
spk_transcript = gr.Textbox(
label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错",
placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…",
info="请尽量与参考音频内容完全一致,可提升音色还原度",
lines=6,
elem_classes=["bright-input"],
pipe_manual = gr.Textbox(
label="或手动输入转写(跳过识别",
lines=4,
placeholder="若已有转写文本,可直接粘贴,留空则走 Whisper 识别",
)
lock_btn = gr.Button("🔒 锁定音色", variant="primary")
lock_log = gr.Textbox(label="锁定结果", lines=4, interactive=False)
lock_btn.click(
ui_lock_speaker,
[spk_audio, spk_transcript],
[lock_log, speaker_status],
skip_polish_cb = gr.Checkbox(
label="跳过 Gemma4 润色(仅测试 TTS)",
value=False,
)
pipe_voice = gr.Radio(
label="配音音色(本地 ChatTTS",
choices=voice_choice_labels(),
value=default_voice_label(),
elem_classes=["voice-radio"],
)
pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg")
pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False)
with gr.Row(elem_classes=["pipeline-output-row"]):
pipe_raw = gr.Textbox(label="转写原文", lines=6)
pipe_polished = gr.Textbox(label="润色稿", lines=6)
pipe_output = gr.Audio(
label="成品配音",
type="filepath",
interactive=False,
show_download_button=True,
show_share_button=False,
elem_classes=["tts-output-audio"],
)
# ---- Tab 2: 分步操作(纵向三步,避免三栏挤在一起)----
# ---- Tab 2: 分步流水线 ----
with gr.Tab("🔧 分步流水线"):
gr.HTML(MIC_HINT_HTML)
with gr.Column(elem_classes=["pipeline-flow"]):
@@ -1126,7 +1191,7 @@ def build_app() -> gr.Blocks:
tts_voice = gr.Radio(
label="配音音色(本地 ChatTTS",
choices=voice_choice_labels(),
value=_default_voice_label(),
value=default_voice_label(),
info="预设音色:bash scripts/generate_voice_presets.sh",
elem_classes=["voice-radio"],
)
@@ -1137,72 +1202,99 @@ def build_app() -> gr.Blocks:
)
synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
output_audio = gr.Audio(label="成品配音", type="filepath")
output_audio = gr.Audio(
label="成品配音",
type="filepath",
interactive=False,
show_download_button=True,
show_share_button=False,
elem_classes=["tts-output-audio"],
)
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
synth_btn.click(
ui_synth_pending,
[polished_text],
[synth_log, output_audio],
queue=True,
).then(
ui_synthesize,
[polished_text, tts_voice],
[synth_log, output_audio],
queue=True,
)
# ---- Tab 3: 一键生产 ----
with gr.Tab("🚀 一键生产"):
# ---- Tab 3: 音色锁定 ----
with gr.Tab("🎙️ 音色锁定"):
gr.HTML(MIC_HINT_HTML)
gr.Markdown(
"上传碎碎念录音,系统自动完成 **识别 → 润色 → 合成** 全流程。"
gr.HTML(
f'<div class="hint-box">'
f'上传 <strong>10-30 秒</strong> 干净人声样本,系统将提取 Speaker Embedding '
f'并保存至 <span class="file-tag">{SPEAKER_EMB_PATH.name}</span>'
f'后续合成 <strong>100% 还原音色</strong>。'
f"</div>"
)
with gr.Row():
pipe_audio = gr.Audio(
label="复盘录音",
spk_audio = gr.Audio(
label="参考人声(碎碎念盲录样本)",
type="filepath",
sources=["upload", "microphone"],
)
pipe_manual = gr.Textbox(
label="或手动输入转写(跳过识别",
lines=4,
placeholder="若已有转写文本,可直接粘贴,留空则走 Whisper 识别",
spk_transcript = gr.Textbox(
label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错",
placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…",
info="请尽量与参考音频内容完全一致,可提升音色还原度",
lines=6,
elem_classes=["bright-input"],
)
skip_polish_cb = gr.Checkbox(
label="跳过 Gemma4 润色(仅测试 TTS)",
value=False,
lock_btn = gr.Button("🔒 锁定音色", variant="primary")
lock_log = gr.Textbox(label="锁定结果", lines=4, interactive=False)
lock_btn.click(
ui_lock_speaker,
[spk_audio, spk_transcript],
[lock_log, speaker_status],
)
pipe_voice = gr.Radio(
label="配音音色(本地 ChatTTS",
choices=voice_choice_labels(),
value=_default_voice_label(),
elem_classes=["voice-radio"],
)
pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg")
pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False)
with gr.Row(elem_classes=["pipeline-output-row"]):
pipe_raw = gr.Textbox(label="转写原文", lines=6)
pipe_polished = gr.Textbox(label="润色稿", lines=6)
pipe_output = gr.Audio(label="成品配音", type="filepath")
def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> tuple[str, dict]:
with gr.Accordion("📂 配音历史(本地保留,可随时试听下载)", open=True):
with gr.Row():
history_select = gr.Dropdown(
label="历史配音",
choices=list_voice_history(),
value=None,
interactive=True,
scale=4,
)
history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
history_player = gr.Audio(
label="历史试听 / 下载",
type="filepath",
interactive=False,
show_download_button=True,
show_share_button=False,
elem_classes=["tts-output-audio"],
)
history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
history_select.change(ui_history_play, history_select, history_player)
demo.load(ui_initial_history, outputs=[history_select, history_player])
def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> str:
if manual_raw and manual_raw.strip():
return "⏳ 全流程运行中(识别/润色/合成),请稍候…", gr.update(value=None)
return "⏳ 全流程运行中(识别/润色/合成),请稍候,勿刷新页面"
if skip_polish:
return "⏳ 全流程运行中(识别→合成),请稍候…", gr.update(value=None)
return "⏳ 全流程运行中(识别→润色→合成),请稍候…", gr.update(value=None)
return "⏳ 全流程运行中(识别→合成),请稍候,勿刷新页面"
return "⏳ 全流程运行中(识别→润色→合成),请稍候,勿刷新页面"
pipeline_btn.click(
ui_pipeline_pending,
[skip_polish_cb, pipe_manual],
[pipeline_log, pipe_output],
[pipeline_log],
queue=True,
).then(
ui_full_pipeline,
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
[pipe_raw, pipe_polished, pipe_output, pipeline_log],
[pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player],
queue=True,
)
synth_btn.click(
ui_synth_pending,
[polished_text],
[synth_log],
queue=True,
).then(
ui_synthesize,
[polished_text, tts_voice],
[synth_log, output_audio, history_select, history_player],
queue=True,
)
-6
View File
@@ -635,7 +635,6 @@ def _concat_wavs(
def generate_voice(
refined_text: str,
voice_id: str = "custom",
progress_callback=None,
) -> Tuple[bool, str, Optional[str]]:
"""
使用 ChatTTS(本地 GPU)将润色稿合成为 wav。
@@ -725,11 +724,6 @@ def generate_voice(
for idx, chunk in enumerate(chunks, start=1):
if not chunk or len(chunk) < 2:
continue
if progress_callback is not None:
try:
progress_callback(idx, len(chunks))
except Exception:
logger.debug("TTS 进度回调失败", exc_info=True)
release_cuda_cache()
chunk_infer = replace(params_infer_code, manual_seed=42 + idx)
wavs = None
+51
View File
@@ -0,0 +1,51 @@
"""
本地配音历史:扫描 outputs/ 下已生成的 wav,供 Gradio 下拉试听与下载。
文件不会被自动删除,重启服务后仍可访问。
"""
from __future__ import annotations
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Tuple
from config import OUTPUT_DIR
logger = logging.getLogger(__name__)
HISTORY_MAX_ITEMS = 50
VOICEOVER_GLOB = "voiceover_*.wav"


def list_voice_history(limit: int = HISTORY_MAX_ITEMS) -> List[Tuple[str, str]]:
    """Return Gradio Dropdown choices for previously generated voice-overs.

    Scans ``OUTPUT_DIR`` for files matching ``VOICEOVER_GLOB`` and returns
    ``(display label, absolute file path)`` pairs ordered by modification
    time, newest first.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        A list of ``(label, path)`` tuples. Empty when ``OUTPUT_DIR`` is not
        a directory or holds no readable matching file.
    """
    if not OUTPUT_DIR.is_dir():
        return []

    # Stat every candidate exactly once, inside the guard. Doing the stat in
    # the sort key instead would let a file that disappears mid-scan (or a
    # dangling symlink) raise OSError and take the whole listing down.
    readable = []
    for path in OUTPUT_DIR.glob(VOICEOVER_GLOB):
        try:
            st = path.stat()
        except OSError:
            logger.debug("跳过不可读历史文件: %s", path)
            continue
        readable.append((st.st_mtime, st.st_size, path))

    # Newest first; the sort is stable, so ties keep their scan order.
    readable.sort(key=lambda entry: entry[0], reverse=True)

    choices: List[Tuple[str, str]] = []
    for mtime, size_bytes, path in readable[:limit]:
        ts = datetime.fromtimestamp(mtime).strftime("%Y-%m-%d %H:%M")
        size_mb = size_bytes / (1024 * 1024)
        label = f"{ts} · {path.name} ({size_mb:.1f} MB)"
        choices.append((label, str(path.resolve())))
    return choices
def latest_voice_path() -> str | None:
    """Return the path of the most recent voice-over, or ``None`` if there is no history."""
    newest = list_voice_history(limit=1)
    if not newest:
        return None
    # Each entry is a (label, path) pair; only the path is needed here.
    return newest[0][1]
+16 -3
View File
@@ -22,6 +22,8 @@ PRESETS_DIR = VOICES_DIR / "presets"
MANIFEST_PATH = VOICES_DIR / "manifest.json"
CUSTOM_VOICE_ID = "custom"
DEFAULT_PRESET_VOICE_ID = "preset_01"
DEFAULT_PRESET_VOICE_LABEL = "预设·沉稳男声"
# 生成脚本写入的预设元数据(.pt 文件不入 Git)
DEFAULT_MANIFEST = {
@@ -85,13 +87,24 @@ def list_voice_choices() -> List[Tuple[str, str]]:
def default_voice_id() -> str:
choices = list_voice_choices()
if not choices:
return CUSTOM_VOICE_ID
return DEFAULT_PRESET_VOICE_ID
for _label, vid in choices:
if vid == CUSTOM_VOICE_ID:
return CUSTOM_VOICE_ID
if vid == DEFAULT_PRESET_VOICE_ID:
return vid
for _label, vid in choices:
if vid != CUSTOM_VOICE_ID:
return vid
return choices[0][1]
def default_voice_label() -> str:
    """Return the label to preselect in the voice picker.

    Preference order: the label of the choice whose id equals
    ``DEFAULT_PRESET_VOICE_ID``; otherwise the first label from
    ``voice_choice_labels()``; otherwise ``DEFAULT_PRESET_VOICE_LABEL`` when
    there are no choices at all.
    """
    preset_labels = [
        label
        for label, voice_id in list_voice_choices()
        if voice_id == DEFAULT_PRESET_VOICE_ID
    ]
    if preset_labels:
        return preset_labels[0]

    available = voice_choice_labels()
    if available:
        return available[0]
    return DEFAULT_PRESET_VOICE_LABEL
def voice_choice_labels() -> List[str]:
    """Return only the display labels of the entries from ``list_voice_choices()``."""
    labels: List[str] = []
    for choice in list_voice_choices():
        labels.append(choice[0])
    return labels