Add voice history, default preset voice, and one-click tab
Keep synthesized WAV files browsable with playback and download, default to the preset steady male voice, show the one-click pipeline as the first tab, and reduce post-synthesis UI flicker.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -25,7 +25,8 @@ from config import (
|
||||
)
|
||||
from llm_service import check_ollama_health, polish_text
|
||||
from tts_service import generate_voice, save_fixed_speaker, speaker_is_ready
|
||||
from voice_presets import label_to_voice_id, voice_choice_labels
|
||||
from voice_history import list_voice_history
|
||||
from voice_presets import default_voice_label, label_to_voice_id, voice_choice_labels
|
||||
from whisper_service import transcribe_audio
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -42,9 +43,35 @@ logging.basicConfig(
|
||||
logger = logging.getLogger("trading_studio")
|
||||
|
||||
|
||||
def _default_voice_label() -> str:
    """Return the label used as the initial voice selection.

    Returns:
        The first label reported by ``voice_choice_labels()``, or the
        hard-coded cloned-voice label when that list is empty.
    """
    available = voice_choice_labels()
    if not available:
        # No labels available: fall back to the fixed cloned-voice label.
        return "我的锁定音色(声音克隆)"
    return available[0]
|
||||
# ---------------------------------------------------------------------------
|
||||
# 配音历史
|
||||
# ---------------------------------------------------------------------------
|
||||
def ui_history_dropdown(select_path: str | None = None) -> dict:
    """Refresh the history dropdown, optionally selecting a given path.

    Args:
        select_path: Path to pre-select (callers pass the freshly
            synthesized file). Ignored when falsy or when it is not among
            the paths returned by ``list_voice_history()``.

    Returns:
        A ``gr.update`` payload with the refreshed choices and the selected
        value: ``select_path`` when it is listed, otherwise the first listed
        path, or ``None`` when the history is empty.
    """
    entries = list_voice_history()
    # Entries unpack as 2-tuples; the second element is used as the dropdown
    # value (presumably (label, path) pairs -- confirm in voice_history).
    known_paths = [path for _, path in entries]
    if select_path and select_path in known_paths:
        selected = select_path
    else:
        selected = known_paths[0] if known_paths else None
    return gr.update(choices=entries, value=selected)
|
||||
|
||||
|
||||
def ui_history_play(filepath: str | None) -> dict:
    """Load the selected history entry into the audio player.

    Args:
        filepath: Value of the selected history entry; may be ``None`` or
            empty when nothing is selected.

    Returns:
        A ``gr.update`` whose value is ``filepath`` when it names an existing
        file on disk, and ``None`` otherwise.
    """
    playable = bool(filepath) and Path(filepath).is_file()
    return gr.update(value=filepath if playable else None)
|
||||
|
||||
|
||||
def ui_initial_history() -> tuple[dict, dict]:
    """Populate the history dropdown and player on first page load.

    Selects the first entry returned by ``list_voice_history()`` (intended
    to be the most recent one -- ordering is decided by that helper).

    Returns:
        A pair ``(dropdown_update, player_update)``: the dropdown update
        carries the full choice list and the selected path, and the player
        update is whatever ``ui_history_play`` yields for that same path.
    """
    entries = list_voice_history()
    history_paths = [path for _, path in entries]
    if history_paths:
        newest = history_paths[0]
    else:
        newest = None
    dropdown_update = gr.update(choices=entries, value=newest)
    player_update = ui_history_play(newest)
    return dropdown_update, player_update
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -141,40 +168,42 @@ def _short_synth_log(msg: str, ok: bool) -> str:
|
||||
return "✅ 配音完成。请用下方播放器试听、下载。"
|
||||
|
||||
|
||||
def ui_synth_pending(polished_text: str) -> tuple[str, dict]:
|
||||
"""点击合成后立即反馈,避免长时间无日志更新被误认为卡死。"""
|
||||
def ui_synth_pending(polished_text: str) -> str:
|
||||
"""点击合成后立即更新日志;不触碰播放器,避免波形组件销毁重建导致闪屏。"""
|
||||
text = (polished_text or "").strip()
|
||||
if not text:
|
||||
return "请先完成 Gemma4 润色。", gr.update(value=None)
|
||||
return "请先完成 Gemma4 润色。"
|
||||
est_sec = max(20, len(text) // 10)
|
||||
return (
|
||||
f"⏳ 配音合成中(约 {len(text)} 字,预计 {est_sec}–{est_sec + 45} 秒),请勿重复点击…",
|
||||
gr.update(value=None),
|
||||
f"⏳ 配音合成中(约 {len(text)} 字,预计 {est_sec}–{est_sec + 45} 秒),请勿重复点击或刷新页面…"
|
||||
)
|
||||
|
||||
|
||||
def ui_synthesize(
|
||||
polished_text: str,
|
||||
voice_label: str,
|
||||
progress: gr.Progress = gr.Progress(),
|
||||
) -> tuple[str, dict]:
|
||||
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]:
|
||||
"""【TTS 合成】生成最终 wav 配音文件。"""
|
||||
if not polished_text or not polished_text.strip():
|
||||
return "请先完成 Gemma4 润色。", gr.update(value=None)
|
||||
return (
|
||||
"请先完成 Gemma4 润色。",
|
||||
gr.update(value=None),
|
||||
ui_history_dropdown(),
|
||||
gr.update(value=None),
|
||||
)
|
||||
|
||||
voice_id = label_to_voice_id(voice_label)
|
||||
|
||||
def _report_segment(seg: int, total: int) -> None:
|
||||
progress((seg - 1) / max(total, 1), desc=f"ChatTTS 第 {seg}/{total} 段…")
|
||||
|
||||
ok, msg, wav_path = generate_voice(
|
||||
polished_text,
|
||||
voice_id=voice_id,
|
||||
progress_callback=_report_segment,
|
||||
)
|
||||
ok, msg, wav_path = generate_voice(polished_text, voice_id=voice_id)
|
||||
if ok:
|
||||
return _short_synth_log(msg, ok), gr.update(value=wav_path)
|
||||
return _short_synth_log(msg, ok), gr.update(value=None)
|
||||
return (
|
||||
_short_synth_log(msg, ok),
|
||||
gr.update(value=wav_path),
|
||||
ui_history_dropdown(wav_path),
|
||||
gr.update(value=wav_path),
|
||||
)
|
||||
return (
|
||||
_short_synth_log(msg, ok),
|
||||
gr.update(value=None),
|
||||
ui_history_dropdown(),
|
||||
gr.update(value=None),
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -185,7 +214,7 @@ def ui_full_pipeline(
|
||||
skip_polish: bool,
|
||||
manual_raw: str,
|
||||
voice_label: str,
|
||||
) -> tuple[str, str, str | None, str]:
|
||||
) -> tuple[str, str, dict, str, dict, dict]:
|
||||
"""
|
||||
串联执行:识别 → 润色(可跳过)→ 合成。
|
||||
Returns (raw, polished, output_audio_update, log, history_dropdown_update, history_player_update)
|
||||
@@ -199,10 +228,10 @@ def ui_full_pipeline(
|
||||
else:
|
||||
path = _save_upload(audio_file)
|
||||
if not path:
|
||||
return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。"
|
||||
return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None)
|
||||
ok, result = transcribe_audio(path)
|
||||
if not ok:
|
||||
return "", "", gr.update(value=None), f"❌ 识别失败: {result}"
|
||||
return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None)
|
||||
raw = result
|
||||
logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
|
||||
|
||||
@@ -213,7 +242,7 @@ def ui_full_pipeline(
|
||||
else:
|
||||
ok, result = polish_text(raw)
|
||||
if not ok:
|
||||
return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs)
|
||||
return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
|
||||
polished = result
|
||||
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
|
||||
|
||||
@@ -221,10 +250,17 @@ def ui_full_pipeline(
|
||||
voice_id = label_to_voice_id(voice_label)
|
||||
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
|
||||
if not ok:
|
||||
return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs)
|
||||
return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
|
||||
|
||||
logs.append(f"✅ {msg}")
|
||||
return raw, polished, gr.update(value=wav_path), "\n".join(logs)
|
||||
return (
|
||||
raw,
|
||||
polished,
|
||||
gr.update(value=wav_path),
|
||||
"\n".join(logs),
|
||||
ui_history_dropdown(wav_path),
|
||||
gr.update(value=wav_path),
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -918,6 +954,25 @@ gradio-app,
|
||||
.gradio-container .waveform-container {
|
||||
background: #1a2332 !important;
|
||||
}
|
||||
/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */
|
||||
.gradio-container .tts-output-audio,
|
||||
.gradio-container .tts-output-audio .audio-container {
|
||||
border: 1px solid #374151 !important;
|
||||
background: #1a2332 !important;
|
||||
contain: strict;
|
||||
min-height: 120px;
|
||||
}
|
||||
.gradio-container .tts-output-audio button,
|
||||
.gradio-container .tts-output-audio button:focus,
|
||||
.gradio-container .tts-output-audio button:focus-visible {
|
||||
outline: none !important;
|
||||
box-shadow: none !important;
|
||||
border-color: #4b5563 !important;
|
||||
}
|
||||
.gradio-container .tts-output-audio .wrap,
|
||||
.gradio-container .tts-output-audio .controls {
|
||||
background: #1a2332 !important;
|
||||
}
|
||||
.gradio-container .pipeline-step-card textarea {
|
||||
contain: layout style;
|
||||
}
|
||||
@@ -1061,38 +1116,48 @@ def build_app() -> gr.Blocks:
|
||||
)
|
||||
|
||||
with gr.Tabs():
|
||||
# ---- Tab 1: 音色锁定 ----
|
||||
with gr.Tab("🎙️ 音色锁定"):
|
||||
# ---- Tab 1: 一键生产(默认首页)----
|
||||
with gr.Tab("🚀 一键生产"):
|
||||
gr.HTML(MIC_HINT_HTML)
|
||||
gr.HTML(
|
||||
f'<div class="hint-box">'
|
||||
f'上传 <strong>10-30 秒</strong> 干净人声样本,系统将提取 Speaker Embedding '
|
||||
f'并保存至 <span class="file-tag">{SPEAKER_EMB_PATH.name}</span>,'
|
||||
f'后续合成 <strong>100% 还原音色</strong>。'
|
||||
f"</div>"
|
||||
gr.Markdown(
|
||||
"上传碎碎念录音,系统自动完成 **识别 → 润色 → 合成** 全流程。"
|
||||
)
|
||||
with gr.Row():
|
||||
spk_audio = gr.Audio(
|
||||
label="参考人声(碎碎念盲录样本)",
|
||||
pipe_audio = gr.Audio(
|
||||
label="复盘录音",
|
||||
type="filepath",
|
||||
sources=["upload", "microphone"],
|
||||
)
|
||||
spk_transcript = gr.Textbox(
|
||||
label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错)",
|
||||
placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…",
|
||||
info="请尽量与参考音频内容完全一致,可提升音色还原度",
|
||||
lines=6,
|
||||
elem_classes=["bright-input"],
|
||||
pipe_manual = gr.Textbox(
|
||||
label="或手动输入转写(跳过识别)",
|
||||
lines=4,
|
||||
placeholder="若已有转写文本,可直接粘贴,留空则走 Whisper 识别",
|
||||
)
|
||||
lock_btn = gr.Button("🔒 锁定音色", variant="primary")
|
||||
lock_log = gr.Textbox(label="锁定结果", lines=4, interactive=False)
|
||||
lock_btn.click(
|
||||
ui_lock_speaker,
|
||||
[spk_audio, spk_transcript],
|
||||
[lock_log, speaker_status],
|
||||
skip_polish_cb = gr.Checkbox(
|
||||
label="跳过 Gemma4 润色(仅测试 TTS)",
|
||||
value=False,
|
||||
)
|
||||
pipe_voice = gr.Radio(
|
||||
label="配音音色(本地 ChatTTS)",
|
||||
choices=voice_choice_labels(),
|
||||
value=default_voice_label(),
|
||||
elem_classes=["voice-radio"],
|
||||
)
|
||||
pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg")
|
||||
pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False)
|
||||
with gr.Row(elem_classes=["pipeline-output-row"]):
|
||||
pipe_raw = gr.Textbox(label="转写原文", lines=6)
|
||||
pipe_polished = gr.Textbox(label="润色稿", lines=6)
|
||||
pipe_output = gr.Audio(
|
||||
label="成品配音",
|
||||
type="filepath",
|
||||
interactive=False,
|
||||
show_download_button=True,
|
||||
show_share_button=False,
|
||||
elem_classes=["tts-output-audio"],
|
||||
)
|
||||
|
||||
# ---- Tab 2: 分步操作(纵向三步,避免三栏挤在一起)----
|
||||
# ---- Tab 2: 分步流水线 ----
|
||||
with gr.Tab("🔧 分步流水线"):
|
||||
gr.HTML(MIC_HINT_HTML)
|
||||
with gr.Column(elem_classes=["pipeline-flow"]):
|
||||
@@ -1126,7 +1191,7 @@ def build_app() -> gr.Blocks:
|
||||
tts_voice = gr.Radio(
|
||||
label="配音音色(本地 ChatTTS)",
|
||||
choices=voice_choice_labels(),
|
||||
value=_default_voice_label(),
|
||||
value=default_voice_label(),
|
||||
info="预设音色:bash scripts/generate_voice_presets.sh",
|
||||
elem_classes=["voice-radio"],
|
||||
)
|
||||
@@ -1137,74 +1202,101 @@ def build_app() -> gr.Blocks:
|
||||
)
|
||||
synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
|
||||
synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
|
||||
output_audio = gr.Audio(label="成品配音", type="filepath")
|
||||
output_audio = gr.Audio(
|
||||
label="成品配音",
|
||||
type="filepath",
|
||||
interactive=False,
|
||||
show_download_button=True,
|
||||
show_share_button=False,
|
||||
elem_classes=["tts-output-audio"],
|
||||
)
|
||||
|
||||
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
|
||||
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
|
||||
synth_btn.click(
|
||||
ui_synth_pending,
|
||||
[polished_text],
|
||||
[synth_log, output_audio],
|
||||
queue=True,
|
||||
).then(
|
||||
ui_synthesize,
|
||||
[polished_text, tts_voice],
|
||||
[synth_log, output_audio],
|
||||
queue=True,
|
||||
)
|
||||
|
||||
# ---- Tab 3: 一键生产 ----
|
||||
with gr.Tab("🚀 一键生产"):
|
||||
# ---- Tab 3: 音色锁定 ----
|
||||
with gr.Tab("🎙️ 音色锁定"):
|
||||
gr.HTML(MIC_HINT_HTML)
|
||||
gr.Markdown(
|
||||
"上传碎碎念录音,系统自动完成 **识别 → 润色 → 合成** 全流程。"
|
||||
gr.HTML(
|
||||
f'<div class="hint-box">'
|
||||
f'上传 <strong>10-30 秒</strong> 干净人声样本,系统将提取 Speaker Embedding '
|
||||
f'并保存至 <span class="file-tag">{SPEAKER_EMB_PATH.name}</span>,'
|
||||
f'后续合成 <strong>100% 还原音色</strong>。'
|
||||
f"</div>"
|
||||
)
|
||||
with gr.Row():
|
||||
pipe_audio = gr.Audio(
|
||||
label="复盘录音",
|
||||
spk_audio = gr.Audio(
|
||||
label="参考人声(碎碎念盲录样本)",
|
||||
type="filepath",
|
||||
sources=["upload", "microphone"],
|
||||
)
|
||||
pipe_manual = gr.Textbox(
|
||||
label="或手动输入转写(跳过识别)",
|
||||
lines=4,
|
||||
placeholder="若已有转写文本,可直接粘贴,留空则走 Whisper 识别",
|
||||
spk_transcript = gr.Textbox(
|
||||
label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错)",
|
||||
placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…",
|
||||
info="请尽量与参考音频内容完全一致,可提升音色还原度",
|
||||
lines=6,
|
||||
elem_classes=["bright-input"],
|
||||
)
|
||||
skip_polish_cb = gr.Checkbox(
|
||||
label="跳过 Gemma4 润色(仅测试 TTS)",
|
||||
value=False,
|
||||
lock_btn = gr.Button("🔒 锁定音色", variant="primary")
|
||||
lock_log = gr.Textbox(label="锁定结果", lines=4, interactive=False)
|
||||
lock_btn.click(
|
||||
ui_lock_speaker,
|
||||
[spk_audio, spk_transcript],
|
||||
[lock_log, speaker_status],
|
||||
)
|
||||
pipe_voice = gr.Radio(
|
||||
label="配音音色(本地 ChatTTS)",
|
||||
choices=voice_choice_labels(),
|
||||
value=_default_voice_label(),
|
||||
elem_classes=["voice-radio"],
|
||||
)
|
||||
pipeline_btn = gr.Button("▶ 启动全流程", variant="primary", size="lg")
|
||||
pipeline_log = gr.Textbox(label="流水线日志", lines=6, interactive=False)
|
||||
with gr.Row(elem_classes=["pipeline-output-row"]):
|
||||
pipe_raw = gr.Textbox(label="转写原文", lines=6)
|
||||
pipe_polished = gr.Textbox(label="润色稿", lines=6)
|
||||
pipe_output = gr.Audio(label="成品配音", type="filepath")
|
||||
|
||||
def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> tuple[str, dict]:
|
||||
if manual_raw and manual_raw.strip():
|
||||
return "⏳ 全流程运行中(识别/润色/合成),请稍候…", gr.update(value=None)
|
||||
if skip_polish:
|
||||
return "⏳ 全流程运行中(识别→合成),请稍候…", gr.update(value=None)
|
||||
return "⏳ 全流程运行中(识别→润色→合成),请稍候…", gr.update(value=None)
|
||||
|
||||
pipeline_btn.click(
|
||||
ui_pipeline_pending,
|
||||
[skip_polish_cb, pipe_manual],
|
||||
[pipeline_log, pipe_output],
|
||||
queue=True,
|
||||
).then(
|
||||
ui_full_pipeline,
|
||||
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
|
||||
[pipe_raw, pipe_polished, pipe_output, pipeline_log],
|
||||
queue=True,
|
||||
with gr.Accordion("📂 配音历史(本地保留,可随时试听下载)", open=True):
|
||||
with gr.Row():
|
||||
history_select = gr.Dropdown(
|
||||
label="历史配音",
|
||||
choices=list_voice_history(),
|
||||
value=None,
|
||||
interactive=True,
|
||||
scale=4,
|
||||
)
|
||||
history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
|
||||
history_player = gr.Audio(
|
||||
label="历史试听 / 下载",
|
||||
type="filepath",
|
||||
interactive=False,
|
||||
show_download_button=True,
|
||||
show_share_button=False,
|
||||
elem_classes=["tts-output-audio"],
|
||||
)
|
||||
|
||||
history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
|
||||
history_select.change(ui_history_play, history_select, history_player)
|
||||
demo.load(ui_initial_history, outputs=[history_select, history_player])
|
||||
|
||||
def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> str:
|
||||
if manual_raw and manual_raw.strip():
|
||||
return "⏳ 全流程运行中(识别/润色/合成),请稍候,勿刷新页面…"
|
||||
if skip_polish:
|
||||
return "⏳ 全流程运行中(识别→合成),请稍候,勿刷新页面…"
|
||||
return "⏳ 全流程运行中(识别→润色→合成),请稍候,勿刷新页面…"
|
||||
|
||||
pipeline_btn.click(
|
||||
ui_pipeline_pending,
|
||||
[skip_polish_cb, pipe_manual],
|
||||
[pipeline_log],
|
||||
queue=True,
|
||||
).then(
|
||||
ui_full_pipeline,
|
||||
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
|
||||
[pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player],
|
||||
queue=True,
|
||||
)
|
||||
synth_btn.click(
|
||||
ui_synth_pending,
|
||||
[polished_text],
|
||||
[synth_log],
|
||||
queue=True,
|
||||
).then(
|
||||
ui_synthesize,
|
||||
[polished_text, tts_voice],
|
||||
[synth_log, output_audio, history_select, history_player],
|
||||
queue=True,
|
||||
)
|
||||
|
||||
demo.queue(default_concurrency_limit=1)
|
||||
return demo
|
||||
|
||||
Reference in New Issue
Block a user