Add playback speed control for generated voiceovers

Replace Gradio Audio output with an HTML player that supports play/pause and a 0.5x-2.0x speed slider, plus direct /outputs WAV download.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
dekun
2026-06-12 18:53:35 +08:00
parent 2dd642598f
commit 1acba0349c
+141 -57
View File
@@ -5,13 +5,13 @@ Gradio Web 中控:音色锁定 → Whisper 识别 → Gemma4 润色 → ChatTT
from __future__ import annotations from __future__ import annotations
import inspect
import logging import logging
import re import re
import shutil import shutil
import sys import sys
import uuid import uuid
from pathlib import Path from pathlib import Path
from urllib.parse import quote
import gradio as gr import gradio as gr
@@ -20,6 +20,7 @@ from config import (
HOST, HOST,
MODEL_NAME, MODEL_NAME,
OLLAMA_URL, OLLAMA_URL,
OUTPUT_DIR,
PORT, PORT,
SPEAKER_EMB_PATH, SPEAKER_EMB_PATH,
UPLOAD_DIR, UPLOAD_DIR,
@@ -60,35 +61,56 @@ def ui_history_dropdown(select_path: str | None = None) -> dict:
return gr.update(choices=choices, value=value) return gr.update(choices=choices, value=value)
def ui_history_play(filepath: str | None) -> dict: def _voice_player_html(wav_path: str | None) -> str:
"""带播放控件与语速滑块的 HTML 播放器(语速仅影响试听,不改变 WAV 文件)。"""
if not wav_path:
return (
'<div class="tts-player-wrap tts-player-empty">'
"<p>合成完成后可在此试听,拖动下方滑块调节播放语速(0.5x~2.0x)。</p>"
"</div>"
)
path = Path(wav_path)
if not path.is_file():
return (
'<div class="tts-player-wrap tts-player-empty">'
"<p>音频文件不存在,请重新合成或刷新历史列表。</p>"
"</div>"
)
name = path.name
src = f"/outputs/{quote(name)}"
return f"""
<div class="tts-player-wrap">
<div class="tts-player-title">🎧 {name}</div>
<audio class="tts-audio-el" controls preload="metadata" src="{src}"></audio>
<div class="tts-speed-row">
<span class="tts-speed-label-text">播放语速</span>
<input type="range" class="tts-speed-slider" min="0.5" max="2.0" step="0.05" value="1"
aria-label="播放语速"
oninput="(function(el){{var w=el.closest('.tts-player-wrap'); var a=w&&w.querySelector('audio'); if(a){{a.playbackRate=parseFloat(el.value);}} var s=w&&w.querySelector('.tts-speed-val'); if(s){{s.textContent=parseFloat(el.value).toFixed(2)+'x';}}}})(this)">
<span class="tts-speed-val">1.00x</span>
<a class="tts-dl-btn" href="{src}" download="{name}">⬇ 下载 WAV</a>
</div>
<p class="tts-player-tip">语速仅用于试听,下载的 WAV 仍为原速。</p>
</div>
"""
def ui_history_play(filepath: str | None) -> str:
"""选中历史条目后加载播放器。""" """选中历史条目后加载播放器。"""
if filepath and Path(filepath).is_file(): return _voice_player_html(filepath)
return gr.update(value=filepath)
return gr.update(value=None)
def ui_initial_history() -> tuple[dict, dict]: def ui_initial_history() -> tuple[dict, str]:
"""首屏加载历史列表并自动选中最新一条。""" """首屏加载历史列表并自动选中最新一条。"""
choices = list_voice_history() choices = list_voice_history()
paths = [p for _, p in choices] paths = [p for _, p in choices]
latest = paths[0] if paths else None latest = paths[0] if paths else None
return gr.update(choices=choices, value=latest), ui_history_play(latest) return gr.update(choices=choices, value=latest), _voice_player_html(latest)
def _tts_output_audio(label: str) -> gr.Audio: def _voice_player_block() -> gr.HTML:
"""成品播放器:兼容 Gradio 4.x(无 show_download_button 等参数)""" """创建成品配音 HTML 播放器区域"""
kwargs: dict = { return gr.HTML(value=_voice_player_html(None), elem_classes=["tts-player-block"])
"label": label,
"type": "filepath",
"interactive": False,
"elem_classes": ["tts-output-audio"],
}
params = inspect.signature(gr.Audio.__init__).parameters
if "show_download_button" in params:
kwargs["show_download_button"] = True
if "show_share_button" in params:
kwargs["show_share_button"] = False
return gr.Audio(**kwargs)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -181,8 +203,8 @@ def _short_synth_log(msg: str, ok: bool) -> str:
segs = re.search(r"\s*(\d+)\s*段", msg) segs = re.search(r"\s*(\d+)\s*段", msg)
if chars: if chars:
seg_note = f"{segs.group(1)} 段拼接" if segs else "" seg_note = f"{segs.group(1)} 段拼接" if segs else ""
return f"✅ 配音完成({chars.group(1)}{seg_note})。请用下方播放器试听、下载。" return f"✅ 配音完成({chars.group(1)}{seg_note})。请用下方播放器试听、调节语速或下载。"
return "✅ 配音完成。请用下方播放器试听、下载。" return "✅ 配音完成。请用下方播放器试听、调节语速或下载。"
def ui_synth_pending(polished_text: str) -> str: def ui_synth_pending(polished_text: str) -> str:
@@ -196,14 +218,14 @@ def ui_synth_pending(polished_text: str) -> str:
) )
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]: def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, str, dict, str]:
"""【TTS 合成】生成最终 wav 配音文件。""" """【TTS 合成】生成最终 wav 配音文件。"""
if not polished_text or not polished_text.strip(): if not polished_text or not polished_text.strip():
return ( return (
"请先完成 Gemma4 润色。", "请先完成 Gemma4 润色。",
gr.update(value=None), _voice_player_html(None),
ui_history_dropdown(), ui_history_dropdown(),
gr.update(value=None), _voice_player_html(None),
) )
voice_id = label_to_voice_id(voice_label) voice_id = label_to_voice_id(voice_label)
@@ -211,15 +233,15 @@ def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict
if ok: if ok:
return ( return (
_short_synth_log(msg, ok), _short_synth_log(msg, ok),
gr.update(value=wav_path), _voice_player_html(wav_path),
ui_history_dropdown(wav_path), ui_history_dropdown(wav_path),
gr.update(value=wav_path), _voice_player_html(wav_path),
) )
return ( return (
_short_synth_log(msg, ok), _short_synth_log(msg, ok),
gr.update(value=None), _voice_player_html(None),
ui_history_dropdown(), ui_history_dropdown(),
gr.update(value=None), _voice_player_html(None),
) )
@@ -231,7 +253,7 @@ def ui_full_pipeline(
skip_polish: bool, skip_polish: bool,
manual_raw: str, manual_raw: str,
voice_label: str, voice_label: str,
) -> tuple[str, str, dict, str, dict, dict]: ) -> tuple[str, str, str, str, dict, str]:
""" """
串联执行:识别 → 润色(可跳过)→ 合成。 串联执行:识别 → 润色(可跳过)→ 合成。
返回 (raw, polished, wav_path, log) 返回 (raw, polished, wav_path, log)
@@ -245,10 +267,10 @@ def ui_full_pipeline(
else: else:
path = _save_upload(audio_file) path = _save_upload(audio_file)
if not path: if not path:
return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None) return "", "", _voice_player_html(None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), _voice_player_html(None)
ok, result = transcribe_audio(path) ok, result = transcribe_audio(path)
if not ok: if not ok:
return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None) return "", "", _voice_player_html(None), f"❌ 识别失败: {result}", ui_history_dropdown(), _voice_player_html(None)
raw = result raw = result
logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。") logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
@@ -259,7 +281,7 @@ def ui_full_pipeline(
else: else:
ok, result = polish_text(raw) ok, result = polish_text(raw)
if not ok: if not ok:
return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None) return raw, "", _voice_player_html(None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
polished = result polished = result
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。") logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
@@ -267,16 +289,16 @@ def ui_full_pipeline(
voice_id = label_to_voice_id(voice_label) voice_id = label_to_voice_id(voice_label)
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id) ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
if not ok: if not ok:
return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None) return raw, polished, _voice_player_html(None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
logs.append(f"{msg}") logs.append(f"{msg}")
return ( return (
raw, raw,
polished, polished,
gr.update(value=wav_path), _voice_player_html(wav_path),
"\n".join(logs), "\n".join(logs),
ui_history_dropdown(wav_path), ui_history_dropdown(wav_path),
gr.update(value=wav_path), _voice_player_html(wav_path),
) )
@@ -971,24 +993,68 @@ gradio-app,
.gradio-container .waveform-container { .gradio-container .waveform-container {
background: #1a2332 !important; background: #1a2332 !important;
} }
/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */ /* 成品 HTML 播放器:播放 + 语速滑块 */
.gradio-container .tts-output-audio, .tts-player-block { contain: layout style paint; }
.gradio-container .tts-output-audio .audio-container { .tts-player-wrap {
background: #1a2332 !important;
border: 1px solid #374151 !important; border: 1px solid #374151 !important;
background: #1a2332 !important; border-radius: 10px !important;
contain: strict; padding: 14px 16px !important;
min-height: 120px; margin: 8px 0 !important;
} }
.gradio-container .tts-output-audio button, .tts-player-wrap.tts-player-empty {
.gradio-container .tts-output-audio button:focus, color: #94a3b8 !important;
.gradio-container .tts-output-audio button:focus-visible { font-size: 0.92rem !important;
outline: none !important; min-height: 72px;
box-shadow: none !important;
border-color: #4b5563 !important;
} }
.gradio-container .tts-output-audio .wrap, .tts-player-title {
.gradio-container .tts-output-audio .controls { color: #93c5fd !important;
background: #1a2332 !important; font-size: 0.88rem !important;
margin-bottom: 10px !important;
word-break: break-all;
}
.tts-player-wrap audio.tts-audio-el {
width: 100% !important;
height: 44px !important;
margin: 6px 0 12px 0 !important;
border-radius: 6px !important;
}
.tts-speed-row {
display: flex !important;
flex-wrap: wrap !important;
align-items: center !important;
gap: 10px 14px !important;
margin-top: 4px !important;
}
.tts-speed-label-text {
color: #e5e7eb !important;
font-size: 0.9rem !important;
min-width: 64px;
}
.tts-speed-slider {
flex: 1 1 120px !important;
min-width: 120px !important;
max-width: 280px !important;
accent-color: #2563eb !important;
}
.tts-speed-val {
color: #93c5fd !important;
font-weight: 600 !important;
min-width: 48px !important;
}
.tts-dl-btn {
color: #ffffff !important;
background: #374151 !important;
padding: 6px 12px !important;
border-radius: 6px !important;
text-decoration: none !important;
font-size: 0.85rem !important;
}
.tts-dl-btn:hover { background: #4b5563 !important; }
.tts-player-tip {
color: #64748b !important;
font-size: 0.78rem !important;
margin: 10px 0 0 0 !important;
} }
.gradio-container .pipeline-step-card textarea { .gradio-container .pipeline-step-card textarea {
contain: layout style; contain: layout style;
@@ -1124,7 +1190,7 @@ def build_app() -> gr.Blocks:
with gr.Row(elem_classes=["pipeline-output-row"]): with gr.Row(elem_classes=["pipeline-output-row"]):
pipe_raw = gr.Textbox(label="转写原文", lines=6) pipe_raw = gr.Textbox(label="转写原文", lines=6)
pipe_polished = gr.Textbox(label="润色稿", lines=6) pipe_polished = gr.Textbox(label="润色稿", lines=6)
pipe_output = _tts_output_audio("成品配音") pipe_player = _voice_player_block()
# ---- Tab 2: 分步流水线 ---- # ---- Tab 2: 分步流水线 ----
with gr.Tab("分步流水线"): with gr.Tab("分步流水线"):
@@ -1171,7 +1237,7 @@ def build_app() -> gr.Blocks:
) )
synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary") synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False) synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
output_audio = _tts_output_audio("成品配音") output_player = _voice_player_block()
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log]) transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log]) polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
@@ -1262,7 +1328,7 @@ def build_app() -> gr.Blocks:
scale=4, scale=4,
) )
history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100) history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
history_player = _tts_output_audio("历史试听 / 下载") history_player = _voice_player_block()
history_refresh_btn.click(ui_history_dropdown, outputs=[history_select]) history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
history_select.change(ui_history_play, history_select, history_player) history_select.change(ui_history_play, history_select, history_player)
@@ -1283,7 +1349,7 @@ def build_app() -> gr.Blocks:
).then( ).then(
ui_full_pipeline, ui_full_pipeline,
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice], [pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
[pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player], [pipe_raw, pipe_polished, pipe_player, pipeline_log, history_select, history_player],
queue=True, queue=True,
) )
synth_btn.click( synth_btn.click(
@@ -1294,7 +1360,7 @@ def build_app() -> gr.Blocks:
).then( ).then(
ui_synthesize, ui_synthesize,
[polished_text, tts_voice], [polished_text, tts_voice],
[synth_log, output_audio, history_select, history_player], [synth_log, output_player, history_select, history_player],
queue=True, queue=True,
) )
@@ -1335,6 +1401,24 @@ def create_fastapi_app():
headers={"Service-Worker-Allowed": "/"}, headers={"Service-Worker-Allowed": "/"},
) )
@fastapi_app.get("/outputs/{filename}")
async def serve_output_wav(filename: str):
    """Serve a voiceover file from OUTPUT_DIR so the HTML player can load it.

    Args:
        filename: Bare file name taken from the URL path.

    Returns:
        A FileResponse for the file, sent with the audio/wav media type.

    Raises:
        HTTPException: 404 when the name contains "..", "/" or a backslash,
            does not start with "voiceover_", does not end with ".wav", or
            does not refer to an existing file under OUTPUT_DIR.
    """
    from fastapi import HTTPException

    # Reject anything that could step outside OUTPUT_DIR.
    has_path_token = any(token in filename for token in ("..", "/", "\\"))
    # Only generated voiceover WAV files are exposed.
    is_voiceover_wav = filename.startswith("voiceover_") and filename.endswith(".wav")
    if has_path_token or not is_voiceover_wav:
        raise HTTPException(status_code=404)

    wav_file = OUTPUT_DIR / filename
    if wav_file.is_file():
        return FileResponse(wav_file, media_type="audio/wav", filename=filename)
    raise HTTPException(status_code=404)
blocks = build_app() blocks = build_app()
gr.mount_gradio_app( gr.mount_gradio_app(
fastapi_app, fastapi_app,