Add playback speed control for generated voiceovers

Replace Gradio Audio output with an HTML player that supports play/pause and a 0.5x-2.0x speed slider, plus direct /outputs WAV download.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
dekun
2026-06-12 18:53:35 +08:00
parent 2dd642598f
commit 1acba0349c
+141 -57
View File
@@ -5,13 +5,13 @@ Gradio Web 中控:音色锁定 → Whisper 识别 → Gemma4 润色 → ChatTT
from __future__ import annotations
import inspect
import logging
import re
import shutil
import sys
import uuid
from pathlib import Path
from urllib.parse import quote
import gradio as gr
@@ -20,6 +20,7 @@ from config import (
HOST,
MODEL_NAME,
OLLAMA_URL,
OUTPUT_DIR,
PORT,
SPEAKER_EMB_PATH,
UPLOAD_DIR,
@@ -60,35 +61,56 @@ def ui_history_dropdown(select_path: str | None = None) -> dict:
return gr.update(choices=choices, value=value)
def ui_history_play(filepath: str | None) -> dict:
def _voice_player_html(wav_path: str | None) -> str:
"""带播放控件与语速滑块的 HTML 播放器(语速仅影响试听,不改变 WAV 文件)。"""
if not wav_path:
return (
'<div class="tts-player-wrap tts-player-empty">'
"<p>合成完成后可在此试听,拖动下方滑块调节播放语速(0.5x~2.0x)。</p>"
"</div>"
)
path = Path(wav_path)
if not path.is_file():
return (
'<div class="tts-player-wrap tts-player-empty">'
"<p>音频文件不存在,请重新合成或刷新历史列表。</p>"
"</div>"
)
name = path.name
src = f"/outputs/{quote(name)}"
return f"""
<div class="tts-player-wrap">
<div class="tts-player-title">🎧 {name}</div>
<audio class="tts-audio-el" controls preload="metadata" src="{src}"></audio>
<div class="tts-speed-row">
<span class="tts-speed-label-text">播放语速</span>
<input type="range" class="tts-speed-slider" min="0.5" max="2.0" step="0.05" value="1"
aria-label="播放语速"
oninput="(function(el){{var w=el.closest('.tts-player-wrap'); var a=w&&w.querySelector('audio'); if(a){{a.playbackRate=parseFloat(el.value);}} var s=w&&w.querySelector('.tts-speed-val'); if(s){{s.textContent=parseFloat(el.value).toFixed(2)+'x';}}}})(this)">
<span class="tts-speed-val">1.00x</span>
<a class="tts-dl-btn" href="{src}" download="{name}">⬇ 下载 WAV</a>
</div>
<p class="tts-player-tip">语速仅用于试听,下载的 WAV 仍为原速。</p>
</div>
"""
def ui_history_play(filepath: str | None) -> str:
"""选中历史条目后加载播放器。"""
if filepath and Path(filepath).is_file():
return gr.update(value=filepath)
return gr.update(value=None)
return _voice_player_html(filepath)
def ui_initial_history() -> tuple[dict, dict]:
def ui_initial_history() -> tuple[dict, str]:
"""首屏加载历史列表并自动选中最新一条。"""
choices = list_voice_history()
paths = [p for _, p in choices]
latest = paths[0] if paths else None
return gr.update(choices=choices, value=latest), ui_history_play(latest)
return gr.update(choices=choices, value=latest), _voice_player_html(latest)
def _tts_output_audio(label: str) -> gr.Audio:
"""成品播放器:兼容 Gradio 4.x(无 show_download_button 等参数)"""
kwargs: dict = {
"label": label,
"type": "filepath",
"interactive": False,
"elem_classes": ["tts-output-audio"],
}
params = inspect.signature(gr.Audio.__init__).parameters
if "show_download_button" in params:
kwargs["show_download_button"] = True
if "show_share_button" in params:
kwargs["show_share_button"] = False
return gr.Audio(**kwargs)
def _voice_player_block() -> gr.HTML:
    """Create the HTML component that hosts the finished-voiceover player.

    The component starts out showing the placeholder markup produced by
    ``_voice_player_html(None)`` and carries the ``tts-player-block`` class.
    """
    placeholder_markup = _voice_player_html(None)
    return gr.HTML(value=placeholder_markup, elem_classes=["tts-player-block"])
# ---------------------------------------------------------------------------
@@ -181,8 +203,8 @@ def _short_synth_log(msg: str, ok: bool) -> str:
segs = re.search(r"\s*(\d+)\s*段", msg)
if chars:
seg_note = f"{segs.group(1)} 段拼接" if segs else ""
return f"✅ 配音完成({chars.group(1)}{seg_note})。请用下方播放器试听、下载。"
return "✅ 配音完成。请用下方播放器试听、下载。"
return f"✅ 配音完成({chars.group(1)}{seg_note})。请用下方播放器试听、调节语速或下载。"
return "✅ 配音完成。请用下方播放器试听、调节语速或下载。"
def ui_synth_pending(polished_text: str) -> str:
@@ -196,14 +218,14 @@ def ui_synth_pending(polished_text: str) -> str:
)
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]:
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, str, dict, str]:
"""【TTS 合成】生成最终 wav 配音文件。"""
if not polished_text or not polished_text.strip():
return (
"请先完成 Gemma4 润色。",
gr.update(value=None),
_voice_player_html(None),
ui_history_dropdown(),
gr.update(value=None),
_voice_player_html(None),
)
voice_id = label_to_voice_id(voice_label)
@@ -211,15 +233,15 @@ def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict
if ok:
return (
_short_synth_log(msg, ok),
gr.update(value=wav_path),
_voice_player_html(wav_path),
ui_history_dropdown(wav_path),
gr.update(value=wav_path),
_voice_player_html(wav_path),
)
return (
_short_synth_log(msg, ok),
gr.update(value=None),
_voice_player_html(None),
ui_history_dropdown(),
gr.update(value=None),
_voice_player_html(None),
)
@@ -231,7 +253,7 @@ def ui_full_pipeline(
skip_polish: bool,
manual_raw: str,
voice_label: str,
) -> tuple[str, str, dict, str, dict, dict]:
) -> tuple[str, str, str, str, dict, str]:
"""
串联执行:识别 → 润色(可跳过)→ 合成。
返回 (raw, polished, wav_path, log)
@@ -245,10 +267,10 @@ def ui_full_pipeline(
else:
path = _save_upload(audio_file)
if not path:
return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None)
return "", "", _voice_player_html(None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), _voice_player_html(None)
ok, result = transcribe_audio(path)
if not ok:
return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None)
return "", "", _voice_player_html(None), f"❌ 识别失败: {result}", ui_history_dropdown(), _voice_player_html(None)
raw = result
logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
@@ -259,7 +281,7 @@ def ui_full_pipeline(
else:
ok, result = polish_text(raw)
if not ok:
return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
return raw, "", _voice_player_html(None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
polished = result
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
@@ -267,16 +289,16 @@ def ui_full_pipeline(
voice_id = label_to_voice_id(voice_label)
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
if not ok:
return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
return raw, polished, _voice_player_html(None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
logs.append(f"{msg}")
return (
raw,
polished,
gr.update(value=wav_path),
_voice_player_html(wav_path),
"\n".join(logs),
ui_history_dropdown(wav_path),
gr.update(value=wav_path),
_voice_player_html(wav_path),
)
@@ -971,24 +993,68 @@ gradio-app,
.gradio-container .waveform-container {
background: #1a2332 !important;
}
/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */
.gradio-container .tts-output-audio,
.gradio-container .tts-output-audio .audio-container {
/* 成品 HTML 播放器:播放 + 语速滑块 */
.tts-player-block { contain: layout style paint; }
.tts-player-wrap {
background: #1a2332 !important;
border: 1px solid #374151 !important;
background: #1a2332 !important;
contain: strict;
min-height: 120px;
border-radius: 10px !important;
padding: 14px 16px !important;
margin: 8px 0 !important;
}
.gradio-container .tts-output-audio button,
.gradio-container .tts-output-audio button:focus,
.gradio-container .tts-output-audio button:focus-visible {
outline: none !important;
box-shadow: none !important;
border-color: #4b5563 !important;
.tts-player-wrap.tts-player-empty {
color: #94a3b8 !important;
font-size: 0.92rem !important;
min-height: 72px;
}
.gradio-container .tts-output-audio .wrap,
.gradio-container .tts-output-audio .controls {
background: #1a2332 !important;
.tts-player-title {
color: #93c5fd !important;
font-size: 0.88rem !important;
margin-bottom: 10px !important;
word-break: break-all;
}
.tts-player-wrap audio.tts-audio-el {
width: 100% !important;
height: 44px !important;
margin: 6px 0 12px 0 !important;
border-radius: 6px !important;
}
.tts-speed-row {
display: flex !important;
flex-wrap: wrap !important;
align-items: center !important;
gap: 10px 14px !important;
margin-top: 4px !important;
}
.tts-speed-label-text {
color: #e5e7eb !important;
font-size: 0.9rem !important;
min-width: 64px;
}
.tts-speed-slider {
flex: 1 1 120px !important;
min-width: 120px !important;
max-width: 280px !important;
accent-color: #2563eb !important;
}
.tts-speed-val {
color: #93c5fd !important;
font-weight: 600 !important;
min-width: 48px !important;
}
.tts-dl-btn {
color: #ffffff !important;
background: #374151 !important;
padding: 6px 12px !important;
border-radius: 6px !important;
text-decoration: none !important;
font-size: 0.85rem !important;
}
.tts-dl-btn:hover { background: #4b5563 !important; }
.tts-player-tip {
color: #64748b !important;
font-size: 0.78rem !important;
margin: 10px 0 0 0 !important;
}
.gradio-container .pipeline-step-card textarea {
contain: layout style;
@@ -1124,7 +1190,7 @@ def build_app() -> gr.Blocks:
with gr.Row(elem_classes=["pipeline-output-row"]):
pipe_raw = gr.Textbox(label="转写原文", lines=6)
pipe_polished = gr.Textbox(label="润色稿", lines=6)
pipe_output = _tts_output_audio("成品配音")
pipe_player = _voice_player_block()
# ---- Tab 2: 分步流水线 ----
with gr.Tab("分步流水线"):
@@ -1171,7 +1237,7 @@ def build_app() -> gr.Blocks:
)
synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
output_audio = _tts_output_audio("成品配音")
output_player = _voice_player_block()
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
@@ -1262,7 +1328,7 @@ def build_app() -> gr.Blocks:
scale=4,
)
history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
history_player = _tts_output_audio("历史试听 / 下载")
history_player = _voice_player_block()
history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
history_select.change(ui_history_play, history_select, history_player)
@@ -1283,7 +1349,7 @@ def build_app() -> gr.Blocks:
).then(
ui_full_pipeline,
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
[pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player],
[pipe_raw, pipe_polished, pipe_player, pipeline_log, history_select, history_player],
queue=True,
)
synth_btn.click(
@@ -1294,7 +1360,7 @@ def build_app() -> gr.Blocks:
).then(
ui_synthesize,
[polished_text, tts_voice],
[synth_log, output_audio, history_select, history_player],
[synth_log, output_player, history_select, history_player],
queue=True,
)
@@ -1335,6 +1401,24 @@ def create_fastapi_app():
headers={"Service-Worker-Allowed": "/"},
)
@fastapi_app.get("/outputs/{filename}")
async def serve_output_wav(filename: str):
    """Serve a voiceover WAV from ``OUTPUT_DIR`` so the HTML player can load it.

    Only bare file names of the form ``voiceover_*.wav`` are accepted. Names
    containing ``..``, ``/`` or a backslash, names with a different prefix or
    suffix, and names that do not resolve to an existing file all yield a 404.
    """
    from fastapi import HTTPException

    # Any of these tokens could make the name address something outside
    # OUTPUT_DIR, so they are refused outright.
    has_path_tokens = any(token in filename for token in ("..", "/", "\\"))
    is_voiceover_wav = filename.startswith("voiceover_") and filename.endswith(".wav")
    if has_path_tokens or not is_voiceover_wav:
        raise HTTPException(status_code=404)

    wav_file = OUTPUT_DIR / filename
    if wav_file.is_file():
        return FileResponse(wav_file, media_type="audio/wav", filename=filename)
    raise HTTPException(status_code=404)
blocks = build_app()
gr.mount_gradio_app(
fastapi_app,