Add playback speed control for generated voiceovers
Replace Gradio Audio output with an HTML player that supports play/pause and a 0.5x-2.0x speed slider, plus direct /outputs WAV download. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -5,13 +5,13 @@ Gradio Web 中控:音色锁定 → Whisper 识别 → Gemma4 润色 → ChatTT
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
import gradio as gr
|
||||
|
||||
@@ -20,6 +20,7 @@ from config import (
|
||||
HOST,
|
||||
MODEL_NAME,
|
||||
OLLAMA_URL,
|
||||
OUTPUT_DIR,
|
||||
PORT,
|
||||
SPEAKER_EMB_PATH,
|
||||
UPLOAD_DIR,
|
||||
@@ -60,35 +61,56 @@ def ui_history_dropdown(select_path: str | None = None) -> dict:
|
||||
return gr.update(choices=choices, value=value)
|
||||
|
||||
|
||||
def ui_history_play(filepath: str | None) -> dict:
|
||||
def _voice_player_html(wav_path: str | None) -> str:
    """Build the HTML player (audio controls plus playback-speed slider).

    The slider only sets the browser-side ``playbackRate`` of the ``<audio>``
    element; the WAV file itself is never modified.

    Args:
        wav_path: Path of the WAV file to play. ``None`` or an empty string
            yields the empty-state placeholder.

    Returns:
        An HTML fragment: the placeholder when no path is given, a
        "file missing" notice when the path is not an existing file, and
        otherwise the full player whose audio source and download link point
        at ``/outputs/<file name>``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not wav_path:
        return (
            '<div class="tts-player-wrap tts-player-empty">'
            "<p>合成完成后可在此试听,拖动下方滑块调节播放语速(0.5x~2.0x)。</p>"
            "</div>"
        )
    path = Path(wav_path)
    if not path.is_file():
        return (
            '<div class="tts-player-wrap tts-player-empty">'
            "<p>音频文件不存在,请重新合成或刷新历史列表。</p>"
            "</div>"
        )
    name = path.name
    # quote() percent-encodes every HTML-significant character, so the URL is
    # already safe inside an attribute value.
    src = f"/outputs/{quote(name)}"
    # The raw name is shown as text and reused in the download attribute;
    # escape it so characters such as & < > " ' cannot break the markup.
    safe_name = escape(name, quote=True)
    return f"""
<div class="tts-player-wrap">
<div class="tts-player-title">🎧 {safe_name}</div>
<audio class="tts-audio-el" controls preload="metadata" src="{src}"></audio>
<div class="tts-speed-row">
<span class="tts-speed-label-text">播放语速</span>
<input type="range" class="tts-speed-slider" min="0.5" max="2.0" step="0.05" value="1"
aria-label="播放语速"
oninput="(function(el){{var w=el.closest('.tts-player-wrap'); var a=w&&w.querySelector('audio'); if(a){{a.playbackRate=parseFloat(el.value);}} var s=w&&w.querySelector('.tts-speed-val'); if(s){{s.textContent=parseFloat(el.value).toFixed(2)+'x';}}}})(this)">
<span class="tts-speed-val">1.00x</span>
<a class="tts-dl-btn" href="{src}" download="{safe_name}">⬇ 下载 WAV</a>
</div>
<p class="tts-player-tip">语速仅用于试听,下载的 WAV 仍为原速。</p>
</div>
"""
|
||||
|
||||
|
||||
def ui_history_play(filepath: str | None) -> str:
    """Render the player for the history entry that was just selected.

    Args:
        filepath: Path of the selected WAV file, or ``None`` when nothing is
            selected.

    Returns:
        The HTML produced by ``_voice_player_html`` for ``filepath``. That
        helper handles the missing-path and missing-file cases itself, so no
        extra existence check is needed here.
    """
    return _voice_player_html(filepath)
|
||||
|
||||
|
||||
def ui_initial_history() -> tuple[dict, str]:
    """Load the history list on first render and preselect its first entry.

    Returns:
        A pair of (dropdown update, player HTML). The dropdown update carries
        the choices from ``list_voice_history()`` and selects the path of the
        first one (presumably the newest; ordering is decided by
        ``list_voice_history`` and should be confirmed there). With no
        history, nothing is selected and the player shows its placeholder.
    """
    choices = list_voice_history()
    paths = [p for _, p in choices]
    latest = paths[0] if paths else None
    return gr.update(choices=choices, value=latest), _voice_player_html(latest)
|
||||
|
||||
|
||||
def _tts_output_audio(label: str) -> gr.Audio:
    """Create the read-only audio component used for finished voiceovers.

    ``show_download_button`` and ``show_share_button`` are passed only when
    the installed ``gr.Audio.__init__`` accepts them, so the call works on
    Gradio versions that lack those parameters.

    Args:
        label: Label displayed on the component.

    Returns:
        A non-interactive ``gr.Audio`` of type ``"filepath"`` carrying the
        ``tts-output-audio`` CSS class.
    """
    accepted = inspect.signature(gr.Audio.__init__).parameters
    # Optional keyword -> value to use when this Gradio version supports it.
    optional_flags = {
        "show_download_button": True,
        "show_share_button": False,
    }
    options: dict = {
        "label": label,
        "type": "filepath",
        "interactive": False,
        "elem_classes": ["tts-output-audio"],
    }
    options.update(
        {key: flag for key, flag in optional_flags.items() if key in accepted}
    )
    return gr.Audio(**options)
|
||||
def _voice_player_block() -> gr.HTML:
    """Create the HTML component that hosts the finished-voiceover player.

    Returns:
        A ``gr.HTML`` carrying the ``tts-player-block`` CSS class, initially
        showing the markup ``_voice_player_html`` produces for ``None``.
    """
    placeholder = _voice_player_html(None)
    return gr.HTML(value=placeholder, elem_classes=["tts-player-block"])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -181,8 +203,8 @@ def _short_synth_log(msg: str, ok: bool) -> str:
|
||||
segs = re.search(r"共\s*(\d+)\s*段", msg)
|
||||
if chars:
|
||||
seg_note = f",{segs.group(1)} 段拼接" if segs else ""
|
||||
return f"✅ 配音完成({chars.group(1)} 字{seg_note})。请用下方播放器试听、下载。"
|
||||
return "✅ 配音完成。请用下方播放器试听、下载。"
|
||||
return f"✅ 配音完成({chars.group(1)} 字{seg_note})。请用下方播放器试听、调节语速或下载。"
|
||||
return "✅ 配音完成。请用下方播放器试听、调节语速或下载。"
|
||||
|
||||
|
||||
def ui_synth_pending(polished_text: str) -> str:
|
||||
@@ -196,14 +218,14 @@ def ui_synth_pending(polished_text: str) -> str:
|
||||
)
|
||||
|
||||
|
||||
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]:
|
||||
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, str, dict, str]:
|
||||
"""【TTS 合成】生成最终 wav 配音文件。"""
|
||||
if not polished_text or not polished_text.strip():
|
||||
return (
|
||||
"请先完成 Gemma4 润色。",
|
||||
gr.update(value=None),
|
||||
_voice_player_html(None),
|
||||
ui_history_dropdown(),
|
||||
gr.update(value=None),
|
||||
_voice_player_html(None),
|
||||
)
|
||||
|
||||
voice_id = label_to_voice_id(voice_label)
|
||||
@@ -211,15 +233,15 @@ def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict
|
||||
if ok:
|
||||
return (
|
||||
_short_synth_log(msg, ok),
|
||||
gr.update(value=wav_path),
|
||||
_voice_player_html(wav_path),
|
||||
ui_history_dropdown(wav_path),
|
||||
gr.update(value=wav_path),
|
||||
_voice_player_html(wav_path),
|
||||
)
|
||||
return (
|
||||
_short_synth_log(msg, ok),
|
||||
gr.update(value=None),
|
||||
_voice_player_html(None),
|
||||
ui_history_dropdown(),
|
||||
gr.update(value=None),
|
||||
_voice_player_html(None),
|
||||
)
|
||||
|
||||
|
||||
@@ -231,7 +253,7 @@ def ui_full_pipeline(
|
||||
skip_polish: bool,
|
||||
manual_raw: str,
|
||||
voice_label: str,
|
||||
) -> tuple[str, str, dict, str, dict, dict]:
|
||||
) -> tuple[str, str, str, str, dict, str]:
|
||||
"""
|
||||
串联执行:识别 → 润色(可跳过)→ 合成。
|
||||
返回 (raw, polished, wav_path, log)
|
||||
@@ -245,10 +267,10 @@ def ui_full_pipeline(
|
||||
else:
|
||||
path = _save_upload(audio_file)
|
||||
if not path:
|
||||
return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None)
|
||||
return "", "", _voice_player_html(None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), _voice_player_html(None)
|
||||
ok, result = transcribe_audio(path)
|
||||
if not ok:
|
||||
return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None)
|
||||
return "", "", _voice_player_html(None), f"❌ 识别失败: {result}", ui_history_dropdown(), _voice_player_html(None)
|
||||
raw = result
|
||||
logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
|
||||
|
||||
@@ -259,7 +281,7 @@ def ui_full_pipeline(
|
||||
else:
|
||||
ok, result = polish_text(raw)
|
||||
if not ok:
|
||||
return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
|
||||
return raw, "", _voice_player_html(None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
|
||||
polished = result
|
||||
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
|
||||
|
||||
@@ -267,16 +289,16 @@ def ui_full_pipeline(
|
||||
voice_id = label_to_voice_id(voice_label)
|
||||
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
|
||||
if not ok:
|
||||
return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
|
||||
return raw, polished, _voice_player_html(None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
|
||||
|
||||
logs.append(f"✅ {msg}")
|
||||
return (
|
||||
raw,
|
||||
polished,
|
||||
gr.update(value=wav_path),
|
||||
_voice_player_html(wav_path),
|
||||
"\n".join(logs),
|
||||
ui_history_dropdown(wav_path),
|
||||
gr.update(value=wav_path),
|
||||
_voice_player_html(wav_path),
|
||||
)
|
||||
|
||||
|
||||
@@ -971,24 +993,68 @@ gradio-app,
|
||||
.gradio-container .waveform-container {
|
||||
background: #1a2332 !important;
|
||||
}
|
||||
/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */
|
||||
.gradio-container .tts-output-audio,
|
||||
.gradio-container .tts-output-audio .audio-container {
|
||||
/* 成品 HTML 播放器:播放 + 语速滑块 */
|
||||
.tts-player-block { contain: layout style paint; }
|
||||
.tts-player-wrap {
|
||||
background: #1a2332 !important;
|
||||
border: 1px solid #374151 !important;
|
||||
background: #1a2332 !important;
|
||||
contain: strict;
|
||||
min-height: 120px;
|
||||
border-radius: 10px !important;
|
||||
padding: 14px 16px !important;
|
||||
margin: 8px 0 !important;
|
||||
}
|
||||
.gradio-container .tts-output-audio button,
|
||||
.gradio-container .tts-output-audio button:focus,
|
||||
.gradio-container .tts-output-audio button:focus-visible {
|
||||
outline: none !important;
|
||||
box-shadow: none !important;
|
||||
border-color: #4b5563 !important;
|
||||
.tts-player-wrap.tts-player-empty {
|
||||
color: #94a3b8 !important;
|
||||
font-size: 0.92rem !important;
|
||||
min-height: 72px;
|
||||
}
|
||||
.gradio-container .tts-output-audio .wrap,
|
||||
.gradio-container .tts-output-audio .controls {
|
||||
background: #1a2332 !important;
|
||||
.tts-player-title {
|
||||
color: #93c5fd !important;
|
||||
font-size: 0.88rem !important;
|
||||
margin-bottom: 10px !important;
|
||||
word-break: break-all;
|
||||
}
|
||||
.tts-player-wrap audio.tts-audio-el {
|
||||
width: 100% !important;
|
||||
height: 44px !important;
|
||||
margin: 6px 0 12px 0 !important;
|
||||
border-radius: 6px !important;
|
||||
}
|
||||
.tts-speed-row {
|
||||
display: flex !important;
|
||||
flex-wrap: wrap !important;
|
||||
align-items: center !important;
|
||||
gap: 10px 14px !important;
|
||||
margin-top: 4px !important;
|
||||
}
|
||||
.tts-speed-label-text {
|
||||
color: #e5e7eb !important;
|
||||
font-size: 0.9rem !important;
|
||||
min-width: 64px;
|
||||
}
|
||||
.tts-speed-slider {
|
||||
flex: 1 1 120px !important;
|
||||
min-width: 120px !important;
|
||||
max-width: 280px !important;
|
||||
accent-color: #2563eb !important;
|
||||
}
|
||||
.tts-speed-val {
|
||||
color: #93c5fd !important;
|
||||
font-weight: 600 !important;
|
||||
min-width: 48px !important;
|
||||
}
|
||||
.tts-dl-btn {
|
||||
color: #ffffff !important;
|
||||
background: #374151 !important;
|
||||
padding: 6px 12px !important;
|
||||
border-radius: 6px !important;
|
||||
text-decoration: none !important;
|
||||
font-size: 0.85rem !important;
|
||||
}
|
||||
.tts-dl-btn:hover { background: #4b5563 !important; }
|
||||
.tts-player-tip {
|
||||
color: #64748b !important;
|
||||
font-size: 0.78rem !important;
|
||||
margin: 10px 0 0 0 !important;
|
||||
}
|
||||
.gradio-container .pipeline-step-card textarea {
|
||||
contain: layout style;
|
||||
@@ -1124,7 +1190,7 @@ def build_app() -> gr.Blocks:
|
||||
with gr.Row(elem_classes=["pipeline-output-row"]):
|
||||
pipe_raw = gr.Textbox(label="转写原文", lines=6)
|
||||
pipe_polished = gr.Textbox(label="润色稿", lines=6)
|
||||
pipe_output = _tts_output_audio("成品配音")
|
||||
pipe_player = _voice_player_block()
|
||||
|
||||
# ---- Tab 2: 分步流水线 ----
|
||||
with gr.Tab("分步流水线"):
|
||||
@@ -1171,7 +1237,7 @@ def build_app() -> gr.Blocks:
|
||||
)
|
||||
synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
|
||||
synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
|
||||
output_audio = _tts_output_audio("成品配音")
|
||||
output_player = _voice_player_block()
|
||||
|
||||
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
|
||||
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
|
||||
@@ -1262,7 +1328,7 @@ def build_app() -> gr.Blocks:
|
||||
scale=4,
|
||||
)
|
||||
history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
|
||||
history_player = _tts_output_audio("历史试听 / 下载")
|
||||
history_player = _voice_player_block()
|
||||
|
||||
history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
|
||||
history_select.change(ui_history_play, history_select, history_player)
|
||||
@@ -1283,7 +1349,7 @@ def build_app() -> gr.Blocks:
|
||||
).then(
|
||||
ui_full_pipeline,
|
||||
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
|
||||
[pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player],
|
||||
[pipe_raw, pipe_polished, pipe_player, pipeline_log, history_select, history_player],
|
||||
queue=True,
|
||||
)
|
||||
synth_btn.click(
|
||||
@@ -1294,7 +1360,7 @@ def build_app() -> gr.Blocks:
|
||||
).then(
|
||||
ui_synthesize,
|
||||
[polished_text, tts_voice],
|
||||
[synth_log, output_audio, history_select, history_player],
|
||||
[synth_log, output_player, history_select, history_player],
|
||||
queue=True,
|
||||
)
|
||||
|
||||
@@ -1335,6 +1401,24 @@ def create_fastapi_app():
|
||||
headers={"Service-Worker-Allowed": "/"},
|
||||
)
|
||||
|
||||
@fastapi_app.get("/outputs/{filename}")
async def serve_output_wav(filename: str):
    """Serve a voiceover file from OUTPUT_DIR so the HTML player can load it.

    Args:
        filename: Bare file name taken from the URL path.

    Returns:
        A ``FileResponse`` with media type ``audio/wav``.

    Raises:
        HTTPException: 404 when the name contains ``..``, ``/`` or ``\\``,
            does not start with ``voiceover_``, does not end with ``.wav``,
            or does not refer to an existing file under ``OUTPUT_DIR``.
    """
    from fastapi import HTTPException

    # Reject anything that could escape OUTPUT_DIR.
    has_path_tokens = any(token in filename for token in ("..", "/", "\\"))
    # Only files matching the voiceover naming pattern are exposed.
    looks_like_voiceover = filename.startswith("voiceover_") and filename.endswith(".wav")
    if has_path_tokens or not looks_like_voiceover:
        raise HTTPException(status_code=404)

    target = OUTPUT_DIR / filename
    if not target.is_file():
        raise HTTPException(status_code=404)
    return FileResponse(target, media_type="audio/wav", filename=filename)
|
||||
|
||||
blocks = build_app()
|
||||
gr.mount_gradio_app(
|
||||
fastapi_app,
|
||||
|
||||
Reference in New Issue
Block a user