Add playback speed control for generated voiceovers
Replace the Gradio Audio output with an HTML player that supports play/pause and a 0.5x–2.0x playback-speed slider, plus direct WAV download via /outputs.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -5,13 +5,13 @@ Gradio Web 中控:音色锁定 → Whisper 识别 → Gemma4 润色 → ChatTT
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import inspect
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import sys
|
import sys
|
||||||
import uuid
|
import uuid
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
|
|
||||||
@@ -20,6 +20,7 @@ from config import (
|
|||||||
HOST,
|
HOST,
|
||||||
MODEL_NAME,
|
MODEL_NAME,
|
||||||
OLLAMA_URL,
|
OLLAMA_URL,
|
||||||
|
OUTPUT_DIR,
|
||||||
PORT,
|
PORT,
|
||||||
SPEAKER_EMB_PATH,
|
SPEAKER_EMB_PATH,
|
||||||
UPLOAD_DIR,
|
UPLOAD_DIR,
|
||||||
@@ -60,35 +61,56 @@ def ui_history_dropdown(select_path: str | None = None) -> dict:
|
|||||||
return gr.update(choices=choices, value=value)
|
return gr.update(choices=choices, value=value)
|
||||||
|
|
||||||
|
|
||||||
def ui_history_play(filepath: str | None) -> dict:
|
def _voice_player_html(wav_path: str | None) -> str:
|
||||||
|
"""带播放控件与语速滑块的 HTML 播放器(语速仅影响试听,不改变 WAV 文件)。"""
|
||||||
|
if not wav_path:
|
||||||
|
return (
|
||||||
|
'<div class="tts-player-wrap tts-player-empty">'
|
||||||
|
"<p>合成完成后可在此试听,拖动下方滑块调节播放语速(0.5x~2.0x)。</p>"
|
||||||
|
"</div>"
|
||||||
|
)
|
||||||
|
path = Path(wav_path)
|
||||||
|
if not path.is_file():
|
||||||
|
return (
|
||||||
|
'<div class="tts-player-wrap tts-player-empty">'
|
||||||
|
"<p>音频文件不存在,请重新合成或刷新历史列表。</p>"
|
||||||
|
"</div>"
|
||||||
|
)
|
||||||
|
name = path.name
|
||||||
|
src = f"/outputs/{quote(name)}"
|
||||||
|
return f"""
|
||||||
|
<div class="tts-player-wrap">
|
||||||
|
<div class="tts-player-title">🎧 {name}</div>
|
||||||
|
<audio class="tts-audio-el" controls preload="metadata" src="{src}"></audio>
|
||||||
|
<div class="tts-speed-row">
|
||||||
|
<span class="tts-speed-label-text">播放语速</span>
|
||||||
|
<input type="range" class="tts-speed-slider" min="0.5" max="2.0" step="0.05" value="1"
|
||||||
|
aria-label="播放语速"
|
||||||
|
oninput="(function(el){{var w=el.closest('.tts-player-wrap'); var a=w&&w.querySelector('audio'); if(a){{a.playbackRate=parseFloat(el.value);}} var s=w&&w.querySelector('.tts-speed-val'); if(s){{s.textContent=parseFloat(el.value).toFixed(2)+'x';}}}})(this)">
|
||||||
|
<span class="tts-speed-val">1.00x</span>
|
||||||
|
<a class="tts-dl-btn" href="{src}" download="{name}">⬇ 下载 WAV</a>
|
||||||
|
</div>
|
||||||
|
<p class="tts-player-tip">语速仅用于试听,下载的 WAV 仍为原速。</p>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def ui_history_play(filepath: str | None) -> str:
|
||||||
"""选中历史条目后加载播放器。"""
|
"""选中历史条目后加载播放器。"""
|
||||||
if filepath and Path(filepath).is_file():
|
return _voice_player_html(filepath)
|
||||||
return gr.update(value=filepath)
|
|
||||||
return gr.update(value=None)
|
|
||||||
|
|
||||||
|
|
||||||
def ui_initial_history() -> tuple[dict, dict]:
|
def ui_initial_history() -> tuple[dict, str]:
|
||||||
"""首屏加载历史列表并自动选中最新一条。"""
|
"""首屏加载历史列表并自动选中最新一条。"""
|
||||||
choices = list_voice_history()
|
choices = list_voice_history()
|
||||||
paths = [p for _, p in choices]
|
paths = [p for _, p in choices]
|
||||||
latest = paths[0] if paths else None
|
latest = paths[0] if paths else None
|
||||||
return gr.update(choices=choices, value=latest), ui_history_play(latest)
|
return gr.update(choices=choices, value=latest), _voice_player_html(latest)
|
||||||
|
|
||||||
|
|
||||||
def _tts_output_audio(label: str) -> gr.Audio:
|
def _voice_player_block() -> gr.HTML:
|
||||||
"""成品播放器:兼容 Gradio 4.x(无 show_download_button 等参数)。"""
|
"""创建成品配音 HTML 播放器区域。"""
|
||||||
kwargs: dict = {
|
return gr.HTML(value=_voice_player_html(None), elem_classes=["tts-player-block"])
|
||||||
"label": label,
|
|
||||||
"type": "filepath",
|
|
||||||
"interactive": False,
|
|
||||||
"elem_classes": ["tts-output-audio"],
|
|
||||||
}
|
|
||||||
params = inspect.signature(gr.Audio.__init__).parameters
|
|
||||||
if "show_download_button" in params:
|
|
||||||
kwargs["show_download_button"] = True
|
|
||||||
if "show_share_button" in params:
|
|
||||||
kwargs["show_share_button"] = False
|
|
||||||
return gr.Audio(**kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -181,8 +203,8 @@ def _short_synth_log(msg: str, ok: bool) -> str:
|
|||||||
segs = re.search(r"共\s*(\d+)\s*段", msg)
|
segs = re.search(r"共\s*(\d+)\s*段", msg)
|
||||||
if chars:
|
if chars:
|
||||||
seg_note = f",{segs.group(1)} 段拼接" if segs else ""
|
seg_note = f",{segs.group(1)} 段拼接" if segs else ""
|
||||||
return f"✅ 配音完成({chars.group(1)} 字{seg_note})。请用下方播放器试听、下载。"
|
return f"✅ 配音完成({chars.group(1)} 字{seg_note})。请用下方播放器试听、调节语速或下载。"
|
||||||
return "✅ 配音完成。请用下方播放器试听、下载。"
|
return "✅ 配音完成。请用下方播放器试听、调节语速或下载。"
|
||||||
|
|
||||||
|
|
||||||
def ui_synth_pending(polished_text: str) -> str:
|
def ui_synth_pending(polished_text: str) -> str:
|
||||||
@@ -196,14 +218,14 @@ def ui_synth_pending(polished_text: str) -> str:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]:
|
def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, str, dict, str]:
|
||||||
"""【TTS 合成】生成最终 wav 配音文件。"""
|
"""【TTS 合成】生成最终 wav 配音文件。"""
|
||||||
if not polished_text or not polished_text.strip():
|
if not polished_text or not polished_text.strip():
|
||||||
return (
|
return (
|
||||||
"请先完成 Gemma4 润色。",
|
"请先完成 Gemma4 润色。",
|
||||||
gr.update(value=None),
|
_voice_player_html(None),
|
||||||
ui_history_dropdown(),
|
ui_history_dropdown(),
|
||||||
gr.update(value=None),
|
_voice_player_html(None),
|
||||||
)
|
)
|
||||||
|
|
||||||
voice_id = label_to_voice_id(voice_label)
|
voice_id = label_to_voice_id(voice_label)
|
||||||
@@ -211,15 +233,15 @@ def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict
|
|||||||
if ok:
|
if ok:
|
||||||
return (
|
return (
|
||||||
_short_synth_log(msg, ok),
|
_short_synth_log(msg, ok),
|
||||||
gr.update(value=wav_path),
|
_voice_player_html(wav_path),
|
||||||
ui_history_dropdown(wav_path),
|
ui_history_dropdown(wav_path),
|
||||||
gr.update(value=wav_path),
|
_voice_player_html(wav_path),
|
||||||
)
|
)
|
||||||
return (
|
return (
|
||||||
_short_synth_log(msg, ok),
|
_short_synth_log(msg, ok),
|
||||||
gr.update(value=None),
|
_voice_player_html(None),
|
||||||
ui_history_dropdown(),
|
ui_history_dropdown(),
|
||||||
gr.update(value=None),
|
_voice_player_html(None),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -231,7 +253,7 @@ def ui_full_pipeline(
|
|||||||
skip_polish: bool,
|
skip_polish: bool,
|
||||||
manual_raw: str,
|
manual_raw: str,
|
||||||
voice_label: str,
|
voice_label: str,
|
||||||
) -> tuple[str, str, dict, str, dict, dict]:
|
) -> tuple[str, str, str, str, dict, str]:
|
||||||
"""
|
"""
|
||||||
串联执行:识别 → 润色(可跳过)→ 合成。
|
串联执行:识别 → 润色(可跳过)→ 合成。
|
||||||
返回 (raw, polished, wav_path, log)
|
返回 (raw, polished, wav_path, log)
|
||||||
@@ -245,10 +267,10 @@ def ui_full_pipeline(
|
|||||||
else:
|
else:
|
||||||
path = _save_upload(audio_file)
|
path = _save_upload(audio_file)
|
||||||
if not path:
|
if not path:
|
||||||
return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None)
|
return "", "", _voice_player_html(None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), _voice_player_html(None)
|
||||||
ok, result = transcribe_audio(path)
|
ok, result = transcribe_audio(path)
|
||||||
if not ok:
|
if not ok:
|
||||||
return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None)
|
return "", "", _voice_player_html(None), f"❌ 识别失败: {result}", ui_history_dropdown(), _voice_player_html(None)
|
||||||
raw = result
|
raw = result
|
||||||
logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
|
logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
|
||||||
|
|
||||||
@@ -259,7 +281,7 @@ def ui_full_pipeline(
|
|||||||
else:
|
else:
|
||||||
ok, result = polish_text(raw)
|
ok, result = polish_text(raw)
|
||||||
if not ok:
|
if not ok:
|
||||||
return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
|
return raw, "", _voice_player_html(None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
|
||||||
polished = result
|
polished = result
|
||||||
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
|
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
|
||||||
|
|
||||||
@@ -267,16 +289,16 @@ def ui_full_pipeline(
|
|||||||
voice_id = label_to_voice_id(voice_label)
|
voice_id = label_to_voice_id(voice_label)
|
||||||
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
|
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
|
||||||
if not ok:
|
if not ok:
|
||||||
return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
|
return raw, polished, _voice_player_html(None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
|
||||||
|
|
||||||
logs.append(f"✅ {msg}")
|
logs.append(f"✅ {msg}")
|
||||||
return (
|
return (
|
||||||
raw,
|
raw,
|
||||||
polished,
|
polished,
|
||||||
gr.update(value=wav_path),
|
_voice_player_html(wav_path),
|
||||||
"\n".join(logs),
|
"\n".join(logs),
|
||||||
ui_history_dropdown(wav_path),
|
ui_history_dropdown(wav_path),
|
||||||
gr.update(value=wav_path),
|
_voice_player_html(wav_path),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -971,24 +993,68 @@ gradio-app,
|
|||||||
.gradio-container .waveform-container {
|
.gradio-container .waveform-container {
|
||||||
background: #1a2332 !important;
|
background: #1a2332 !important;
|
||||||
}
|
}
|
||||||
/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */
|
/* 成品 HTML 播放器:播放 + 语速滑块 */
|
||||||
.gradio-container .tts-output-audio,
|
.tts-player-block { contain: layout style paint; }
|
||||||
.gradio-container .tts-output-audio .audio-container {
|
.tts-player-wrap {
|
||||||
|
background: #1a2332 !important;
|
||||||
border: 1px solid #374151 !important;
|
border: 1px solid #374151 !important;
|
||||||
background: #1a2332 !important;
|
border-radius: 10px !important;
|
||||||
contain: strict;
|
padding: 14px 16px !important;
|
||||||
min-height: 120px;
|
margin: 8px 0 !important;
|
||||||
}
|
}
|
||||||
.gradio-container .tts-output-audio button,
|
.tts-player-wrap.tts-player-empty {
|
||||||
.gradio-container .tts-output-audio button:focus,
|
color: #94a3b8 !important;
|
||||||
.gradio-container .tts-output-audio button:focus-visible {
|
font-size: 0.92rem !important;
|
||||||
outline: none !important;
|
min-height: 72px;
|
||||||
box-shadow: none !important;
|
|
||||||
border-color: #4b5563 !important;
|
|
||||||
}
|
}
|
||||||
.gradio-container .tts-output-audio .wrap,
|
.tts-player-title {
|
||||||
.gradio-container .tts-output-audio .controls {
|
color: #93c5fd !important;
|
||||||
background: #1a2332 !important;
|
font-size: 0.88rem !important;
|
||||||
|
margin-bottom: 10px !important;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
.tts-player-wrap audio.tts-audio-el {
|
||||||
|
width: 100% !important;
|
||||||
|
height: 44px !important;
|
||||||
|
margin: 6px 0 12px 0 !important;
|
||||||
|
border-radius: 6px !important;
|
||||||
|
}
|
||||||
|
.tts-speed-row {
|
||||||
|
display: flex !important;
|
||||||
|
flex-wrap: wrap !important;
|
||||||
|
align-items: center !important;
|
||||||
|
gap: 10px 14px !important;
|
||||||
|
margin-top: 4px !important;
|
||||||
|
}
|
||||||
|
.tts-speed-label-text {
|
||||||
|
color: #e5e7eb !important;
|
||||||
|
font-size: 0.9rem !important;
|
||||||
|
min-width: 64px;
|
||||||
|
}
|
||||||
|
.tts-speed-slider {
|
||||||
|
flex: 1 1 120px !important;
|
||||||
|
min-width: 120px !important;
|
||||||
|
max-width: 280px !important;
|
||||||
|
accent-color: #2563eb !important;
|
||||||
|
}
|
||||||
|
.tts-speed-val {
|
||||||
|
color: #93c5fd !important;
|
||||||
|
font-weight: 600 !important;
|
||||||
|
min-width: 48px !important;
|
||||||
|
}
|
||||||
|
.tts-dl-btn {
|
||||||
|
color: #ffffff !important;
|
||||||
|
background: #374151 !important;
|
||||||
|
padding: 6px 12px !important;
|
||||||
|
border-radius: 6px !important;
|
||||||
|
text-decoration: none !important;
|
||||||
|
font-size: 0.85rem !important;
|
||||||
|
}
|
||||||
|
.tts-dl-btn:hover { background: #4b5563 !important; }
|
||||||
|
.tts-player-tip {
|
||||||
|
color: #64748b !important;
|
||||||
|
font-size: 0.78rem !important;
|
||||||
|
margin: 10px 0 0 0 !important;
|
||||||
}
|
}
|
||||||
.gradio-container .pipeline-step-card textarea {
|
.gradio-container .pipeline-step-card textarea {
|
||||||
contain: layout style;
|
contain: layout style;
|
||||||
@@ -1124,7 +1190,7 @@ def build_app() -> gr.Blocks:
|
|||||||
with gr.Row(elem_classes=["pipeline-output-row"]):
|
with gr.Row(elem_classes=["pipeline-output-row"]):
|
||||||
pipe_raw = gr.Textbox(label="转写原文", lines=6)
|
pipe_raw = gr.Textbox(label="转写原文", lines=6)
|
||||||
pipe_polished = gr.Textbox(label="润色稿", lines=6)
|
pipe_polished = gr.Textbox(label="润色稿", lines=6)
|
||||||
pipe_output = _tts_output_audio("成品配音")
|
pipe_player = _voice_player_block()
|
||||||
|
|
||||||
# ---- Tab 2: 分步流水线 ----
|
# ---- Tab 2: 分步流水线 ----
|
||||||
with gr.Tab("分步流水线"):
|
with gr.Tab("分步流水线"):
|
||||||
@@ -1171,7 +1237,7 @@ def build_app() -> gr.Blocks:
|
|||||||
)
|
)
|
||||||
synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
|
synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
|
||||||
synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
|
synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
|
||||||
output_audio = _tts_output_audio("成品配音")
|
output_player = _voice_player_block()
|
||||||
|
|
||||||
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
|
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
|
||||||
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
|
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
|
||||||
@@ -1262,7 +1328,7 @@ def build_app() -> gr.Blocks:
|
|||||||
scale=4,
|
scale=4,
|
||||||
)
|
)
|
||||||
history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
|
history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
|
||||||
history_player = _tts_output_audio("历史试听 / 下载")
|
history_player = _voice_player_block()
|
||||||
|
|
||||||
history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
|
history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
|
||||||
history_select.change(ui_history_play, history_select, history_player)
|
history_select.change(ui_history_play, history_select, history_player)
|
||||||
@@ -1283,7 +1349,7 @@ def build_app() -> gr.Blocks:
|
|||||||
).then(
|
).then(
|
||||||
ui_full_pipeline,
|
ui_full_pipeline,
|
||||||
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
|
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
|
||||||
[pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player],
|
[pipe_raw, pipe_polished, pipe_player, pipeline_log, history_select, history_player],
|
||||||
queue=True,
|
queue=True,
|
||||||
)
|
)
|
||||||
synth_btn.click(
|
synth_btn.click(
|
||||||
@@ -1294,7 +1360,7 @@ def build_app() -> gr.Blocks:
|
|||||||
).then(
|
).then(
|
||||||
ui_synthesize,
|
ui_synthesize,
|
||||||
[polished_text, tts_voice],
|
[polished_text, tts_voice],
|
||||||
[synth_log, output_audio, history_select, history_player],
|
[synth_log, output_player, history_select, history_player],
|
||||||
queue=True,
|
queue=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1335,6 +1401,24 @@ def create_fastapi_app():
|
|||||||
headers={"Service-Worker-Allowed": "/"},
|
headers={"Service-Worker-Allowed": "/"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@fastapi_app.get("/outputs/{filename}")
async def serve_output_wav(filename: str):
    """Serve a voice-over WAV from OUTPUT_DIR so the HTML player can load it.

    Responds with 404 unless ``filename`` is a plain name (no ``..``, ``/``
    or backslash) that starts with ``voiceover_``, ends with ``.wav`` and
    refers to an existing file under ``OUTPUT_DIR``.
    """
    from fastapi import HTTPException

    # Reject anything that could step outside OUTPUT_DIR.
    has_path_tokens = any(token in filename for token in ("..", "/", "\\"))
    # Only files following the voiceover_*.wav naming scheme are exposed.
    matches_pattern = filename.startswith("voiceover_") and filename.endswith(".wav")
    if has_path_tokens or not matches_pattern:
        raise HTTPException(status_code=404)

    wav_file = OUTPUT_DIR / filename
    if wav_file.is_file():
        return FileResponse(wav_file, media_type="audio/wav", filename=filename)
    raise HTTPException(status_code=404)
|
||||||
|
|
||||||
blocks = build_app()
|
blocks = build_app()
|
||||||
gr.mount_gradio_app(
|
gr.mount_gradio_app(
|
||||||
fastapi_app,
|
fastapi_app,
|
||||||
|
|||||||
Reference in New Issue
Block a user