diff --git a/app.py b/app.py
index 5066aa2..6768426 100644
--- a/app.py
+++ b/app.py
@@ -5,13 +5,13 @@ Gradio Web 中控:音色锁定 → Whisper 识别 → Gemma4 润色 → ChatTT
from __future__ import annotations
-import inspect
import logging
import re
import shutil
import sys
import uuid
from pathlib import Path
+from urllib.parse import quote
import gradio as gr
@@ -20,6 +20,7 @@ from config import (
HOST,
MODEL_NAME,
OLLAMA_URL,
+ OUTPUT_DIR,
PORT,
SPEAKER_EMB_PATH,
UPLOAD_DIR,
@@ -60,35 +61,56 @@ def ui_history_dropdown(select_path: str | None = None) -> dict:
return gr.update(choices=choices, value=value)
-def ui_history_play(filepath: str | None) -> dict:
+def _voice_player_html(wav_path: str | None) -> str:
+ """Return the HTML audio player (playback controls plus a speed slider) for wav_path; the speed only affects preview playback and never changes the WAV file."""
+ if not wav_path:  # nothing to play yet: return the empty-state hint
+ return (
+ '
'
+ "
合成完成后可在此试听,拖动下方滑块调节播放语速(0.5x~2.0x)。
"
+ "
"
+ )
+ path = Path(wav_path)
+ if not path.is_file():  # path is not a file on disk: return the "file missing" notice
+ return (
+ ''
+ "
音频文件不存在,请重新合成或刷新历史列表。
"
+ "
"
+ )
+ name = path.name  # NOTE(review): shown in the markup below as-is (only src is URL-quoted) — confirm names are always app-generated or HTML-escape them
+ src = f"/outputs/{quote(name)}"  # URL under the /outputs/{filename} route
+ return f"""
+
+
🎧 {name}
+
+
+
语速仅用于试听,下载的 WAV 仍为原速。
+
+"""
+
+
+def ui_history_play(filepath: str | None) -> str:
- """选中历史条目后加载播放器。"""
+ """Return the player HTML for the history entry that was just selected."""
- if filepath and Path(filepath).is_file():
- return gr.update(value=filepath)
- return gr.update(value=None)
+ return _voice_player_html(filepath)
-def ui_initial_history() -> tuple[dict, dict]:
+def ui_initial_history() -> tuple[dict, str]:
- """首屏加载历史列表并自动选中最新一条。"""
+ """On first page load, fill the history dropdown and pre-select its first entry (presumably the newest — confirm list_voice_history ordering); returns (dropdown update, player HTML)."""
choices = list_voice_history()
paths = [p for _, p in choices]
latest = paths[0] if paths else None
- return gr.update(choices=choices, value=latest), ui_history_play(latest)
+ return gr.update(choices=choices, value=latest), _voice_player_html(latest)
-def _tts_output_audio(label: str) -> gr.Audio:
- """成品播放器:兼容 Gradio 4.x(无 show_download_button 等参数)。"""
- kwargs: dict = {
- "label": label,
- "type": "filepath",
- "interactive": False,
- "elem_classes": ["tts-output-audio"],
- }
- params = inspect.signature(gr.Audio.__init__).parameters
- if "show_download_button" in params:
- kwargs["show_download_button"] = True
- if "show_share_button" in params:
- kwargs["show_share_button"] = False
- return gr.Audio(**kwargs)
+def _voice_player_block() -> gr.HTML:
+ """Create the gr.HTML component that hosts the voiceover player, initialised with the empty-state markup."""
+ return gr.HTML(value=_voice_player_html(None), elem_classes=["tts-player-block"])
# ---------------------------------------------------------------------------
@@ -181,8 +203,8 @@ def _short_synth_log(msg: str, ok: bool) -> str:
segs = re.search(r"共\s*(\d+)\s*段", msg)
if chars:
seg_note = f",{segs.group(1)} 段拼接" if segs else ""
- return f"✅ 配音完成({chars.group(1)} 字{seg_note})。请用下方播放器试听、下载。"
- return "✅ 配音完成。请用下方播放器试听、下载。"
+ return f"✅ 配音完成({chars.group(1)} 字{seg_note})。请用下方播放器试听、调节语速或下载。"
+ return "✅ 配音完成。请用下方播放器试听、调节语速或下载。"
def ui_synth_pending(polished_text: str) -> str:
@@ -196,14 +218,14 @@ def ui_synth_pending(polished_text: str) -> str:
)
-def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]:
+def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, str, dict, str]:
"""【TTS 合成】生成最终 wav 配音文件。"""
if not polished_text or not polished_text.strip():
return (
"请先完成 Gemma4 润色。",
- gr.update(value=None),
+ _voice_player_html(None),
ui_history_dropdown(),
- gr.update(value=None),
+ _voice_player_html(None),
)
voice_id = label_to_voice_id(voice_label)
@@ -211,15 +233,15 @@ def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict
if ok:
return (
_short_synth_log(msg, ok),
- gr.update(value=wav_path),
+ _voice_player_html(wav_path),
ui_history_dropdown(wav_path),
- gr.update(value=wav_path),
+ _voice_player_html(wav_path),
)
return (
_short_synth_log(msg, ok),
- gr.update(value=None),
+ _voice_player_html(None),
ui_history_dropdown(),
- gr.update(value=None),
+ _voice_player_html(None),
)
@@ -231,7 +253,7 @@ def ui_full_pipeline(
skip_polish: bool,
manual_raw: str,
voice_label: str,
-) -> tuple[str, str, dict, str, dict, dict]:
+) -> tuple[str, str, str, str, dict, str]:
"""
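- 串联执行:识别 → 润色(可跳过)→ 合成。
- 返回 (raw, polished, wav_path, log)
+ Run the full chain: transcribe → polish (optional) → synthesize.
+ Returns (raw, polished, player_html, log, history_dropdown_update, history_player_html).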
@@ -245,10 +267,10 @@ def ui_full_pipeline(
else:
path = _save_upload(audio_file)
if not path:
- return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None)
+ return "", "", _voice_player_html(None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), _voice_player_html(None)
ok, result = transcribe_audio(path)
if not ok:
- return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None)
+ return "", "", _voice_player_html(None), f"❌ 识别失败: {result}", ui_history_dropdown(), _voice_player_html(None)
raw = result
logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
@@ -259,7 +281,7 @@ def ui_full_pipeline(
else:
ok, result = polish_text(raw)
if not ok:
- return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
+ return raw, "", _voice_player_html(None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
polished = result
logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
@@ -267,16 +289,16 @@ def ui_full_pipeline(
voice_id = label_to_voice_id(voice_label)
ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
if not ok:
- return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None)
+ return raw, polished, _voice_player_html(None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None)
logs.append(f"✅ {msg}")
return (
raw,
polished,
- gr.update(value=wav_path),
+ _voice_player_html(wav_path),
"\n".join(logs),
ui_history_dropdown(wav_path),
- gr.update(value=wav_path),
+ _voice_player_html(wav_path),
)
@@ -971,24 +993,68 @@ gradio-app,
.gradio-container .waveform-container {
background: #1a2332 !important;
}
-/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */
-.gradio-container .tts-output-audio,
-.gradio-container .tts-output-audio .audio-container {
+/* 成品 HTML 播放器:播放 + 语速滑块 */
+.tts-player-block { contain: layout style paint; }
+.tts-player-wrap {
+ background: #1a2332 !important;
border: 1px solid #374151 !important;
- background: #1a2332 !important;
- contain: strict;
- min-height: 120px;
+ border-radius: 10px !important;
+ padding: 14px 16px !important;
+ margin: 8px 0 !important;
}
-.gradio-container .tts-output-audio button,
-.gradio-container .tts-output-audio button:focus,
-.gradio-container .tts-output-audio button:focus-visible {
- outline: none !important;
- box-shadow: none !important;
- border-color: #4b5563 !important;
+.tts-player-wrap.tts-player-empty {
+ color: #94a3b8 !important;
+ font-size: 0.92rem !important;
+ min-height: 72px;
}
-.gradio-container .tts-output-audio .wrap,
-.gradio-container .tts-output-audio .controls {
- background: #1a2332 !important;
+.tts-player-title {
+ color: #93c5fd !important;
+ font-size: 0.88rem !important;
+ margin-bottom: 10px !important;
+ word-break: break-all;
+}
+.tts-player-wrap audio.tts-audio-el {
+ width: 100% !important;
+ height: 44px !important;
+ margin: 6px 0 12px 0 !important;
+ border-radius: 6px !important;
+}
+.tts-speed-row {
+ display: flex !important;
+ flex-wrap: wrap !important;
+ align-items: center !important;
+ gap: 10px 14px !important;
+ margin-top: 4px !important;
+}
+.tts-speed-label-text {
+ color: #e5e7eb !important;
+ font-size: 0.9rem !important;
+ min-width: 64px;
+}
+.tts-speed-slider {
+ flex: 1 1 120px !important;
+ min-width: 120px !important;
+ max-width: 280px !important;
+ accent-color: #2563eb !important;
+}
+.tts-speed-val {
+ color: #93c5fd !important;
+ font-weight: 600 !important;
+ min-width: 48px !important;
+}
+.tts-dl-btn {
+ color: #ffffff !important;
+ background: #374151 !important;
+ padding: 6px 12px !important;
+ border-radius: 6px !important;
+ text-decoration: none !important;
+ font-size: 0.85rem !important;
+}
+.tts-dl-btn:hover { background: #4b5563 !important; }
+.tts-player-tip {
+ color: #64748b !important;
+ font-size: 0.78rem !important;
+ margin: 10px 0 0 0 !important;
}
.gradio-container .pipeline-step-card textarea {
contain: layout style;
@@ -1124,7 +1190,7 @@ def build_app() -> gr.Blocks:
with gr.Row(elem_classes=["pipeline-output-row"]):
pipe_raw = gr.Textbox(label="转写原文", lines=6)
pipe_polished = gr.Textbox(label="润色稿", lines=6)
- pipe_output = _tts_output_audio("成品配音")
+ pipe_player = _voice_player_block()
# ---- Tab 2: 分步流水线 ----
with gr.Tab("分步流水线"):
@@ -1171,7 +1237,7 @@ def build_app() -> gr.Blocks:
)
synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary")
synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False)
- output_audio = _tts_output_audio("成品配音")
+ output_player = _voice_player_block()
transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
@@ -1262,7 +1328,7 @@ def build_app() -> gr.Blocks:
scale=4,
)
history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100)
- history_player = _tts_output_audio("历史试听 / 下载")
+ history_player = _voice_player_block()
history_refresh_btn.click(ui_history_dropdown, outputs=[history_select])
history_select.change(ui_history_play, history_select, history_player)
@@ -1283,7 +1349,7 @@ def build_app() -> gr.Blocks:
).then(
ui_full_pipeline,
[pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
- [pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player],
+ [pipe_raw, pipe_polished, pipe_player, pipeline_log, history_select, history_player],
queue=True,
)
synth_btn.click(
@@ -1294,7 +1360,7 @@ def build_app() -> gr.Blocks:
).then(
ui_synthesize,
[polished_text, tts_voice],
- [synth_log, output_audio, history_select, history_player],
+ [synth_log, output_player, history_select, history_player],
queue=True,
)
@@ -1335,6 +1401,24 @@ def create_fastapi_app():
headers={"Service-Worker-Allowed": "/"},
)
+ @fastapi_app.get("/outputs/{filename}")
+ async def serve_output_wav(filename: str):
+ """Serve a voiceover WAV from OUTPUT_DIR so the HTML player can load it directly; responds 404 for rejected names and for missing files."""
+ from fastapi import HTTPException
+
+ if (  # allow-list: only bare file names of the form voiceover_*.wav, no path components
+ ".." in filename
+ or "/" in filename
+ or "\\" in filename
+ or not filename.startswith("voiceover_")
+ or not filename.endswith(".wav")
+ ):
+ raise HTTPException(status_code=404)
+ path = OUTPUT_DIR / filename
+ if not path.is_file():
+ raise HTTPException(status_code=404)
+ return FileResponse(path, media_type="audio/wav", filename=filename)
+
blocks = build_app()
gr.mount_gradio_app(
fastapi_app,