diff --git a/app.py b/app.py index 5066aa2..6768426 100644 --- a/app.py +++ b/app.py @@ -5,13 +5,13 @@ Gradio Web 中控:音色锁定 → Whisper 识别 → Gemma4 润色 → ChatTT from __future__ import annotations -import inspect import logging import re import shutil import sys import uuid from pathlib import Path +from urllib.parse import quote import gradio as gr @@ -20,6 +20,7 @@ from config import ( HOST, MODEL_NAME, OLLAMA_URL, + OUTPUT_DIR, PORT, SPEAKER_EMB_PATH, UPLOAD_DIR, @@ -60,35 +61,56 @@ def ui_history_dropdown(select_path: str | None = None) -> dict: return gr.update(choices=choices, value=value) -def ui_history_play(filepath: str | None) -> dict: +def _voice_player_html(wav_path: str | None) -> str: + """带播放控件与语速滑块的 HTML 播放器(语速仅影响试听,不改变 WAV 文件)。""" + if not wav_path: + return ( + '
' + "

合成完成后可在此试听,拖动下方滑块调节播放语速(0.5x~2.0x)。

" + "
" + ) + path = Path(wav_path) + if not path.is_file(): + return ( + '
' + "

音频文件不存在,请重新合成或刷新历史列表。

" + "
" + ) + name = path.name + src = f"/outputs/{quote(name)}" + return f""" +
+
🎧 {name}
+ +
+ 播放语速 + + 1.00x + ⬇ 下载 WAV +
+

语速仅用于试听,下载的 WAV 仍为原速。

+
+""" + + +def ui_history_play(filepath: str | None) -> str: """选中历史条目后加载播放器。""" - if filepath and Path(filepath).is_file(): - return gr.update(value=filepath) - return gr.update(value=None) + return _voice_player_html(filepath) -def ui_initial_history() -> tuple[dict, dict]: +def ui_initial_history() -> tuple[dict, str]: """首屏加载历史列表并自动选中最新一条。""" choices = list_voice_history() paths = [p for _, p in choices] latest = paths[0] if paths else None - return gr.update(choices=choices, value=latest), ui_history_play(latest) + return gr.update(choices=choices, value=latest), _voice_player_html(latest) -def _tts_output_audio(label: str) -> gr.Audio: - """成品播放器:兼容 Gradio 4.x(无 show_download_button 等参数)。""" - kwargs: dict = { - "label": label, - "type": "filepath", - "interactive": False, - "elem_classes": ["tts-output-audio"], - } - params = inspect.signature(gr.Audio.__init__).parameters - if "show_download_button" in params: - kwargs["show_download_button"] = True - if "show_share_button" in params: - kwargs["show_share_button"] = False - return gr.Audio(**kwargs) +def _voice_player_block() -> gr.HTML: + """创建成品配音 HTML 播放器区域。""" + return gr.HTML(value=_voice_player_html(None), elem_classes=["tts-player-block"]) # --------------------------------------------------------------------------- @@ -181,8 +203,8 @@ def _short_synth_log(msg: str, ok: bool) -> str: segs = re.search(r"共\s*(\d+)\s*段", msg) if chars: seg_note = f",{segs.group(1)} 段拼接" if segs else "" - return f"✅ 配音完成({chars.group(1)} 字{seg_note})。请用下方播放器试听、下载。" - return "✅ 配音完成。请用下方播放器试听、下载。" + return f"✅ 配音完成({chars.group(1)} 字{seg_note})。请用下方播放器试听、调节语速或下载。" + return "✅ 配音完成。请用下方播放器试听、调节语速或下载。" def ui_synth_pending(polished_text: str) -> str: @@ -196,14 +218,14 @@ def ui_synth_pending(polished_text: str) -> str: ) -def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict, dict]: +def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, str, dict, str]: """【TTS 合成】生成最终 wav 配音文件。""" if not polished_text or not polished_text.strip(): return ( "请先完成 Gemma4 润色。", - gr.update(value=None), + _voice_player_html(None), ui_history_dropdown(), - gr.update(value=None), + _voice_player_html(None), ) voice_id = label_to_voice_id(voice_label) @@ -211,15 +233,15 @@ def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, dict, dict if ok: return ( _short_synth_log(msg, ok), - gr.update(value=wav_path), + _voice_player_html(wav_path), ui_history_dropdown(wav_path), - gr.update(value=wav_path), + _voice_player_html(wav_path), ) return ( _short_synth_log(msg, ok), - gr.update(value=None), + _voice_player_html(None), ui_history_dropdown(), - gr.update(value=None), + _voice_player_html(None), ) @@ -231,7 +253,7 @@ def ui_full_pipeline( skip_polish: bool, manual_raw: str, voice_label: str, -) -> tuple[str, str, dict, str, dict, dict]: +) -> tuple[str, str, str, str, dict, str]: """ 串联执行:识别 → 润色(可跳过)→ 合成。 返回 (raw, polished, wav_path, log) @@ -245,10 +267,10 @@ def ui_full_pipeline( else: path = _save_upload(audio_file) if not path: - return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), gr.update(value=None) + return "", "", _voice_player_html(None), "❌ 请上传录音或手动填写转写文本。", ui_history_dropdown(), _voice_player_html(None) ok, result = transcribe_audio(path) if not ok: - return "", "", gr.update(value=None), f"❌ 识别失败: {result}", ui_history_dropdown(), gr.update(value=None) + return "", "", _voice_player_html(None), f"❌ 识别失败: {result}", ui_history_dropdown(), _voice_player_html(None) raw = result logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。") @@ -259,7 +281,7 @@ def ui_full_pipeline( else: ok, result = polish_text(raw) if not ok: - return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None) + return raw, "", _voice_player_html(None), f"❌ 润色失败: {result}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None) polished = result logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。") @@ -267,16 +289,16 @@ def ui_full_pipeline( voice_id = label_to_voice_id(voice_label) ok, msg, wav_path = generate_voice(polished, voice_id=voice_id) if not ok: - return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), gr.update(value=None) + return raw, polished, _voice_player_html(None), f"❌ 合成失败: {msg}\n" + "\n".join(logs), ui_history_dropdown(), _voice_player_html(None) logs.append(f"✅ {msg}") return ( raw, polished, - gr.update(value=wav_path), + _voice_player_html(wav_path), "\n".join(logs), ui_history_dropdown(wav_path), - gr.update(value=wav_path), + _voice_player_html(wav_path), ) @@ -971,24 +993,68 @@ gradio-app, .gradio-container .waveform-container { background: #1a2332 !important; } -/* 成品播放器:去掉 Gradio 默认 focus 白框,减轻合成完成时闪一下 */ -.gradio-container .tts-output-audio, -.gradio-container .tts-output-audio .audio-container { +/* 成品 HTML 播放器:播放 + 语速滑块 */ +.tts-player-block { contain: layout style paint; } +.tts-player-wrap { + background: #1a2332 !important; border: 1px solid #374151 !important; - background: #1a2332 !important; - contain: strict; - min-height: 120px; + border-radius: 10px !important; + padding: 14px 16px !important; + margin: 8px 0 !important; } -.gradio-container .tts-output-audio button, -.gradio-container .tts-output-audio button:focus, -.gradio-container .tts-output-audio button:focus-visible { - outline: none !important; - box-shadow: none !important; - border-color: #4b5563 !important; +.tts-player-wrap.tts-player-empty { + color: #94a3b8 !important; + font-size: 0.92rem !important; + min-height: 72px; } -.gradio-container .tts-output-audio .wrap, -.gradio-container .tts-output-audio .controls { - background: #1a2332 !important; +.tts-player-title { + color: #93c5fd !important; + font-size: 0.88rem !important; + margin-bottom: 10px !important; + word-break: break-all; +} +.tts-player-wrap audio.tts-audio-el { + width: 100% !important; + height: 44px !important; + margin: 6px 0 12px 0 !important; + border-radius: 6px !important; +} +.tts-speed-row { + display: flex !important; + flex-wrap: wrap !important; + align-items: center !important; + gap: 10px 14px !important; + margin-top: 4px !important; +} +.tts-speed-label-text { + color: #e5e7eb !important; + font-size: 0.9rem !important; + min-width: 64px; +} +.tts-speed-slider { + flex: 1 1 120px !important; + min-width: 120px !important; + max-width: 280px !important; + accent-color: #2563eb !important; +} +.tts-speed-val { + color: #93c5fd !important; + font-weight: 600 !important; + min-width: 48px !important; +} +.tts-dl-btn { + color: #ffffff !important; + background: #374151 !important; + padding: 6px 12px !important; + border-radius: 6px !important; + text-decoration: none !important; + font-size: 0.85rem !important; +} +.tts-dl-btn:hover { background: #4b5563 !important; } +.tts-player-tip { + color: #64748b !important; + font-size: 0.78rem !important; + margin: 10px 0 0 0 !important; } .gradio-container .pipeline-step-card textarea { contain: layout style; @@ -1124,7 +1190,7 @@ def build_app() -> gr.Blocks: with gr.Row(elem_classes=["pipeline-output-row"]): pipe_raw = gr.Textbox(label="转写原文", lines=6) pipe_polished = gr.Textbox(label="润色稿", lines=6) - pipe_output = _tts_output_audio("成品配音") + pipe_player = _voice_player_block() # ---- Tab 2: 分步流水线 ---- with gr.Tab("分步流水线"): @@ -1171,7 +1237,7 @@ def build_app() -> gr.Blocks: ) synth_btn = gr.Button("🔊 合成配音 WAV", variant="primary") synth_log = gr.Textbox(label="合成日志", lines=3, interactive=False) - output_audio = _tts_output_audio("成品配音") + output_player = _voice_player_block() transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log]) polish_btn.click(ui_polish, raw_text, [polished_text, polish_log]) @@ -1262,7 +1328,7 @@ def build_app() -> gr.Blocks: scale=4, ) history_refresh_btn = gr.Button("🔄 刷新", scale=0, min_width=100) - history_player = _tts_output_audio("历史试听 / 下载") + history_player = _voice_player_block() history_refresh_btn.click(ui_history_dropdown, outputs=[history_select]) history_select.change(ui_history_play, history_select, history_player) @@ -1283,7 +1349,7 @@ def build_app() -> gr.Blocks: ).then( ui_full_pipeline, [pipe_audio, skip_polish_cb, pipe_manual, pipe_voice], - [pipe_raw, pipe_polished, pipe_output, pipeline_log, history_select, history_player], + [pipe_raw, pipe_polished, pipe_player, pipeline_log, history_select, history_player], queue=True, ) synth_btn.click( @@ -1294,7 +1360,7 @@ def build_app() -> gr.Blocks: ).then( ui_synthesize, [polished_text, tts_voice], - [synth_log, output_audio, history_select, history_player], + [synth_log, output_player, history_select, history_player], queue=True, ) @@ -1335,6 +1401,24 @@ def create_fastapi_app(): headers={"Service-Worker-Allowed": "/"}, ) + @fastapi_app.get("/outputs/{filename}") + async def serve_output_wav(filename: str): + """供 HTML 播放器直接加载 outputs 下的配音文件。""" + from fastapi import HTTPException + + if ( + ".." in filename + or "/" in filename + or "\\" in filename + or not filename.startswith("voiceover_") + or not filename.endswith(".wav") + ): + raise HTTPException(status_code=404) + path = OUTPUT_DIR / filename + if not path.is_file(): + raise HTTPException(status_code=404) + return FileResponse(path, media_type="audio/wav", filename=filename) + blocks = build_app() gr.mount_gradio_app( fastapi_app,