From 7c50b13c5708d0af72b6c352629429a129715698 Mon Sep 17 00:00:00 2001
From: dekun
Date: Fri, 12 Jun 2026 18:02:34 +0800
Subject: [PATCH] Fix TTS synthesis UI stuck on loading state

Enable Gradio queue, immediate pending feedback, segment progress, and
gr.update for Audio so long syntheses show logs and playback correctly.

Co-authored-by: Cursor
---
 app.py         | 67 +++++++++++++++++++++++++++++++++++++++++---------
 tts_service.py |  6 +++++
 2 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/app.py b/app.py
index 30732ea..68cbb07 100644
--- a/app.py
+++ b/app.py
@@ -141,14 +141,40 @@ def _short_synth_log(msg: str, ok: bool) -> str:
     return "✅ 配音完成。请用下方播放器试听、下载。"
 
 
-def ui_synthesize(polished_text: str, voice_label: str) -> tuple[str, str | None]:
-    """【TTS 合成】生成最终 wav 配音文件。先更新日志再更新播放器,减轻闪屏。"""
+def ui_synth_pending(polished_text: str) -> tuple[str, dict]:
+    """点击合成后立即反馈,避免长时间无日志更新被误认为卡死。"""
+    text = (polished_text or "").strip()
+    if not text:
+        return "请先完成 Gemma4 润色。", gr.update(value=None)
+    est_sec = max(20, len(text) // 10)
+    return (
+        f"⏳ 配音合成中(约 {len(text)} 字,预计 {est_sec}–{est_sec + 45} 秒),请勿重复点击…",
+        gr.update(value=None),
+    )
+
+
+def ui_synthesize(
+    polished_text: str,
+    voice_label: str,
+    progress: gr.Progress = gr.Progress(),
+) -> tuple[str, dict]:
+    """【TTS 合成】生成最终 wav 配音文件。"""
     if not polished_text or not polished_text.strip():
-        return "请先完成 Gemma4 润色。", None
+        return "请先完成 Gemma4 润色。", gr.update(value=None)
 
     voice_id = label_to_voice_id(voice_label)
-    ok, msg, wav_path = generate_voice(polished_text, voice_id=voice_id)
-    return _short_synth_log(msg, ok), wav_path if ok else None
+
+    def _report_segment(seg: int, total: int) -> None:
+        progress((seg - 1) / max(total, 1), desc=f"ChatTTS 第 {seg}/{total} 段…")
+
+    ok, msg, wav_path = generate_voice(
+        polished_text,
+        voice_id=voice_id,
+        progress_callback=_report_segment,
+    )
+    if ok:
+        return _short_synth_log(msg, ok), gr.update(value=wav_path)
+    return _short_synth_log(msg, ok), gr.update(value=None)
 
 
 # ---------------------------------------------------------------------------
@@ -173,10 +199,10 @@ def ui_full_pipeline(
     else:
         path = _save_upload(audio_file)
         if not path:
-            return "", "", None, "❌ 请上传录音或手动填写转写文本。"
+            return "", "", gr.update(value=None), "❌ 请上传录音或手动填写转写文本。"
         ok, result = transcribe_audio(path)
         if not ok:
-            return "", "", None, f"❌ 识别失败: {result}"
+            return "", "", gr.update(value=None), f"❌ 识别失败: {result}"
         raw = result
         logs.append(f"✅ Whisper 识别完成({len(raw)} 字)。")
 
@@ -187,7 +213,7 @@ def ui_full_pipeline(
     else:
         ok, result = polish_text(raw)
         if not ok:
-            return raw, "", None, f"❌ 润色失败: {result}\n" + "\n".join(logs)
+            return raw, "", gr.update(value=None), f"❌ 润色失败: {result}\n" + "\n".join(logs)
         polished = result
         logs.append(f"✅ Gemma4 润色完成({len(polished)} 字)。")
 
@@ -195,10 +221,10 @@ def ui_full_pipeline(
     voice_id = label_to_voice_id(voice_label)
     ok, msg, wav_path = generate_voice(polished, voice_id=voice_id)
     if not ok:
-        return raw, polished, None, f"❌ 合成失败: {msg}\n" + "\n".join(logs)
+        return raw, polished, gr.update(value=None), f"❌ 合成失败: {msg}\n" + "\n".join(logs)
 
     logs.append(f"✅ {msg}")
-    return raw, polished, wav_path, "\n".join(logs)
+    return raw, polished, gr.update(value=wav_path), "\n".join(logs)
 
 
 # ---------------------------------------------------------------------------
@@ -1116,10 +1142,15 @@ def build_app() -> gr.Blocks:
                 transcribe_btn.click(ui_transcribe, rec_audio, [raw_text, transcribe_log])
                 polish_btn.click(ui_polish, raw_text, [polished_text, polish_log])
                 synth_btn.click(
+                    ui_synth_pending,
+                    [polished_text],
+                    [synth_log, output_audio],
+                    queue=True,
+                ).then(
                     ui_synthesize,
                     [polished_text, tts_voice],
                     [synth_log, output_audio],
-                    show_progress="hidden",
+                    queue=True,
                 )
 
             # ---- Tab 3: 一键生产 ----
@@ -1156,12 +1187,26 @@ def build_app() -> gr.Blocks:
                         pipe_polished = gr.Textbox(label="润色稿", lines=6)
                         pipe_output = gr.Audio(label="成品配音", type="filepath")
 
+                def ui_pipeline_pending(skip_polish: bool, manual_raw: str) -> tuple[str, dict]:
+                    if manual_raw and manual_raw.strip():
+                        return "⏳ 全流程运行中(识别/润色/合成),请稍候…", gr.update(value=None)
+                    if skip_polish:
+                        return "⏳ 全流程运行中(识别→合成),请稍候…", gr.update(value=None)
+                    return "⏳ 全流程运行中(识别→润色→合成),请稍候…", gr.update(value=None)
+
                 pipeline_btn.click(
+                    ui_pipeline_pending,
+                    [skip_polish_cb, pipe_manual],
+                    [pipeline_log, pipe_output],
+                    queue=True,
+                ).then(
                     ui_full_pipeline,
                     [pipe_audio, skip_polish_cb, pipe_manual, pipe_voice],
                     [pipe_raw, pipe_polished, pipe_output, pipeline_log],
+                    queue=True,
                 )
 
+    demo.queue(default_concurrency_limit=1)
     return demo
 
 
diff --git a/tts_service.py b/tts_service.py
index 8a1d81c..586906c 100644
--- a/tts_service.py
+++ b/tts_service.py
@@ -635,6 +635,7 @@ def _concat_wavs(
 def generate_voice(
     refined_text: str,
     voice_id: str = "custom",
+    progress_callback=None,
 ) -> Tuple[bool, str, Optional[str]]:
     """
     使用 ChatTTS(本地 GPU)将润色稿合成为 wav。
@@ -724,6 +725,11 @@ def generate_voice(
         for idx, chunk in enumerate(chunks, start=1):
             if not chunk or len(chunk) < 2:
                 continue
+            if progress_callback is not None:
+                try:
+                    progress_callback(idx, len(chunks))
+                except Exception:
+                    logger.debug("TTS 进度回调失败", exc_info=True)
             release_cuda_cache()
             chunk_infer = replace(params_infer_code, manual_seed=42 + idx)
             wavs = None