From 82f99c0b8988fe2a946f39034d3f3a37214b6b84 Mon Sep 17 00:00:00 2001 From: dekun Date: Fri, 12 Jun 2026 16:41:23 +0800 Subject: [PATCH] Fix ChatTTS Corrupt input data by correcting speaker params. Use spk_smp plus txt_smp for voice clone instead of mis-encoding into spk_emb; migrate legacy speaker_emb.pt and improve error hints. Co-authored-by: Cursor --- DEPLOY.md | 25 +++++++++-- README.md | 3 ++ app.py | 2 +- tts_service.py | 111 +++++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 118 insertions(+), 23 deletions(-) diff --git a/DEPLOY.md b/DEPLOY.md index e284293..7bec072 100644 --- a/DEPLOY.md +++ b/DEPLOY.md @@ -779,14 +779,31 @@ pm2 restart trading_studio 2. 增大 `config.py` 中 `OLLAMA_TIMEOUT` 3. 检查防火墙:`sudo ufw allow from 192.168.8.0/24 to any port 11434`(在 Ollama 节点) -### 10.5 ChatTTS 音色文件损坏 +### 10.5 ChatTTS 合成报 `Corrupt input data` + +**原因:** 音色参数传错。`sample_audio_speaker()` 的结果应作为 **`spk_smp`**,不能同时误传给 **`spk_emb`**(LZMA 解压失败)。旧版 `speaker_emb.pt` 或未填参考转写时常见。 + +**处理:** + +```bash +rm /opt/Trading_Studio/speaker_emb.pt +pm2 restart trading_studio +``` + +在 Web UI「音色锁定」: + +1. 上传 10–30 秒干净参考人声 +2. **填写与录音完全一致的「参考音频精确转写」**(必填) +3. 重新点击「锁定音色」后再合成 + +### 10.6 ChatTTS 音色文件损坏 ```bash rm speaker_emb.pt -# 重新在 Web UI「音色锁定」上传参考人声 +# 重新在 Web UI「音色锁定」上传参考人声并填写转写 ``` -### 10.6 端口 5683 被占用 +### 10.7 端口 5683 被占用 ```bash sudo lsof -i :5683 @@ -794,7 +811,7 @@ sudo lsof -i :5683 ss -tlnp | grep 5683 ``` -### 10.7 手机「找不到麦克风」 +### 10.8 手机「找不到麦克风」 内网 `http://192.168.x.x:5683` 下手机无法使用实时录音,属浏览器 HTTPS 安全限制。 完整说明与 NPS 穿透方案见 [0.9 手机「找不到麦克风」](#09-手机找不到麦克风) 与 [PWA_NPS.md](./PWA_NPS.md) 第九节。 diff --git a/README.md b/README.md index 4969c4e..5cfbc4e 100644 --- a/README.md +++ b/README.md @@ -248,6 +248,9 @@ A: `http://内网IP:5683` 非 HTTPS,浏览器禁用麦克风。请按 [PWA_NPS **Q: TTS 音色不稳定?** A: 重新锁定音色,填写参考音频精确转写,并保持 `temperature=0.3` 低随机性。 +**Q: 合成报 `Corrupt input data`?** +A: 音色参数格式问题。删除 `speaker_emb.pt`,重新锁定音色并**填写参考音频精确转写**。详见 [DEPLOY.md §10.5](./DEPLOY.md)。 + **Q: 合成音频为空或噪声?** A: 检查润色文本长度(过短可能导致异常),确认 `speaker_emb.pt` 存在且有效。 diff --git a/app.py b/app.py index ba3810d..f7aad87 100644 --- a/app.py +++ b/app.py @@ -900,7 +900,7 @@ def build_app() -> gr.Blocks: sources=["upload", "microphone"], ) spk_transcript = gr.Textbox( - label="参考音频精确转写(可选,提升还原度)", + label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错)", placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…", info="请尽量与参考音频内容完全一致,可提升音色还原度", lines=6, diff --git a/tts_service.py b/tts_service.py index 5db40b3..46ee78f 100644 --- a/tts_service.py +++ b/tts_service.py @@ -266,15 +266,69 @@ def _get_audio_duration_sec(audio: np.ndarray, sample_rate: int) -> float: return len(audio) / float(sample_rate) -def _encode_spk_emb(chat, tensor_or_str: Any) -> str: - """将 Speaker Embedding 编码为 ChatTTS 可用的字符串格式。""" - if isinstance(tensor_or_str, str): - return tensor_or_str - +def _encode_random_spk_emb(chat, tensor: torch.Tensor) -> Optional[str]: + """将随机说话人向量编码为 spk_emb 字符串(仅用于 sample_random,非参考音频)。""" + speaker = getattr(chat, "speaker", None) + if speaker is not None and hasattr(speaker, "_encode"): + return speaker._encode(tensor) if hasattr(chat, "_encode_spk_emb"): - return chat._encode_spk_emb(tensor_or_str) + return chat._encode_spk_emb(tensor) + return None - return tensor_or_str + +def _is_valid_spk_emb_string(chat, spk_emb: str) -> bool: + """spk_emb 与 spk_smp 编码不同;非法字符串会在 lzma 解压时报 Corrupt input data。""" + speaker = getattr(chat, "speaker", None) + if speaker is None or not hasattr(speaker, "_decode"): + return False + try: + speaker._decode(spk_emb) + return True + except Exception: + return False + + +def _normalize_speaker_for_infer( + chat, + payload: Dict[str, Any], +) -> Tuple[Optional[Dict[str, Optional[str]]], Optional[str]]: + """ + 规范 ChatTTS 音色参数。 + 参考音频克隆必须用 spk_smp + txt_smp,不能把 sample_audio_speaker 结果传给 spk_emb。 + """ + spk_smp = payload.get("spk_smp") + txt_smp = (payload.get("txt_smp") or "").strip() or None + spk_emb = payload.get("spk_emb") + warn: Optional[str] = None + + if spk_smp: + if not txt_smp: + warn = ( + "未填写参考音频转写(txt_smp),音色克隆可能不稳定。" + "建议在「音色锁定」补充精确转写后重新锁定。" + ) + return {"spk_smp": spk_smp, "txt_smp": txt_smp, "spk_emb": None}, warn + + if isinstance(spk_emb, str) and spk_emb.strip(): + if _is_valid_spk_emb_string(chat, spk_emb): + return {"spk_emb": spk_emb, "spk_smp": None, "txt_smp": None}, None + # 旧版误存:把 spk_smp 写进了 spk_emb + return { + "spk_smp": spk_emb, + "txt_smp": txt_smp, + "spk_emb": None, + }, ( + "检测到旧版音色文件格式,已自动按 spk_smp 加载。" + "建议重新锁定音色并填写参考转写。" + ) + + if isinstance(spk_emb, torch.Tensor): + encoded = _encode_random_spk_emb(chat, spk_emb) + if encoded: + return {"spk_emb": encoded, "spk_smp": None, "txt_smp": None}, None + return None, "旧版音色张量无法编码,请重新锁定音色。" + + return None, "音色数据无效或已损坏,请重新锁定音色。" def save_fixed_speaker( @@ -313,10 +367,9 @@ def save_fixed_speaker( audio = audio[:max_samples] spk_smp = chat.sample_audio_speaker(audio) - spk_emb = _encode_spk_emb(chat, spk_smp) payload: Dict[str, Any] = { - "spk_emb": spk_emb, + "version": 2, "spk_smp": spk_smp, "txt_smp": sample_transcript.strip(), "created_at": datetime.now().isoformat(), @@ -330,7 +383,10 @@ def save_fixed_speaker( f"参考时长: {duration:.1f}s" ) if not sample_transcript.strip(): - msg += "\n提示:填写参考音频精确转写可进一步提升音色还原度。" + msg += ( + "\n⚠️ 未填写参考转写:合成时可能报 Corrupt input data 或音色不稳。" + "请填写与录音一致的精确转写后重新锁定。" + ) logger.info("Speaker Embedding 保存成功: %s", SPEAKER_EMB_PATH) return True, msg @@ -356,8 +412,11 @@ def _load_speaker_payload() -> Tuple[Optional[Dict[str, Any]], Optional[str]]: chat, err = get_chattts_instance() if chat is None: return None, err + encoded = _encode_random_spk_emb(chat, payload) + if not encoded: + return None, "旧版音色张量无法读取,请重新锁定音色。" return { - "spk_emb": _encode_spk_emb(chat, payload), + "spk_emb": encoded, "spk_smp": None, "txt_smp": "", }, None @@ -531,15 +590,17 @@ def generate_voice(refined_text: str) -> Tuple[bool, str, Optional[str]]: if not chunks: return False, "无法切分朗读文本,请检查润色稿内容。", None - spk_emb = payload.get("spk_emb") - spk_smp = payload.get("spk_smp") - txt_smp = payload.get("txt_smp", "") + speaker_params, speaker_warn = _normalize_speaker_for_infer(chat, payload) + if speaker_params is None: + return False, speaker_warn or "音色参数无效,请重新锁定音色。", None + if speaker_warn: + logger.warning(speaker_warn) params_infer_code = ChatTTS.Chat.InferCodeParams( prompt=TTS_SPEED_PROMPT, - spk_emb=spk_emb, - spk_smp=spk_smp if spk_smp else None, - txt_smp=txt_smp if txt_smp else None, + spk_emb=speaker_params.get("spk_emb"), + spk_smp=speaker_params.get("spk_smp"), + txt_smp=speaker_params.get("txt_smp"), temperature=TTS_TEMPERATURE, top_P=TTS_TOP_P, top_K=TTS_TOP_K, @@ -593,10 +654,24 @@ def generate_voice(refined_text: str) -> Tuple[bool, str, Optional[str]]: f"配音合成成功: {output_path}" f"(朗读 {len(speak_text)} 字{chunk_note})" ) + if speaker_warn: + msg = f"{speaker_warn}\n{msg}" logger.info(msg) return True, msg, str(output_path) except Exception as exc: - err = f"语音合成失败: {exc}\n{traceback.format_exc()}" + exc_msg = str(exc) + if "Corrupt input data" in exc_msg: + err = ( + "语音合成失败: 音色数据损坏或格式不兼容(Corrupt input data)。\n" + "处理步骤:\n" + "1. 删除旧音色: rm speaker_emb.pt\n" + "2. 在「音色锁定」重新上传参考人声\n" + "3. 填写与录音一致的「参考音频精确转写」(必填)\n" + "4. 重新点击锁定音色后再合成\n" + f"技术详情: {exc_msg}" + ) + else: + err = f"语音合成失败: {exc}\n{traceback.format_exc()}" logger.exception("generate_voice 失败") return False, err, None