Fix ChatTTS Corrupt input data by correcting speaker params.

Use spk_smp plus txt_smp for voice clone instead of mis-encoding into spk_emb; migrate legacy speaker_emb.pt and improve error hints.

Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
dekun
2026-06-12 16:41:23 +08:00
parent f36056d293
commit 82f99c0b89
4 changed files with 118 additions and 23 deletions
+21 -4
View File
@@ -779,14 +779,31 @@ pm2 restart trading_studio
2. 增大 `config.py` 中的 `OLLAMA_TIMEOUT` 2. 增大 `config.py` 中的 `OLLAMA_TIMEOUT`
3. 检查防火墙:`sudo ufw allow from 192.168.8.0/24 to any port 11434`(在 Ollama 节点) 3. 检查防火墙:`sudo ufw allow from 192.168.8.0/24 to any port 11434`(在 Ollama 节点)
### 10.5 ChatTTS 音色文件损坏 ### 10.5 ChatTTS 合成报 `Corrupt input data`
**原因:** 音色参数传错。`sample_audio_speaker()` 的结果应作为 **`spk_smp`**,不能同时误传给 **`spk_emb`**(LZMA 解压失败)。旧版 `speaker_emb.pt` 或未填参考转写时常见。
**处理:**
```bash
rm /opt/Trading_Studio/speaker_emb.pt
pm2 restart trading_studio
```
在 Web UI「音色锁定」:
1. 上传 10–30 秒干净参考人声
2. **填写与录音完全一致的「参考音频精确转写」**(必填)
3. 重新点击「锁定音色」后再合成
### 10.6 ChatTTS 音色文件损坏
```bash ```bash
rm speaker_emb.pt rm speaker_emb.pt
# 重新在 Web UI「音色锁定」上传参考人声 # 重新在 Web UI「音色锁定」上传参考人声并填写转写
``` ```
### 10.6 端口 5683 被占用 ### 10.7 端口 5683 被占用
```bash ```bash
sudo lsof -i :5683 sudo lsof -i :5683
@@ -794,7 +811,7 @@ sudo lsof -i :5683
ss -tlnp | grep 5683 ss -tlnp | grep 5683
``` ```
### 10.7 手机「找不到麦克风」 ### 10.8 手机「找不到麦克风」
内网 `http://192.168.x.x:5683` 下手机无法使用实时录音,属浏览器 HTTPS 安全限制。 内网 `http://192.168.x.x:5683` 下手机无法使用实时录音,属浏览器 HTTPS 安全限制。
完整说明与 NPS 穿透方案见 [0.9 手机「找不到麦克风」](#09-手机找不到麦克风) 与 [PWA_NPS.md](./PWA_NPS.md) 第九节。 完整说明与 NPS 穿透方案见 [0.9 手机「找不到麦克风」](#09-手机找不到麦克风) 与 [PWA_NPS.md](./PWA_NPS.md) 第九节。
+3
View File
@@ -248,6 +248,9 @@ A: `http://内网IP:5683` 非 HTTPS,浏览器禁用麦克风。请按 [PWA_NPS
**Q: TTS 音色不稳定?** **Q: TTS 音色不稳定?**
A: 重新锁定音色,填写参考音频精确转写,并保持 `temperature=0.3` 低随机性。 A: 重新锁定音色,填写参考音频精确转写,并保持 `temperature=0.3` 低随机性。
**Q: 合成报 `Corrupt input data`?**
A: 音色参数格式问题。删除 `speaker_emb.pt`,重新锁定音色并**填写参考音频精确转写**。详见 [DEPLOY.md §10.5](./DEPLOY.md)。
**Q: 合成音频为空或噪声?** **Q: 合成音频为空或噪声?**
A: 检查润色文本长度(过短可能导致异常),确认 `speaker_emb.pt` 存在且有效。 A: 检查润色文本长度(过短可能导致异常),确认 `speaker_emb.pt` 存在且有效。
+1 -1
View File
@@ -900,7 +900,7 @@ def build_app() -> gr.Blocks:
sources=["upload", "microphone"], sources=["upload", "microphone"],
) )
spk_transcript = gr.Textbox( spk_transcript = gr.Textbox(
label="参考音频精确转写(可选,提升还原度", label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错",
placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…", placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…",
info="请尽量与参考音频内容完全一致,可提升音色还原度", info="请尽量与参考音频内容完全一致,可提升音色还原度",
lines=6, lines=6,
+92 -17
View File
@@ -266,15 +266,69 @@ def _get_audio_duration_sec(audio: np.ndarray, sample_rate: int) -> float:
return len(audio) / float(sample_rate) return len(audio) / float(sample_rate)
def _encode_spk_emb(chat, tensor_or_str: Any) -> str: def _encode_random_spk_emb(chat, tensor: torch.Tensor) -> Optional[str]:
""" Speaker Embedding 编码为 ChatTTS 可用的字符串格式""" """随机说话人向量编码为 spk_emb 字符串(仅用于 sample_random,非参考音频)"""
if isinstance(tensor_or_str, str): speaker = getattr(chat, "speaker", None)
return tensor_or_str if speaker is not None and hasattr(speaker, "_encode"):
return speaker._encode(tensor)
if hasattr(chat, "_encode_spk_emb"): if hasattr(chat, "_encode_spk_emb"):
return chat._encode_spk_emb(tensor_or_str) return chat._encode_spk_emb(tensor)
return None
return tensor_or_str
def _is_valid_spk_emb_string(chat, spk_emb: str) -> bool:
    """Return True when ``chat.speaker._decode`` accepts ``spk_emb``.

    ``spk_emb`` and ``spk_smp`` use different encodings; per the original
    note, an illegal ``spk_emb`` string fails during lzma decompression
    with "Corrupt input data". Decoding is therefore used as a probe.

    Args:
        chat: Object expected to expose ``speaker._decode``.
        spk_emb: Candidate encoded speaker-embedding string.

    Returns:
        True if decoding succeeds. False if ``chat`` has no ``speaker``,
        the speaker has no ``_decode``, or decoding raises any exception.
    """
    decoder_owner = getattr(chat, "speaker", None)
    if decoder_owner is not None and hasattr(decoder_owner, "_decode"):
        try:
            # The decoded value is discarded; only success/failure matters.
            decoder_owner._decode(spk_emb)
        except Exception:
            return False
        else:
            return True
    # No decoder available: the string cannot be confirmed as valid.
    return False
def _normalize_speaker_for_infer(
    chat,
    payload: Dict[str, Any],
) -> Tuple[Optional[Dict[str, Optional[str]]], Optional[str]]:
    """Normalize a stored speaker payload into ChatTTS speaker parameters.

    Reference-audio cloning must be passed as ``spk_smp`` + ``txt_smp``; the
    result of ``sample_audio_speaker`` must not be passed as ``spk_emb``.

    Args:
        chat: ChatTTS instance, used to validate or encode ``spk_emb``.
        payload: Mapping read for the keys ``spk_smp``, ``txt_smp`` and
            ``spk_emb``.

    Returns:
        A ``(params, message)`` tuple. ``params`` is a dict with the keys
        ``spk_emb``, ``spk_smp`` and ``txt_smp`` (only one of ``spk_emb`` /
        ``spk_smp`` is set), or ``None`` when the payload is unusable.
        ``message`` is an optional warning when ``params`` is returned and
        the error text when ``params`` is ``None``.
    """
    spk_smp = payload.get("spk_smp")
    spk_emb = payload.get("spk_emb")

    # Only a non-blank string counts as a transcript; a missing, blank or
    # non-string value is treated as "not provided" instead of crashing.
    raw_txt = payload.get("txt_smp")
    txt_smp = (raw_txt.strip() or None) if isinstance(raw_txt, str) else None

    # A whitespace-only sample string is treated as absent, mirroring the
    # blank check applied to spk_emb below. The stored value itself is
    # passed through unmodified.
    if isinstance(spk_smp, str):
        has_sample = bool(spk_smp.strip())
    else:
        has_sample = bool(spk_smp)

    if has_sample:
        warn: Optional[str] = None
        if not txt_smp:
            warn = (
                "未填写参考音频转写(txt_smp),音色克隆可能不稳定。"
                "建议在「音色锁定」补充精确转写后重新锁定。"
            )
        return {"spk_smp": spk_smp, "txt_smp": txt_smp, "spk_emb": None}, warn

    if isinstance(spk_emb, str) and spk_emb.strip():
        if _is_valid_spk_emb_string(chat, spk_emb):
            return {"spk_emb": spk_emb, "spk_smp": None, "txt_smp": None}, None
        # Legacy mis-save: an spk_smp string was stored under spk_emb.
        # NOTE(review): the validity probe also returns False when chat
        # exposes no speaker decoder, so such strings land here as well.
        return {
            "spk_smp": spk_emb,
            "txt_smp": txt_smp,
            "spk_emb": None,
        }, (
            "检测到旧版音色文件格式,已自动按 spk_smp 加载。"
            "建议重新锁定音色并填写参考转写。"
        )

    if isinstance(spk_emb, torch.Tensor):
        # Legacy payloads may hold a raw tensor; encode it to a string.
        encoded = _encode_random_spk_emb(chat, spk_emb)
        if encoded:
            return {"spk_emb": encoded, "spk_smp": None, "txt_smp": None}, None
        return None, "旧版音色张量无法编码,请重新锁定音色。"

    return None, "音色数据无效或已损坏,请重新锁定音色。"
def save_fixed_speaker( def save_fixed_speaker(
@@ -313,10 +367,9 @@ def save_fixed_speaker(
audio = audio[:max_samples] audio = audio[:max_samples]
spk_smp = chat.sample_audio_speaker(audio) spk_smp = chat.sample_audio_speaker(audio)
spk_emb = _encode_spk_emb(chat, spk_smp)
payload: Dict[str, Any] = { payload: Dict[str, Any] = {
"spk_emb": spk_emb, "version": 2,
"spk_smp": spk_smp, "spk_smp": spk_smp,
"txt_smp": sample_transcript.strip(), "txt_smp": sample_transcript.strip(),
"created_at": datetime.now().isoformat(), "created_at": datetime.now().isoformat(),
@@ -330,7 +383,10 @@ def save_fixed_speaker(
f"参考时长: {duration:.1f}s" f"参考时长: {duration:.1f}s"
) )
if not sample_transcript.strip(): if not sample_transcript.strip():
msg += "\n提示:填写参考音频精确转写可进一步提升音色还原度。" msg += (
"\n⚠️ 未填写参考转写:合成时可能报 Corrupt input data 或音色不稳。"
"请填写与录音一致的精确转写后重新锁定。"
)
logger.info("Speaker Embedding 保存成功: %s", SPEAKER_EMB_PATH) logger.info("Speaker Embedding 保存成功: %s", SPEAKER_EMB_PATH)
return True, msg return True, msg
@@ -356,8 +412,11 @@ def _load_speaker_payload() -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
chat, err = get_chattts_instance() chat, err = get_chattts_instance()
if chat is None: if chat is None:
return None, err return None, err
encoded = _encode_random_spk_emb(chat, payload)
if not encoded:
return None, "旧版音色张量无法读取,请重新锁定音色。"
return { return {
"spk_emb": _encode_spk_emb(chat, payload), "spk_emb": encoded,
"spk_smp": None, "spk_smp": None,
"txt_smp": "", "txt_smp": "",
}, None }, None
@@ -531,15 +590,17 @@ def generate_voice(refined_text: str) -> Tuple[bool, str, Optional[str]]:
if not chunks: if not chunks:
return False, "无法切分朗读文本,请检查润色稿内容。", None return False, "无法切分朗读文本,请检查润色稿内容。", None
spk_emb = payload.get("spk_emb") speaker_params, speaker_warn = _normalize_speaker_for_infer(chat, payload)
spk_smp = payload.get("spk_smp") if speaker_params is None:
txt_smp = payload.get("txt_smp", "") return False, speaker_warn or "音色参数无效,请重新锁定音色。", None
if speaker_warn:
logger.warning(speaker_warn)
params_infer_code = ChatTTS.Chat.InferCodeParams( params_infer_code = ChatTTS.Chat.InferCodeParams(
prompt=TTS_SPEED_PROMPT, prompt=TTS_SPEED_PROMPT,
spk_emb=spk_emb, spk_emb=speaker_params.get("spk_emb"),
spk_smp=spk_smp if spk_smp else None, spk_smp=speaker_params.get("spk_smp"),
txt_smp=txt_smp if txt_smp else None, txt_smp=speaker_params.get("txt_smp"),
temperature=TTS_TEMPERATURE, temperature=TTS_TEMPERATURE,
top_P=TTS_TOP_P, top_P=TTS_TOP_P,
top_K=TTS_TOP_K, top_K=TTS_TOP_K,
@@ -593,10 +654,24 @@ def generate_voice(refined_text: str) -> Tuple[bool, str, Optional[str]]:
f"配音合成成功: {output_path}" f"配音合成成功: {output_path}"
f"(朗读 {len(speak_text)}{chunk_note}" f"(朗读 {len(speak_text)}{chunk_note}"
) )
if speaker_warn:
msg = f"{speaker_warn}\n{msg}"
logger.info(msg) logger.info(msg)
return True, msg, str(output_path) return True, msg, str(output_path)
except Exception as exc: except Exception as exc:
exc_msg = str(exc)
if "Corrupt input data" in exc_msg:
err = (
"语音合成失败: 音色数据损坏或格式不兼容(Corrupt input data)。\n"
"处理步骤:\n"
"1. 删除旧音色: rm speaker_emb.pt\n"
"2. 在「音色锁定」重新上传参考人声\n"
"3. 填写与录音一致的「参考音频精确转写」(必填)\n"
"4. 重新点击锁定音色后再合成\n"
f"技术详情: {exc_msg}"
)
else:
err = f"语音合成失败: {exc}\n{traceback.format_exc()}" err = f"语音合成失败: {exc}\n{traceback.format_exc()}"
logger.exception("generate_voice 失败") logger.exception("generate_voice 失败")
return False, err, None return False, err, None