Fix ChatTTS Corrupt input data by correcting speaker params.
Use spk_smp plus txt_smp for voice clone instead of mis-encoding into spk_emb; migrate legacy speaker_emb.pt and improve error hints. Co-authored-by: Cursor <cursoragent@cursor.com>
This commit is contained in:
@@ -779,14 +779,31 @@ pm2 restart trading_studio
|
||||
2. 增大 `config.py` 中 `OLLAMA_TIMEOUT`
|
||||
3. 检查防火墙:`sudo ufw allow from 192.168.8.0/24 to any port 11434`(在 Ollama 节点)
|
||||
|
||||
### 10.5 ChatTTS 音色文件损坏
|
||||
### 10.5 ChatTTS 合成报 `Corrupt input data`
|
||||
|
||||
**原因:** 音色参数传错。`sample_audio_speaker()` 的返回值只能作为 **`spk_smp`** 传入,不能再误传给 **`spk_emb`**:两者编码格式不同,把 `spk_smp` 当作 `spk_emb` 解码时会在 LZMA 解压阶段失败并报 `Corrupt input data`。常见于旧版 `speaker_emb.pt`,或锁定音色时未填写参考转写。
|
||||
|
||||
**处理:**
|
||||
|
||||
```bash
|
||||
rm /opt/Trading_Studio/speaker_emb.pt
|
||||
pm2 restart trading_studio
|
||||
```
|
||||
|
||||
在 Web UI「音色锁定」:
|
||||
|
||||
1. 上传 10–30 秒干净参考人声
|
||||
2. **填写与录音完全一致的「参考音频精确转写」**(必填)
|
||||
3. 重新点击「锁定音色」后再合成
|
||||
|
||||
### 10.6 ChatTTS 音色文件损坏
|
||||
|
||||
```bash
|
||||
rm speaker_emb.pt
|
||||
# 重新在 Web UI「音色锁定」上传参考人声
|
||||
# 重新在 Web UI「音色锁定」上传参考人声并填写转写
|
||||
```
|
||||
|
||||
### 10.6 端口 5683 被占用
|
||||
### 10.7 端口 5683 被占用
|
||||
|
||||
```bash
|
||||
sudo lsof -i :5683
|
||||
@@ -794,7 +811,7 @@ sudo lsof -i :5683
|
||||
ss -tlnp | grep 5683
|
||||
```
|
||||
|
||||
### 10.7 手机「找不到麦克风」
|
||||
### 10.8 手机「找不到麦克风」
|
||||
|
||||
内网 `http://192.168.x.x:5683` 下手机无法使用实时录音,属浏览器 HTTPS 安全限制。
|
||||
完整说明与 NPS 穿透方案见 [0.9 手机「找不到麦克风」](#09-手机找不到麦克风) 与 [PWA_NPS.md](./PWA_NPS.md) 第九节。
|
||||
|
||||
@@ -248,6 +248,9 @@ A: `http://内网IP:5683` 非 HTTPS,浏览器禁用麦克风。请按 [PWA_NPS
|
||||
**Q: TTS 音色不稳定?**
|
||||
A: 重新锁定音色,填写参考音频精确转写,并保持 `temperature=0.3` 低随机性。
|
||||
|
||||
**Q: 合成报 `Corrupt input data`?**
|
||||
A: 音色参数格式问题。删除 `speaker_emb.pt`,重新锁定音色并**填写参考音频精确转写**。详见 [DEPLOY.md §10.5](./DEPLOY.md)。
|
||||
|
||||
**Q: 合成音频为空或噪声?**
|
||||
A: 检查润色文本长度(过短可能导致异常),确认 `speaker_emb.pt` 存在且有效。
|
||||
|
||||
|
||||
@@ -900,7 +900,7 @@ def build_app() -> gr.Blocks:
|
||||
sources=["upload", "microphone"],
|
||||
)
|
||||
spk_transcript = gr.Textbox(
|
||||
label="参考音频精确转写(可选,提升还原度)",
|
||||
label="参考音频精确转写(强烈建议填写,与录音一致,避免合成报错)",
|
||||
placeholder="示例:今天开了三单,第一单手贱提前平了,第二单…",
|
||||
info="请尽量与参考音频内容完全一致,可提升音色还原度",
|
||||
lines=6,
|
||||
|
||||
+93
-18
@@ -266,15 +266,69 @@ def _get_audio_duration_sec(audio: np.ndarray, sample_rate: int) -> float:
|
||||
return len(audio) / float(sample_rate)
|
||||
|
||||
|
||||
def _encode_spk_emb(chat, tensor_or_str: Any) -> str:
|
||||
"""将 Speaker Embedding 编码为 ChatTTS 可用的字符串格式。"""
|
||||
if isinstance(tensor_or_str, str):
|
||||
return tensor_or_str
|
||||
|
||||
def _encode_random_spk_emb(chat, tensor: torch.Tensor) -> Optional[str]:
|
||||
"""将随机说话人向量编码为 spk_emb 字符串(仅用于 sample_random,非参考音频)。"""
|
||||
speaker = getattr(chat, "speaker", None)
|
||||
if speaker is not None and hasattr(speaker, "_encode"):
|
||||
return speaker._encode(tensor)
|
||||
if hasattr(chat, "_encode_spk_emb"):
|
||||
return chat._encode_spk_emb(tensor_or_str)
|
||||
return chat._encode_spk_emb(tensor)
|
||||
return None
|
||||
|
||||
return tensor_or_str
|
||||
|
||||
def _is_valid_spk_emb_string(chat, spk_emb: str) -> bool:
    """Return True when ``spk_emb`` can be decoded by the ChatTTS speaker.

    ``spk_emb`` and ``spk_smp`` use different encodings; a string in the
    wrong format fails during LZMA decompression with
    "Corrupt input data". This probe runs the decoder once and reports
    whether it succeeded.

    Args:
        chat: Object expected to expose ``speaker._decode``; when it does
            not, the string cannot be verified and is treated as invalid.
        spk_emb: Candidate encoded speaker-embedding string.

    Returns:
        True if decoding succeeds; False for non-string or blank input, a
        missing decoder, or any decoding failure.
    """
    # Reject obviously unusable input up front instead of relying on the
    # decoder to raise for it.
    if not isinstance(spk_emb, str) or not spk_emb.strip():
        return False

    decode = getattr(getattr(chat, "speaker", None), "_decode", None)
    if not callable(decode):
        return False

    try:
        decode(spk_emb)
    except Exception:
        # Deliberately broad: this is a validity probe, and any decoder
        # failure simply means "not a valid spk_emb string".
        return False
    return True
|
||||
|
||||
|
||||
def _normalize_speaker_for_infer(
    chat,
    payload: Dict[str, Any],
) -> Tuple[Optional[Dict[str, Optional[str]]], Optional[str]]:
    """Normalize a stored speaker payload into ChatTTS inference parameters.

    Voice cloning from reference audio must be passed as ``spk_smp`` plus
    ``txt_smp``; the result of ``sample_audio_speaker`` must not be passed
    as ``spk_emb``.

    Args:
        chat: Object forwarded to the ``spk_emb`` validation and encoding
            helpers.
        payload: Loaded speaker data. The keys ``spk_smp``, ``txt_smp`` and
            ``spk_emb`` are read; all of them are optional.

    Returns:
        A ``(params, message)`` tuple. ``params`` is a dict with the keys
        ``spk_emb``, ``spk_smp`` and ``txt_smp``, or ``None`` when the
        payload is unusable. ``message`` is a warning when ``params`` is
        set, an error description when ``params`` is ``None``, and ``None``
        when there is nothing to report.
    """
    raw_smp = payload.get("spk_smp")
    raw_txt = payload.get("txt_smp")
    spk_emb = payload.get("spk_emb")

    # Only a non-blank string is a usable sample. Checking the type first
    # avoids the ambiguous truth value of a multi-element tensor and keeps a
    # whitespace-only string from being treated as a real sample.
    spk_smp = raw_smp if isinstance(raw_smp, str) and raw_smp.strip() else None
    # A non-string transcript is treated as missing rather than crashing on
    # ``.strip()``.
    txt_smp = (raw_txt.strip() or None) if isinstance(raw_txt, str) else None

    if spk_smp is not None:
        warn: Optional[str] = None
        if txt_smp is None:
            warn = (
                "未填写参考音频转写(txt_smp),音色克隆可能不稳定。"
                "建议在「音色锁定」补充精确转写后重新锁定。"
            )
        return {"spk_smp": spk_smp, "txt_smp": txt_smp, "spk_emb": None}, warn

    if isinstance(spk_emb, str) and spk_emb.strip():
        if _is_valid_spk_emb_string(chat, spk_emb):
            return {"spk_emb": spk_emb, "spk_smp": None, "txt_smp": None}, None
        # Legacy files stored the spk_smp string under the spk_emb key, so a
        # string that does not decode as spk_emb is loaded as spk_smp.
        return {
            "spk_smp": spk_emb,
            "txt_smp": txt_smp,
            "spk_emb": None,
        }, (
            "检测到旧版音色文件格式,已自动按 spk_smp 加载。"
            "建议重新锁定音色并填写参考转写。"
        )

    if isinstance(spk_emb, torch.Tensor):
        # A raw tensor has to be encoded to the string form before use.
        encoded = _encode_random_spk_emb(chat, spk_emb)
        if encoded:
            return {"spk_emb": encoded, "spk_smp": None, "txt_smp": None}, None
        return None, "旧版音色张量无法编码,请重新锁定音色。"

    return None, "音色数据无效或已损坏,请重新锁定音色。"
|
||||
|
||||
|
||||
def save_fixed_speaker(
|
||||
@@ -313,10 +367,9 @@ def save_fixed_speaker(
|
||||
audio = audio[:max_samples]
|
||||
|
||||
spk_smp = chat.sample_audio_speaker(audio)
|
||||
spk_emb = _encode_spk_emb(chat, spk_smp)
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"spk_emb": spk_emb,
|
||||
"version": 2,
|
||||
"spk_smp": spk_smp,
|
||||
"txt_smp": sample_transcript.strip(),
|
||||
"created_at": datetime.now().isoformat(),
|
||||
@@ -330,7 +383,10 @@ def save_fixed_speaker(
|
||||
f"参考时长: {duration:.1f}s"
|
||||
)
|
||||
if not sample_transcript.strip():
|
||||
msg += "\n提示:填写参考音频精确转写可进一步提升音色还原度。"
|
||||
msg += (
|
||||
"\n⚠️ 未填写参考转写:合成时可能报 Corrupt input data 或音色不稳。"
|
||||
"请填写与录音一致的精确转写后重新锁定。"
|
||||
)
|
||||
|
||||
logger.info("Speaker Embedding 保存成功: %s", SPEAKER_EMB_PATH)
|
||||
return True, msg
|
||||
@@ -356,8 +412,11 @@ def _load_speaker_payload() -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
|
||||
chat, err = get_chattts_instance()
|
||||
if chat is None:
|
||||
return None, err
|
||||
encoded = _encode_random_spk_emb(chat, payload)
|
||||
if not encoded:
|
||||
return None, "旧版音色张量无法读取,请重新锁定音色。"
|
||||
return {
|
||||
"spk_emb": _encode_spk_emb(chat, payload),
|
||||
"spk_emb": encoded,
|
||||
"spk_smp": None,
|
||||
"txt_smp": "",
|
||||
}, None
|
||||
@@ -531,15 +590,17 @@ def generate_voice(refined_text: str) -> Tuple[bool, str, Optional[str]]:
|
||||
if not chunks:
|
||||
return False, "无法切分朗读文本,请检查润色稿内容。", None
|
||||
|
||||
spk_emb = payload.get("spk_emb")
|
||||
spk_smp = payload.get("spk_smp")
|
||||
txt_smp = payload.get("txt_smp", "")
|
||||
speaker_params, speaker_warn = _normalize_speaker_for_infer(chat, payload)
|
||||
if speaker_params is None:
|
||||
return False, speaker_warn or "音色参数无效,请重新锁定音色。", None
|
||||
if speaker_warn:
|
||||
logger.warning(speaker_warn)
|
||||
|
||||
params_infer_code = ChatTTS.Chat.InferCodeParams(
|
||||
prompt=TTS_SPEED_PROMPT,
|
||||
spk_emb=spk_emb,
|
||||
spk_smp=spk_smp if spk_smp else None,
|
||||
txt_smp=txt_smp if txt_smp else None,
|
||||
spk_emb=speaker_params.get("spk_emb"),
|
||||
spk_smp=speaker_params.get("spk_smp"),
|
||||
txt_smp=speaker_params.get("txt_smp"),
|
||||
temperature=TTS_TEMPERATURE,
|
||||
top_P=TTS_TOP_P,
|
||||
top_K=TTS_TOP_K,
|
||||
@@ -593,10 +654,24 @@ def generate_voice(refined_text: str) -> Tuple[bool, str, Optional[str]]:
|
||||
f"配音合成成功: {output_path}"
|
||||
f"(朗读 {len(speak_text)} 字{chunk_note})"
|
||||
)
|
||||
if speaker_warn:
|
||||
msg = f"{speaker_warn}\n{msg}"
|
||||
logger.info(msg)
|
||||
return True, msg, str(output_path)
|
||||
|
||||
except Exception as exc:
|
||||
err = f"语音合成失败: {exc}\n{traceback.format_exc()}"
|
||||
exc_msg = str(exc)
|
||||
if "Corrupt input data" in exc_msg:
|
||||
err = (
|
||||
"语音合成失败: 音色数据损坏或格式不兼容(Corrupt input data)。\n"
|
||||
"处理步骤:\n"
|
||||
"1. 删除旧音色: rm speaker_emb.pt\n"
|
||||
"2. 在「音色锁定」重新上传参考人声\n"
|
||||
"3. 填写与录音一致的「参考音频精确转写」(必填)\n"
|
||||
"4. 重新点击锁定音色后再合成\n"
|
||||
f"技术详情: {exc_msg}"
|
||||
)
|
||||
else:
|
||||
err = f"语音合成失败: {exc}\n{traceback.format_exc()}"
|
||||
logger.exception("generate_voice 失败")
|
||||
return False, err, None
|
||||
|
||||
Reference in New Issue
Block a user