""" Faster-Whisper CUDA 语音识别服务 封装本地 GPU 加速的音频转写逻辑,适配 RTX 3060 Ti 8GB 显存。 """ from __future__ import annotations import logging import traceback from typing import Optional, Tuple from config import ( WHISPER_COMPUTE_TYPE, WHISPER_DEVICE, WHISPER_LANGUAGE, WHISPER_MODEL_SIZE, ) logger = logging.getLogger(__name__) # 全局懒加载模型实例,避免 Gradio 重复初始化占用显存 _model = None _model_error: Optional[str] = None def _is_cuda_error(exc: BaseException) -> bool: """判断异常是否与 CUDA/GPU 相关。""" msg = str(exc).lower() cuda_keywords = ( "cuda", "cudnn", "cublas", "gpu", "out of memory", "no kernel image", "device-side assert", ) return any(k in msg for k in cuda_keywords) def get_whisper_model(): """ 获取或初始化 Faster-Whisper 模型。 强制 device=cuda, compute_type=float16。 """ global _model, _model_error if _model is not None: return _model, None if _model_error is not None: return None, _model_error try: from faster_whisper import WhisperModel logger.info( "正在加载 Whisper 模型: size=%s, device=%s, compute_type=%s", WHISPER_MODEL_SIZE, WHISPER_DEVICE, WHISPER_COMPUTE_TYPE, ) _model = WhisperModel( WHISPER_MODEL_SIZE, device=WHISPER_DEVICE, compute_type=WHISPER_COMPUTE_TYPE, ) logger.info("Whisper 模型加载成功。") return _model, None except ImportError as exc: _model_error = ( "未安装 faster-whisper,请执行: pip install faster-whisper\n" f"原始错误: {exc}" ) logger.exception("faster-whisper 导入失败") return None, _model_error except Exception as exc: if _is_cuda_error(exc): _model_error = ( "CUDA 初始化失败,请检查 NVIDIA 驱动、CUDA 运行时及 cuDNN 是否正确安装。\n" f"错误详情: {exc}\n" f"{traceback.format_exc()}" ) else: _model_error = f"Whisper 模型加载失败: {exc}\n{traceback.format_exc()}" logger.exception("Whisper 模型加载异常") return None, _model_error def transcribe_audio(audio_path: str) -> Tuple[bool, str]: """ 将音频文件转写为中文文本。 Args: audio_path: 本地音频文件绝对或相对路径 Returns: (success, text_or_error_message) """ if not audio_path: return False, "未提供音频文件路径。" model, init_error = get_whisper_model() if model is None: return False, init_error or "Whisper 模型不可用。" try: segments, info = model.transcribe( audio_path, language=WHISPER_LANGUAGE, beam_size=5, vad_filter=True, ) text_parts = [] for segment in segments: text_parts.append(segment.text.strip()) result_text = "".join(text_parts).strip() if not result_text: return False, ( "识别结果为空,请检查音频是否有效、音量是否足够," f"或尝试更换格式。检测到语言: {getattr(info, 'language', 'unknown')}" ) logger.info( "转写完成: 语言=%s, 概率=%.2f, 字数=%d", getattr(info, "language", "?"), getattr(info, "language_probability", 0.0), len(result_text), ) return True, result_text except Exception as exc: if _is_cuda_error(exc): err = ( "CUDA 推理异常:显存可能不足或 GPU 状态异常。" "建议关闭其他占用显存的进程后重试。\n" f"错误详情: {exc}" ) else: err = f"音频转写失败: {exc}\n{traceback.format_exc()}" logger.exception("transcribe_audio 失败") return False, err def reset_whisper_model() -> None: """释放模型引用(用于调试或显存回收)。""" global _model, _model_error _model = None _model_error = None