Files
meijiaka-zy/python-api/app/services/tts_service.py
T
小鱼开发 bb08d0f586 refactor: 从智影 Fork 重构为智剪,独立 Docker 基础设施,开发模式认证兜底
主要变更:
- 修复 /tasks/script 路由 404(去掉重复 prefix)
- 开发模式自动认证兜底(无需登录即可测试流程)
- Docker 基础设施独立化(共用 db/redis)
- 前端 API 端口改为 8081
- 新增 TTS/语音克隆、视频粗剪、音频混音等智剪功能
- 删除智影专属模块(avatar、model_usage、qiniu 上传等)
2026-04-21 12:35:50 +08:00

260 lines
7.7 KiB
Python

"""
TTS 服务层
==========
封装 Kling AI TTS API,提供语音合成能力。
API 文档:https://klingai.com/document-api
"""
import asyncio
import logging
from pathlib import Path
from app.ai.providers.klingai_provider import KlingAIProvider
from app.config import get_settings
# Polling configuration for Kling TTS tasks.
TTS_POLL_INTERVAL: float = 2.0  # pause between two status polls, in seconds
TTS_TASK_TIMEOUT: int = 120  # maximum time to wait for one TTS task, in seconds

# Module-level logger, named after this module.
logger: logging.Logger = logging.getLogger(__name__)
def _get_kling_provider() -> KlingAIProvider:
    """Build a KlingAIProvider from the application settings.

    Reads ``KLINGAI_ACCESS_KEY`` and ``KLINGAI_SECRET_KEY`` from
    ``get_settings()``; a falsy value for either one is replaced by an
    empty string before being handed to the provider.

    Returns:
        A new ``KlingAIProvider`` constructed from those two credentials.
    """
    app_settings = get_settings()
    access_key = app_settings.KLINGAI_ACCESS_KEY or ""
    secret_key = app_settings.KLINGAI_SECRET_KEY or ""
    return KlingAIProvider({"access_key": access_key, "secret_key": secret_key})
class TTSService:
"""Kling AI TTS 服务客户端"""
# Kling 官方预设音色(已知音色)
PRESET_VOICES = [
{
"voice_id": "829824295735410756",
"name": "钓系女友",
"language": "zh",
"description": "甜美撒娇",
},
{
"voice_id": "829826751244537879",
"name": "温柔女声",
"language": "zh",
"description": "温柔细腻",
},
{
"voice_id": "829826792415842333",
"name": "播报男声",
"language": "zh",
"description": "沉稳播报",
},
{
"voice_id": "829826834144964676",
"name": "盐系少年",
"language": "zh",
"description": "清新少年",
},
{
"voice_id": "829826884271091753",
"name": "撒娇女友",
"language": "zh",
"description": "可爱撒娇",
},
]
def __init__(self) -> None:
self.provider = _get_kling_provider()
self.default_voice_id = "829826751244537879" # 温柔女声
async def synthesize_sync(
self,
text: str,
voice_id: str | None = None,
speed: float = 1.0,
voice_language: str = "zh",
) -> str:
"""
同步合成语音(提交任务并等待完成),返回音频 URL。
Args:
text: 待合成文本(≤1000字符)
voice_id: 音色 ID(默认使用温柔女声)
speed: 语速 (0.8-2.0)
voice_language: 语言 (zh/en)
Returns:
音频 URL
Raises:
ValueError: 参数校验失败
TimeoutError: 等待超时
"""
if not text or not text.strip():
raise ValueError("text 不能为空")
if len(text) > 1000:
raise ValueError("text 不能超过 1000 字符")
voice = voice_id or self.default_voice_id
# 提交 TTS 任务
result = await self.provider.generate_tts(
text=text,
voice_id=voice,
voice_language=voice_language,
voice_speed=speed,
)
task_id = result.get("task_id")
if not task_id:
raise ValueError("TTS 任务提交失败: 未返回 task_id")
logger.info(f"[TTS] 任务已提交: task_id={task_id}")
# 等待任务完成
audio_url = await self._wait_for_task(task_id)
return audio_url
async def _wait_for_task(self, task_id: str) -> str:
"""等待 TTS 任务完成并返回音频 URL"""
elapsed = 0.0
while elapsed < TTS_TASK_TIMEOUT:
await asyncio.sleep(TTS_POLL_INTERVAL)
elapsed += TTS_POLL_INTERVAL
result = await self.provider.get_tts_task(task_id)
status = result.get("status") or result.get("task_status", "")
logger.debug(f"[TTS] task_id={task_id}, status={status}, elapsed={elapsed}s")
if status == "succeed":
# 任务成功,返回音频 URL
task_result = result.get("task_result", {})
audio_url = task_result.get("audio_url") if isinstance(task_result, dict) else None
if audio_url:
return audio_url
# 某些响应格式直接放在 data 中
return result.get("audio_url") or result.get("data", {}).get("audio_url", "")
if status in ("failed", "error"):
raise ValueError(f"TTS 任务失败: {result.get('message', '未知错误')}")
raise TimeoutError(f"TTS 任务等待超时({TTS_TASK_TIMEOUT}秒)")
async def synthesize_to_file(
self,
text: str,
output_path: str | Path,
voice_id: str | None = None,
speed: float = 1.0,
voice_language: str = "zh",
) -> Path:
"""
合成语音并保存到文件。
Args:
text: 待合成文本
output_path: 输出文件路径
voice_id: 音色 ID
speed: 语速
voice_language: 语言
Returns:
输出文件路径
"""
import httpx
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
# 获取音频 URL
audio_url = await self.synthesize_sync(
text=text,
voice_id=voice_id,
speed=speed,
voice_language=voice_language,
)
# 下载音频并保存
async with httpx.AsyncClient(timeout=60.0) as client:
response = await client.get(audio_url)
response.raise_for_status()
audio_bytes = response.content
output_path.write_bytes(audio_bytes)
logger.info(f"[TTS] 语音合成完成: {output_path}")
return output_path
async def batch_synthesize(
self,
segments: list[dict],
output_dir: str | Path,
voice_id: str | None = None,
speed: float = 1.0,
) -> list[dict]:
"""
批量合成多段语音。
Args:
segments: 分段列表,每项包含 text, index(可选), filename(可选)
output_dir: 输出目录
voice_id: 音色 ID
speed: 语速
Returns:
结果列表,每项包含 input(原始输入)和 output(输出文件路径或错误信息)
"""
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
results = []
for seg in segments:
text = seg.get("text", "")
index = seg.get("index", len(results))
filename = seg.get("filename", f"audio_{index:04d}.mp3")
try:
output_path = await self.synthesize_to_file(
text=text,
output_path=output_dir / filename,
voice_id=voice_id,
speed=speed,
)
results.append({
"index": index,
"text": text,
"output_path": str(output_path),
"success": True,
"error": None,
})
except Exception as e:
logger.error(f"[TTS] 分段 {index} 合成失败: {e}")
results.append({
"index": index,
"text": text,
"output_path": None,
"success": False,
"error": str(e),
})
return results
@staticmethod
def get_preset_voices() -> list[dict]:
"""获取预设音色列表"""
return TTSService.PRESET_VOICES
@staticmethod
def get_voice_by_id(voice_id: str) -> dict | None:
"""根据 ID 获取音色信息"""
for voice in TTSService.PRESET_VOICES:
if voice["voice_id"] == voice_id:
return voice
return None