555 lines
18 KiB
Python
555 lines
18 KiB
Python
"""
|
||
语音合成与克隆 API 路由
|
||
=======================
|
||
|
||
提供 TTS 语音合成、批量合成、声音克隆等功能。
|
||
基于 MiniMax TTS 和声音克隆 API。
|
||
(Kling AI 语音相关代码保留但已废弃,仅视频/形象克隆仍使用 Kling)
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
import tempfile
|
||
import uuid
|
||
from pathlib import Path
|
||
|
||
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
||
from pydantic import BaseModel, Field
|
||
|
||
from app.schemas.common import ApiResponse, success_response
|
||
from app.services.qiniu_service import QiniuService
|
||
from app.services.vidu_tts_service import ViduTTSService
|
||
from app.services.minimax_tts_service import MiniMaxTTSService # noqa: F401 历史兼容
|
||
from app.services.tts_service import TTSService # noqa: F401 历史兼容
|
||
from app.services.voice_clone_service import VoiceCloneService # noqa: F401 历史兼容
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Every route declared in this file is mounted under the "/voice" prefix.
router = APIRouter(tags=["Voice"], prefix="/voice")
|
||
|
||
|
||
# ========== 请求/响应模型 ==========
|
||
|
||
|
||
class TTSSynthesizeRequest(BaseModel):
    """Request body for a single TTS synthesis call."""

    # Text to synthesize; validated to 1..10000 characters.
    text: str = Field(
        ...,
        min_length=1,
        max_length=10000,
        description="待合成文本(≤10000字符)",
    )
    # Optional voice ID; handlers fall back to the service default when unset.
    voice_id: str | None = Field(
        default=None,
        description="音色 ID(默认:甜美女性)",
    )
    # Speech rate multiplier, validated to 0.5..2.0.
    speed: float = Field(
        default=1.0,
        ge=0.5,
        le=2.0,
        description="语速 0.5-2.0",
    )
    # Language of the voice ("zh" or "en" per the field description).
    voice_language: str = Field(
        default="zh",
        description="音色语种 (zh/en)",
    )
    # Volume, validated to 0..10.
    volume: int = Field(
        default=0,
        ge=0,
        le=10,
        description="音量 0-10(0=正常)",
    )
    # Pitch shift, validated to -12..12.
    pitch: int = Field(
        default=0,
        ge=-12,
        le=12,
        description="音调 -12 到 12",
    )
|
||
|
||
|
||
class TTSBatchSegment(BaseModel):
    """One text segment of a batch synthesis request."""

    # Segment text; must not be empty.
    text: str = Field(
        ...,
        min_length=1,
        description="段落文本",
    )
    # Caller-supplied ordinal, echoed back in the per-segment result.
    index: int = Field(
        default=0,
        ge=0,
        description="段落序号",
    )
    # Optional output file name without extension, echoed back in the result.
    filename: str | None = Field(
        default=None,
        description="输出文件名(不含扩展名)",
    )
|
||
|
||
|
||
class TTSBatchRequest(BaseModel):
    """Request body for batch TTS synthesis; voice settings apply to every segment."""

    # Segments to synthesize; at least one is required.
    segments: list[TTSBatchSegment] = Field(
        ...,
        min_length=1,
        description="段落列表",
    )
    # Optional voice ID shared by all segments.
    voice_id: str | None = Field(
        default=None,
        description="音色 ID",
    )
    # Speech rate multiplier, validated to 0.5..2.0.
    speed: float = Field(
        default=1.0,
        ge=0.5,
        le=2.0,
        description="语速",
    )
    # Volume, validated to 0..10.
    volume: int = Field(
        default=0,
        ge=0,
        le=10,
        description="音量 0-10",
    )
    # Pitch shift, validated to -12..12.
    pitch: int = Field(
        default=0,
        ge=-12,
        le=12,
        description="音调 -12 到 12",
    )
|
||
|
||
|
||
class VoiceCloneSubmitRequest(BaseModel):
    """Request body for submitting a voice-clone job."""

    # URL of the source audio sample.
    source_audio_url: str | None = Field(
        default=None,
        description="源音频 URL(5-30秒,mp3/wav,需公开可访问)",
    )
    # Optional URL of a source video.
    source_video_url: str | None = Field(
        default=None,
        description="源视频 URL(可选)",
    )
    # Optional ID of a previously created work.
    video_id: str | None = Field(
        default=None,
        description="历史作品ID(可选)",
    )
    # Optional custom voice name. The description states a 20-character limit,
    # but no max_length is enforced on this field.
    voice_name: str | None = Field(
        default=None,
        description="自定义音色名称(≤20字符)",
    )
|
||
|
||
|
||
class TTSBatchResponse(BaseModel):
    """Aggregate result of a batch synthesis run."""

    # Number of segments processed.
    total: int
    # Number of segments that were synthesized successfully.
    success_count: int
    # Number of segments that failed.
    failed_count: int
    # One dict per segment (success flag plus audio URL or error text).
    results: list[dict]
|
||
|
||
|
||
class VoiceCloneTaskResponse(BaseModel):
    """Response payload describing a voice-clone task."""

    # Identifier of the clone task.
    task_id: str
    # Task status string (handlers in this module report "succeeded").
    status: str
    # ID of the cloned voice, when available.
    voice_id: str | None = None
    # URL of a trial/demo audio clip, when available.
    trial_url: str | None = None
    # Error description, when available.
    error_message: str | None = None
|
||
|
||
|
||
class VoiceUploadResponse(BaseModel):
    """Response payload returned after a media file upload."""

    # Access URL of the uploaded object on Qiniu.
    url: str = Field(
        ...,
        description="七牛云访问 URL",
    )
    # Storage key of the uploaded object.
    key: str = Field(
        ...,
        description="存储 Key",
    )
|
||
|
||
|
||
class VoiceInfo(BaseModel):
    """Description of one selectable voice."""

    # Identifier passed as voice_id to the synthesis endpoints.
    voice_id: str
    # Display name of the voice.
    name: str
    # Free-text description; empty by default.
    description: str = ""
    # Language code of the voice; defaults to "zh".
    language: str = "zh"
    # Whether the voice is flagged as recommended.
    recommended: bool = False
    # Optional preview audio URL (camelCase name is part of the API contract).
    previewUrl: str | None = None
|
||
|
||
|
||
class LipSyncRequest(BaseModel):
    """Request body for creating a lip-sync task."""

    # URL of the video to be lip-synced.
    video_url: str = Field(
        ...,
        description="原视频 URL",
    )
    # Driving audio URL; the description says to supply this or `text`.
    audio_url: str | None = Field(
        default=None,
        description="音频 URL(与 text 二选一)",
    )
    # Driving text; the description says to supply this or `audio_url`.
    text: str | None = Field(
        default=None,
        description="文本内容(与 audio_url 二选一)",
    )
    # Voice ID, described as taking effect for text-driven requests.
    voice_id: str | None = Field(
        default=None,
        description="音色 ID(文字驱动时生效)",
    )
    # Speech rate multiplier, validated to 0.5..2.0.
    speed: float = Field(
        default=1.0,
        ge=0.5,
        le=2.0,
        description="语速",
    )
    # Volume, validated to 0..10.
    volume: int = Field(
        default=0,
        ge=0,
        le=10,
        description="音量",
    )
    # Optional face reference image URL.
    ref_photo_url: str | None = Field(
        default=None,
        description="人脸参考图 URL",
    )
|
||
|
||
|
||
class LipSyncResponse(BaseModel):
    """Response payload returned when a lip-sync task is created."""

    # Identifier of the created task.
    task_id: str
    # Task state string (the create handler reports "created").
    state: str
|
||
|
||
|
||
class LipSyncQueryResponse(BaseModel):
    """Response payload returned when a lip-sync task is queried."""

    # Identifier of the queried task.
    task_id: str
    # Task state string as reported by the service.
    state: str
    # URL of the generated video, when available.
    video_url: str | None = None
    # URL of the generated cover image, when available.
    cover_url: str | None = None
|
||
|
||
|
||
# ========== API 路由 ==========
|
||
|
||
|
||
@router.post("/upload", response_model=ApiResponse[VoiceUploadResponse])
async def upload_voice_file(
    file: UploadFile = File(...),
    file_type: str = Form(default="audio", description="文件类型: audio | video"),
):
    """
    Upload an audio or video file to Qiniu and return its URL and storage key.

    Args:
        file: The uploaded file. Its declared content type must be one of the
            MIME types accepted for ``file_type``.
        file_type: ``"audio"`` (at most 20MB) or ``"video"`` (at most 200MB);
            matched case-insensitively after stripping whitespace.

    Returns:
        A success response wrapping ``VoiceUploadResponse(url, key)``.

    Raises:
        HTTPException: 400 for an unknown ``file_type``, a disallowed content
            type or an oversized file; 500 for any other failure.
    """
    from io import BytesIO

    # Per-kind upload rules. "extensions" maps every accepted MIME type to the
    # extension used in the storage key. An explicit table avoids deriving odd
    # suffixes from the MIME subtype (e.g. "audio/x-wav" used to yield ".x-wav").
    upload_rules = {
        "audio": {
            "extensions": {
                "audio/mpeg": "mp3",
                "audio/mp3": "mp3",
                "audio/wav": "wav",
                "audio/x-wav": "wav",
                "audio/mp4": "mp4",
            },
            "max_size": 20 * 1024 * 1024,  # 20MB
            "prefix": "meijiaka-zj/voice",
            "label": "音频",
        },
        "video": {
            "extensions": {
                "video/mp4": "mp4",
                "video/quicktime": "mov",
            },
            "max_size": 200 * 1024 * 1024,  # 200MB
            "prefix": "meijiaka-zj/avatar",
            "label": "视频",
        },
    }

    try:
        file_type = file_type.lower().strip()
        rules = upload_rules.get(file_type)
        if rules is None:
            raise HTTPException(status_code=400, detail="file_type 必须是 audio 或 video")

        extensions = rules["extensions"]
        max_size = rules["max_size"]
        prefix = rules["prefix"]
        type_label = rules["label"]

        # Validate the declared MIME type. Joining the dict keys keeps the
        # "supported types" list in a stable order.
        content_type = file.content_type or "application/octet-stream"
        if content_type not in extensions:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的{type_label}格式: {content_type},仅支持 {', '.join(extensions)}",
            )

        # Read at most one byte past the limit: enough to detect an oversized
        # upload without pulling an arbitrarily large file into memory.
        content = await file.read(max_size + 1)
        if len(content) > max_size:
            raise HTTPException(status_code=400, detail=f"{type_label}文件大小不能超过 {max_size // 1024 // 1024}MB")

        # Random storage key under the per-kind prefix.
        ext = extensions[content_type]
        key = f"{prefix}/{uuid.uuid4().hex}.{ext}"

        qiniu = QiniuService()
        qiniu.upload_stream(
            stream=BytesIO(content),
            key=key,
            mime_type=content_type,
        )

        # Public URL; the media bucket is addressed through video_domain.
        url = qiniu.get_file_url(qiniu.video_domain, key)

        return success_response(
            data=VoiceUploadResponse(url=url, key=key),
            message="上传成功",
        )

    except HTTPException:
        # Validation errors raised above already carry the right status code.
        raise
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"[Voice] 上传失败: {e}")
        raise HTTPException(status_code=500, detail=f"上传失败: {str(e)}") from e
|
||
|
||
|
||
@router.get("/voices", response_model=ApiResponse[list[VoiceInfo]])
async def list_voices():
    """
    Return the list of available voices.

    The preset voice entries come from ViduTTSService.get_preset_voices() and
    are wrapped in VoiceInfo models for the response.
    """
    presets = ViduTTSService.get_preset_voices()
    voice_infos = [VoiceInfo(**preset) for preset in presets]
    return success_response(data=voice_infos, message="获取音色列表成功")
|
||
|
||
|
||
@router.get("/preset-voices/raw", response_model=ApiResponse[list[dict]])
async def list_preset_voices_raw():
    """
    [Deprecated] KlingAI official preset voice list.

    Voice features have moved to Vidu; this endpoint is kept only for
    backward compatibility and always returns an empty list.
    """
    migration_notice = "语音功能已迁移至 Vidu,请使用 /voices 获取音色列表"
    return success_response(data=[], message=migration_notice)
|
||
|
||
|
||
@router.post("/synthesize", response_model=ApiResponse[dict])
async def synthesize_speech(request: TTSSynthesizeRequest):
    """
    Synchronous TTS synthesis.

    Converts the request text to speech and returns the audio URL. Intended
    for short text (about 1000 characters or fewer); for long text prefer
    /synthesize-batch.

    Note: ``request.voice_language`` is accepted but not forwarded to the
    service by this handler.

    Raises:
        HTTPException: 422 when the service raises ValueError, 500 for any
            other failure.
    """
    try:
        tts = ViduTTSService()
        synth_kwargs = {
            "text": request.text,
            "voice_id": request.voice_id,
            "speed": request.speed,
            "volume": request.volume,
            "pitch": request.pitch,
        }
        audio_url = await tts.synthesize_sync(**synth_kwargs)

        # Report the voice actually requested, or the service default.
        payload = {
            "audio_url": audio_url,
            "format": "mp3",
            "text": request.text,
            "voice_id": request.voice_id or ViduTTSService.DEFAULT_VOICE_ID,
        }
        return success_response(data=payload, message="合成成功")

    except ValueError as exc:
        logger.warning(f"[Voice] TTS 参数错误: {exc}")
        raise HTTPException(status_code=422, detail=str(exc))
    except Exception as exc:
        logger.error(f"[Voice] TTS 合成失败: {exc}")
        raise HTTPException(status_code=500, detail=f"合成失败: {exc!s}")
|
||
|
||
|
||
@router.post("/synthesize-batch", response_model=ApiResponse[TTSBatchResponse])
async def synthesize_batch(request: TTSBatchRequest):
    """
    Synthesize several text segments, one service call per segment.

    Segments are sent to ViduTTSService.synthesize_sync in request order with
    the shared voice settings. A failing segment does not abort the batch: it
    is logged and reported in ``results`` with ``success=False`` and the error
    text, while successful segments carry their ``audio_url``.

    Returns:
        A success response wrapping TTSBatchResponse (totals plus one result
        dict per segment).

    Raises:
        HTTPException: 500 when something outside the per-segment handling
            fails.
    """
    try:
        # NOTE(review): this scratch directory is created but nothing in this
        # handler writes to it (results are returned as URLs). The creation is
        # kept to preserve the existing side effect - confirm whether anything
        # else relies on it before removing.
        scratch_dir = Path(tempfile.gettempdir()) / "meijiaka-zj_tts"
        scratch_dir.mkdir(parents=True, exist_ok=True)

        service = ViduTTSService()

        # No batch call is used here; segments are synthesized one by one.
        results = []
        for segment in request.segments:
            try:
                audio_url = await service.synthesize_sync(
                    text=segment.text,
                    voice_id=request.voice_id,
                    speed=request.speed,
                    volume=request.volume,
                    pitch=request.pitch,
                )
            except Exception as e:
                # Keep going, but leave a trace in the server log: previously
                # a failed segment was only visible in the response body.
                logger.warning(f"[Voice] 批量 TTS 第 {segment.index} 段合成失败: {e}")
                results.append({
                    "index": segment.index,
                    "success": False,
                    "error": str(e),
                    "filename": segment.filename,
                })
            else:
                results.append({
                    "index": segment.index,
                    "success": True,
                    "audio_url": audio_url,
                    "filename": segment.filename,
                })

        success_count = sum(1 for r in results if r["success"])
        failed_count = len(results) - success_count

        return success_response(
            data=TTSBatchResponse(
                total=len(results),
                success_count=success_count,
                failed_count=failed_count,
                results=results,
            ),
            message=f"批量合成完成:成功 {success_count} 段,失败 {failed_count} 段",
        )

    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"[Voice] 批量 TTS 失败: {e}")
        raise HTTPException(status_code=500, detail=f"批量合成失败: {str(e)}") from e
|
||
|
||
|
||
@router.post("/synthesize-file", response_model=ApiResponse[dict])
async def synthesize_to_file(request: TTSSynthesizeRequest, output_path: str):
    """
    Synthesize speech and save the audio to ``output_path`` on the server.

    The text is synthesized through ViduTTSService, the resulting audio URL
    is downloaded with httpx, and the bytes are written to ``output_path``
    (parent directories are created as needed).

    Args:
        request: Synthesis parameters.
        output_path: Destination file path on the machine running this API.

    Raises:
        HTTPException: 422 for a blank or directory ``output_path`` or when
            the service raises ValueError; 500 for any other failure.
    """
    # SECURITY NOTE(review): output_path is taken directly from the request,
    # so any client that can reach this endpoint can make the server write to
    # any location the process is allowed to write. If this API is reachable
    # by untrusted clients, confine the path to an allow-listed base directory.
    try:
        # Validate the destination before spending a synthesis call on it.
        if not output_path.strip():
            raise ValueError("output_path 不能为空")
        target = Path(output_path)
        if target.is_dir():
            raise ValueError("output_path 必须是文件路径,不能是目录")

        service = ViduTTSService()
        audio_url = await service.synthesize_sync(
            text=request.text,
            voice_id=request.voice_id,
            speed=request.speed,
            volume=request.volume,
            pitch=request.pitch,
        )

        # Download the synthesized audio and write it to the requested path.
        import httpx

        async with httpx.AsyncClient() as client:
            response = await client.get(audio_url)
            response.raise_for_status()

        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(response.content)

        return success_response(
            data={
                "file_path": output_path,
                "text": request.text,
                "voice_id": request.voice_id or ViduTTSService.DEFAULT_VOICE_ID,
            },
            message="文件保存成功",
        )

    except ValueError as e:
        logger.warning(f"[Voice] TTS 参数错误: {e}")
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"[Voice] TTS 文件保存失败: {e}")
        raise HTTPException(status_code=500, detail=f"保存失败: {str(e)}") from e
|
||
|
||
|
||
def _normalize_voice_id(name: str | None) -> str:
|
||
"""
|
||
将用户输入的名称规范化为 Vidu 合法的 voice_id。
|
||
|
||
Vidu 要求:8~256 字符,首字符必须是字母。
|
||
"""
|
||
if not name:
|
||
return f"vidu_{uuid.uuid4().hex[:8]}"
|
||
|
||
# 只保留字母、数字、下划线
|
||
cleaned = re.sub(r"[^a-zA-Z0-9_]", "", name)
|
||
|
||
# 确保首字符是字母
|
||
if cleaned and not cleaned[0].isalpha():
|
||
cleaned = "v" + cleaned
|
||
elif not cleaned:
|
||
cleaned = "voice"
|
||
|
||
# 长度不足 8,补足随机字符
|
||
if len(cleaned) < 8:
|
||
cleaned = cleaned + uuid.uuid4().hex[: (8 - len(cleaned))]
|
||
|
||
# 长度超过 256,截断
|
||
if len(cleaned) > 256:
|
||
cleaned = cleaned[:256]
|
||
|
||
return cleaned
|
||
|
||
|
||
@router.post("/clone/submit", response_model=ApiResponse[VoiceCloneTaskResponse])
async def submit_clone_task(request: VoiceCloneSubmitRequest):
    """
    Submit a voice-clone job (Vidu).

    The Vidu clone call is awaited directly, so the response already carries
    the final result and is reported with status "succeeded".

    Only ``source_audio_url`` and ``voice_name`` are used by this handler;
    ``source_video_url`` and ``video_id`` are accepted but not forwarded.

    Raises:
        HTTPException: 422 when ``source_audio_url`` is missing or the service
            raises ValueError; 500 for any other failure.
    """
    try:
        # Fail fast instead of sending an empty URL to the clone service.
        audio_url = (request.source_audio_url or "").strip()
        if not audio_url:
            raise ValueError("source_audio_url 不能为空")

        voice_id = _normalize_voice_id(request.voice_name)
        service = ViduTTSService()
        result = await service.clone_voice(
            audio_url=audio_url,
            voice_id=voice_id,
        )

        # The call returns the finished result, so the status is "succeeded".
        return success_response(
            data=VoiceCloneTaskResponse(
                # "or" also covers an explicit null task_id in the result,
                # which would otherwise fail response validation.
                task_id=result.get("task_id") or "",
                status="succeeded",
                voice_id=result.get("voice_id"),
                trial_url=result.get("demo_audio"),
            ),
            message="克隆成功",
        )

    except ValueError as e:
        logger.warning(f"[Voice] 克隆参数错误: {e}")
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"[Voice] 提交克隆任务失败: {e}")
        raise HTTPException(status_code=500, detail=f"提交失败: {str(e)}") from e
|
||
|
||
|
||
@router.get("/clone/query/{task_id}", response_model=ApiResponse[VoiceCloneTaskResponse])
async def query_clone_task(task_id: str, blocking: bool = False):
    """
    Query the status of a voice-clone task (Vidu).

    Vidu voice cloning is synchronous, so this endpoint exists only for
    compatibility: it echoes ``task_id`` back with status "succeeded" without
    contacting any service. ``blocking`` is accepted but unused.
    """
    finished_task = VoiceCloneTaskResponse(task_id=task_id, status="succeeded")
    return success_response(data=finished_task, message="克隆已完成")
|
||
|
||
|
||
@router.post("/clone/clone-and-wait", response_model=ApiResponse[VoiceCloneTaskResponse])
async def clone_and_wait(request: VoiceCloneSubmitRequest, poll_interval: float = 5.0):
    """
    One-stop clone: submit the job and return its final state.

    The clone call is awaited directly and its result is returned with status
    "succeeded"; no polling takes place, so ``poll_interval`` is accepted but
    unused.

    Only ``source_audio_url`` and ``voice_name`` are used by this handler;
    ``source_video_url`` and ``video_id`` are accepted but not forwarded.

    Raises:
        HTTPException: 422 when ``source_audio_url`` is missing or the service
            raises ValueError; 500 for any other failure.
    """
    try:
        # Fail fast instead of sending an empty URL to the clone service.
        audio_url = (request.source_audio_url or "").strip()
        if not audio_url:
            raise ValueError("source_audio_url 不能为空")

        voice_id = _normalize_voice_id(request.voice_name)
        service = ViduTTSService()
        result = await service.clone_voice(
            audio_url=audio_url,
            voice_id=voice_id,
        )

        return success_response(
            data=VoiceCloneTaskResponse(
                # "or" also covers an explicit null task_id in the result,
                # which would otherwise fail response validation.
                task_id=result.get("task_id") or "",
                status="succeeded",
                voice_id=result.get("voice_id"),
                trial_url=result.get("demo_audio"),
            ),
            message="克隆成功",
        )

    except ValueError as e:
        logger.warning(f"[Voice] 克隆参数错误: {e}")
        raise HTTPException(status_code=422, detail=str(e)) from e
    except Exception as e:
        # logger.exception records the traceback along with the message.
        logger.exception(f"[Voice] 克隆失败: {e}")
        raise HTTPException(status_code=500, detail=f"克隆失败: {str(e)}") from e
|
||
|
||
|
||
# ==================== 对口型 ====================
|
||
|
||
|
||
@router.post("/lip-sync", response_model=ApiResponse[LipSyncResponse])
async def create_lip_sync(request: LipSyncRequest):
    """
    Create a lip-sync task (asynchronous API).

    Takes a video plus either audio or text and starts generation of a
    lip-synced video. Returns a task_id; poll /lip-sync/{task_id} for the
    result.

    Raises:
        HTTPException: 422 when neither ``audio_url`` nor ``text`` is given or
            the service raises ValueError; 500 for any other failure.
    """
    try:
        # At least one driving input (audio or text) is required.
        if not (request.audio_url or request.text):
            raise ValueError("audio_url 和 text 至少传一个")

        lip_sync_args = {
            "video_url": request.video_url,
            "audio_url": request.audio_url,
            "text": request.text,
            "voice_id": request.voice_id,
            "speed": request.speed,
            "volume": request.volume,
            "ref_photo_url": request.ref_photo_url,
        }
        vidu = ViduTTSService()
        task_id = await vidu.lip_sync_create(**lip_sync_args)

        created = LipSyncResponse(task_id=task_id, state="created")
        return success_response(data=created, message="对口型任务已创建")

    except ValueError as exc:
        logger.warning(f"[Voice] 对口型参数错误: {exc}")
        raise HTTPException(status_code=422, detail=str(exc))
    except Exception as exc:
        logger.error(f"[Voice] 对口型任务创建失败: {exc}")
        raise HTTPException(status_code=500, detail=f"创建失败: {exc!s}")
|
||
|
||
|
||
@router.get("/lip-sync/{task_id}", response_model=ApiResponse[LipSyncQueryResponse])
async def query_lip_sync(task_id: str):
    """
    Query the status of a lip-sync task.

    Returns the task state and, when the service reports any creations, the
    video and cover URLs of the first one (the original documentation states
    these URLs are valid for 24 hours).

    Raises:
        HTTPException: 500 when the query fails.
    """
    try:
        vidu = ViduTTSService()
        task_info = await vidu.lip_sync_query(task_id)

        state = task_info.get("state", "unknown")
        creations = task_info.get("creations", [])

        # Only the first creation is surfaced; both URLs stay None without one.
        if creations:
            first_creation = creations[0]
            video_url = first_creation.get("url")
            cover_url = first_creation.get("cover_url")
        else:
            video_url = None
            cover_url = None

        payload = LipSyncQueryResponse(
            task_id=task_id,
            state=state,
            video_url=video_url,
            cover_url=cover_url,
        )
        return success_response(data=payload, message=f"任务状态: {state}")

    except Exception as exc:
        logger.error(f"[Voice] 查询对口型任务失败: {exc}")
        raise HTTPException(status_code=500, detail=f"查询失败: {exc!s}")
|
||
|
||
|