Files
meijiaka-zy/python-api/app/api/v1/voice.py
T
小鱼开发 0722225c62 feat(points): 积分流水表支持时长显示,说明字段简化
后端:
- PointTransaction 模型添加 duration 字段(float, nullable)
- PointTransactionItem schema 添加 duration
- consume() 新增 duration 参数,写入流水记录
- 各业务 description 统一简化为【脚本生成】【配音合成】等格式
- duration 类业务(tts/video)传入实际秒数
- Alembic 迁移: 95eb1a1c0af9_add_duration_to_point_transaction

前端:
- PointTransaction 类型添加 duration
- UsageDetail: 来源列 → 时长列(有值显示 xs,无值显示 -)
- 说明列直接显示后端返回的简化描述
2026-05-09 17:08:50 +08:00

531 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
语音合成与克隆 API 路由
=======================
提供 TTS 语音合成、批量合成、声音克隆等功能。
基于 Vidu API。
"""
import logging
import re
import uuid
from pathlib import Path
import httpx
from fastapi import APIRouter, Depends, File, Form, HTTPException, Request, UploadFile
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from app.api.deps import get_current_user
from app.core.exceptions import PlatformError
from app.db.session import get_db
from app.models.user import User
from app.schemas.common import ApiResponse, success_response
from app.services import point_service as ps
from app.services.vidu_service import (
DEFAULT_VOICE_ID,
ViduService,
get_preset_voices,
get_vidu_service,
)
from app.utils.audio_utils import get_audio_duration
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Router holding every endpoint defined in this module; all paths are prefixed with "/voice".
router = APIRouter(prefix="/voice", tags=["Voice"])
# ========== Request / response models ==========
class TTSSynthesizeRequest(BaseModel):
    """Request body for a single TTS synthesis call."""

    # Text to synthesize; must contain 1 to 10000 characters.
    text: str = Field(..., min_length=1, max_length=10000, description="待合成文本(≤10000字符)")
    # Optional voice ID.
    voice_id: str | None = Field(None, description="音色 ID(默认:甜美女性)")
    # Speech rate multiplier, constrained to 0.5-2.0.
    speed: float = Field(default=1.0, ge=0.5, le=2.0, description="语速 0.5-2.0")
    # Voice language code.
    # NOTE(review): the synthesis handlers visible in this module do not forward
    # this field to the service; confirm whether it is still needed.
    voice_language: str = Field(default="zh", description="音色语种 (zh/en)")
    # Volume, constrained to 0-10. The description previously read "0-100=正常)"
    # (missing opening parenthesis), which contradicted the le=10 constraint.
    volume: int = Field(default=0, ge=0, le=10, description="音量 0-10(0=正常)")
    # Pitch shift, constrained to -12..12.
    pitch: int = Field(default=0, ge=-12, le=12, description="音调 -12 到 12")
class TTSBatchSegment(BaseModel):
    """One text segment inside a batch synthesis request."""

    # Segment text; must be non-empty.
    text: str = Field(
        ...,
        min_length=1,
        description="段落文本",
    )
    # Zero-based (non-negative) position of the segment.
    index: int = Field(
        0,
        ge=0,
        description="段落序号",
    )
    # Optional output file name, without extension.
    filename: str | None = Field(
        default=None,
        description="输出文件名(不含扩展名)",
    )
class TTSBatchRequest(BaseModel):
    """Request body for batch TTS synthesis; the voice settings apply to every segment."""

    # Segments to synthesize; at least one is required.
    segments: list[TTSBatchSegment] = Field(
        ...,
        min_length=1,
        description="段落列表",
    )
    # Optional voice ID shared by all segments.
    voice_id: str | None = Field(
        default=None,
        description="音色 ID",
    )
    # Speech rate multiplier, constrained to 0.5-2.0.
    speed: float = Field(
        1.0,
        ge=0.5,
        le=2.0,
        description="语速",
    )
    # Volume, constrained to 0-10.
    volume: int = Field(
        0,
        ge=0,
        le=10,
        description="音量 0-10",
    )
    # Pitch shift, constrained to -12..12.
    pitch: int = Field(
        0,
        ge=-12,
        le=12,
        description="音调 -12 到 12",
    )
class VoiceCloneSubmitRequest(BaseModel):
    """Request body for submitting a voice-clone job."""

    # Source audio URL. The description previously read "URL5-30秒…)" with the
    # opening parenthesis missing; restored here.
    source_audio_url: str | None = Field(None, description="源音频 URL(5-30秒,mp3/wav,需公开可访问)")
    # Optional source video URL.
    # NOTE(review): the clone handlers visible in this module only forward
    # source_audio_url; confirm whether this field and video_id are consumed elsewhere.
    source_video_url: str | None = Field(None, description="源视频 URL(可选)")
    # Optional ID of a previously created work.
    video_id: str | None = Field(None, description="历史作品ID(可选)")
    # Optional custom voice name. The 20-character limit in the description is
    # not enforced by this model.
    voice_name: str | None = Field(None, description="自定义音色名称(≤20字符)")
class TTSBatchResponse(BaseModel):
    """Aggregate outcome of a batch synthesis call."""

    # Number of segments processed.
    total: int
    # Number of segments that were synthesized successfully.
    success_count: int
    # Number of segments that failed.
    failed_count: int
    # One plain dict per segment, as assembled by the batch handler.
    results: list[dict]
class VoiceCloneTaskResponse(BaseModel):
    """State of a voice-clone task as reported to the client."""

    # Task identifier.
    task_id: str
    # Task status string (the handlers in this module report "succeeded").
    status: str
    # Identifier of the cloned voice, when available.
    voice_id: str | None = Field(default=None)
    # URL of a trial/demo audio clip, when available.
    trial_url: str | None = Field(default=None)
    # Error description, when the task failed.
    error_message: str | None = Field(default=None)
class VoiceUploadResponse(BaseModel):
    """Location of a file after it has been uploaded."""

    # Access URL of the uploaded object.
    url: str = Field(
        default=...,
        description="七牛云访问 URL",
    )
    # Storage key of the uploaded object.
    key: str = Field(
        default=...,
        description="存储 Key",
    )
class VoiceInfo(BaseModel):
    """Description of one selectable voice."""

    # Identifier passed to the synthesis endpoints.
    voice_id: str
    # Display name.
    name: str
    # Free-form description; empty by default.
    description: str = Field(default="")
    # Language code; defaults to "zh".
    language: str = Field(default="zh")
    # Whether the voice is flagged as recommended.
    recommended: bool = Field(default=False)
    # Optional preview audio URL (camelCase is part of the wire format).
    previewUrl: str | None = Field(default=None)
# ========== API 路由 ==========
@router.post("/upload", response_model=ApiResponse[VoiceUploadResponse])
async def upload_voice_file(
    file: UploadFile = File(...),
    file_type: str = Form(default="audio", description="文件类型: audio | video"),
):
    """
    Upload an audio or video file to Qiniu storage.

    Args:
        file: The multipart upload. Its declared content type must be one of the
            MIME types allowed for the selected ``file_type``.
        file_type: ``"audio"`` (default) or ``"video"``; case and surrounding
            whitespace are ignored.

    Returns:
        A success response wrapping ``VoiceUploadResponse`` with the access URL
        and the storage key.

    Raises:
        HTTPException: 400 for an unknown ``file_type``, an unsupported content
            type or an oversized file; 500 for any other failure.
    """
    try:
        file_type = file_type.lower().strip()
        if file_type not in ("audio", "video"):
            raise HTTPException(status_code=400, detail="file_type 必须是 audio 或 video")

        # Allowed MIME type -> file extension. An explicit table avoids deriving
        # the extension from the MIME subtype, which produced ".x-wav" for
        # "audio/x-wav".
        if file_type == "audio":
            extensions = {
                "audio/mpeg": "mp3",
                "audio/mp3": "mp3",
                "audio/wav": "wav",
                "audio/x-wav": "wav",
                "audio/mp4": "mp4",
            }
            max_size = 20 * 1024 * 1024  # 20MB
            prefix = "meijiaka-zy/voice"
            type_label = "音频"
        else:
            extensions = {
                "video/mp4": "mp4",
                "video/quicktime": "mov",
            }
            max_size = 200 * 1024 * 1024  # 200MB
            prefix = "meijiaka-zy/voice_clone"
            type_label = "视频"

        content_type = file.content_type or "application/octet-stream"
        if content_type not in extensions:
            # Sorted so the error message is stable from one request to the next.
            supported = ", ".join(sorted(extensions))
            raise HTTPException(
                status_code=400,
                detail=f"不支持的{type_label}格式: {content_type},仅支持 {supported}",
            )

        # The whole payload is read into memory before the size check.
        content = await file.read()
        if len(content) > max_size:
            raise HTTPException(status_code=400, detail=f"{type_label}文件大小不能超过 {max_size // 1024 // 1024}MB")

        # Random storage key under the type-specific prefix.
        key = f"{prefix}/{uuid.uuid4().hex}.{extensions[content_type]}"

        # Imported at call time, as in the original implementation.
        from io import BytesIO

        from app.services.qiniu_service import get_qiniu_service

        qiniu = get_qiniu_service()
        await qiniu.upload_stream_async(
            stream=BytesIO(content),
            key=key,
            mime_type=content_type,
        )

        # The access URL is built from the service's video_domain.
        url = qiniu.get_file_url(qiniu.video_domain, key)
        return success_response(
            data=VoiceUploadResponse(url=url, key=key),
            message="上传成功",
        )
    except HTTPException:
        # Validation errors raised above pass through untouched.
        raise
    except Exception as e:
        logger.error(f"[Voice] 上传失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"上传失败: {str(e)}") from e
@router.get("/voices", response_model=ApiResponse[list[VoiceInfo]])
async def list_voices():
    """
    List the preset voices that can be used for TTS synthesis.

    Returns:
        A success response wrapping one ``VoiceInfo`` per preset voice.
    """
    voice_items = []
    for preset in get_preset_voices():
        # Each preset is a mapping whose keys match the VoiceInfo fields.
        voice_items.append(VoiceInfo(**preset))
    return success_response(data=voice_items, message="获取音色列表成功")
@router.post("/synthesize", response_model=ApiResponse[dict])
async def synthesize_speech(
    request: TTSSynthesizeRequest,
    service: ViduService = Depends(get_vidu_service),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Synthesize speech for a single text and return the audio URL.

    After a successful synthesis the audio duration is probed and the matching
    number of points is charged to the current user. Billing is best-effort: a
    billing failure is logged and does not affect the synthesis response.

    Args:
        request: Text and voice parameters.
        service: Synthesis service dependency.
        db: Database session used for the points transaction.
        current_user: The authenticated user being charged.

    Returns:
        A success response whose data holds ``audio_url``, ``format``, ``text``
        and the effective ``voice_id``.

    Raises:
        PlatformError: Propagated unchanged from the synthesis service.
        HTTPException: 500 for any other synthesis failure.
    """
    # Imported here because the billing source_id below needs the event-loop
    # clock. Without this import the name lookup raised NameError, which the
    # billing `except` swallowed, so no synthesis was ever charged.
    import asyncio

    try:
        audio_url = await service.synthesize(
            text=request.text,
            voice_id=request.voice_id,
            speed=request.speed,
            volume=request.volume,
            pitch=request.pitch,
        )

        # Probe the audio duration and charge for it.
        try:
            seconds = await get_audio_duration(audio_url)
            points = ps._calculate_cost("tts", {"seconds": seconds})
            await ps.consume(
                db,
                user_id=current_user.id,
                points=points,
                source_type="tts",
                source_id=f"tts_{current_user.id}_{asyncio.get_running_loop().time()}",
                description="【配音合成】",
                duration=seconds,
            )
            await db.commit()
        except Exception as e:
            # Deliberate best-effort: a billing failure must not fail the synthesis.
            logger.error(f"[Voice] TTS 扣费失败: {e}", exc_info=True)

        return success_response(
            data={
                "audio_url": audio_url,
                "format": "mp3",
                "text": request.text,
                "voice_id": request.voice_id or DEFAULT_VOICE_ID,
            },
            message="合成成功",
        )
    except PlatformError:
        raise
    except Exception as e:
        logger.error(f"[Voice] TTS 合成失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="语音合成失败,请稍后重试") from e
@router.post("/synthesize-batch", response_model=ApiResponse[TTSBatchResponse])
async def synthesize_batch(
    request: TTSBatchRequest,
    service: ViduService = Depends(get_vidu_service),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Synthesize several text segments one after another.

    A failing segment is recorded in the result list and does not stop the
    remaining segments. Once all segments are processed, the durations of the
    successful ones are summed and charged in a single points transaction.
    Billing is best-effort and never fails the request.

    Args:
        request: Segments plus the voice parameters shared by all of them.
        service: Synthesis service dependency.
        db: Database session used for the points transaction.
        current_user: The authenticated user being charged.

    Returns:
        A success response wrapping ``TTSBatchResponse``.

    Raises:
        PlatformError: Propagated unchanged if raised outside the per-segment handling.
        HTTPException: 500 for any other unexpected failure.
    """
    # Imported here because the billing source_id below needs the event-loop
    # clock. Without this import the name lookup raised NameError, which the
    # billing `except` swallowed, so batch synthesis was never charged.
    import asyncio

    try:
        results = []
        for seg in request.segments:
            try:
                audio_url = await service.synthesize(
                    text=seg.text,
                    voice_id=request.voice_id,
                    speed=request.speed,
                    volume=request.volume,
                    pitch=request.pitch,
                )
                results.append({
                    "index": seg.index,
                    "success": True,
                    "audio_url": audio_url,
                    "filename": seg.filename,
                })
            except Exception as e:
                # Record the failure for this segment and carry on with the rest.
                error_msg = str(e)
                if isinstance(e, PlatformError):
                    error_msg = f"[{e.platform}] {e.error_type}: {e}"
                results.append({
                    "index": seg.index,
                    "success": False,
                    "error": error_msg,
                    "filename": seg.filename,
                })

        # Sum the durations of the successful segments; a segment whose duration
        # cannot be probed is simply left out of the total.
        total_seconds = 0.0
        for r in results:
            if r["success"] and r.get("audio_url"):
                try:
                    total_seconds += await get_audio_duration(r["audio_url"])
                except Exception as e:
                    logger.warning(f"[Voice] 批量探测时长失败: {e}")

        if total_seconds > 0:
            try:
                points = ps._calculate_cost("tts", {"seconds": total_seconds})
                await ps.consume(
                    db,
                    user_id=current_user.id,
                    points=points,
                    source_type="tts",
                    source_id=f"tts_batch_{current_user.id}_{asyncio.get_running_loop().time()}",
                    description="【配音合成】",
                    duration=total_seconds,
                )
                await db.commit()
            except Exception as e:
                # Deliberate best-effort: a billing failure must not fail the batch.
                logger.error(f"[Voice] 批量 TTS 扣费失败: {e}", exc_info=True)

        success_count = sum(1 for r in results if r["success"])
        failed_count = len(results) - success_count
        return success_response(
            data=TTSBatchResponse(
                total=len(results),
                success_count=success_count,
                failed_count=failed_count,
                results=results,
            ),
            message=f"批量合成完成:成功 {success_count} 段,失败 {failed_count}",
        )
    except PlatformError:
        raise
    except Exception as e:
        logger.error(f"[Voice] 批量 TTS 失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"批量合成失败: {str(e)}") from e
@router.post("/synthesize-file", response_model=ApiResponse[dict])
async def synthesize_to_file(
    request_body: TTSSynthesizeRequest,
    output_path: str,
    service: ViduService = Depends(get_vidu_service),
    request: Request = None,
):
    """
    Synthesize speech and save the resulting audio to a path on the server.

    Args:
        request_body: Text and voice parameters.
        output_path: Destination file path; missing parent directories are created.
        service: Synthesis service dependency.
        request: The incoming request. When present, the shared "default" HTTP
            client from ``app.state.http_clients`` is used for the download;
            otherwise a temporary client is created and closed afterwards.

    Returns:
        A success response whose data holds ``file_path``, ``text`` and the
        effective ``voice_id``.

    Raises:
        PlatformError: Propagated unchanged from the synthesis service.
        HTTPException: 500 for any other failure (download, filesystem, ...).
    """
    # SECURITY NOTE(review): output_path comes straight from the request and is
    # written to as-is. This handler does not confine it to a base directory and
    # declares no authentication dependency. Confirm that the endpoint is only
    # reachable by trusted callers, or constrain the path.
    import asyncio

    try:
        audio_url = await service.synthesize(
            text=request_body.text,
            voice_id=request_body.voice_id,
            speed=request_body.speed,
            volume=request_body.volume,
            pitch=request_body.pitch,
        )

        # Download the audio and write it to the requested location.
        client = request.app.state.http_clients["default"] if request else httpx.AsyncClient(timeout=30.0)
        try:
            response = await client.get(audio_url)
            response.raise_for_status()
            target = Path(output_path)
            # Filesystem calls block, so run them in a worker thread to keep the
            # event loop free for other requests.
            await asyncio.to_thread(target.parent.mkdir, parents=True, exist_ok=True)
            await asyncio.to_thread(target.write_bytes, response.content)
        finally:
            # Only close the client created here; the shared one outlives the request.
            if not request:
                await client.aclose()

        return success_response(
            data={
                "file_path": output_path,
                "text": request_body.text,
                "voice_id": request_body.voice_id or DEFAULT_VOICE_ID,
            },
            message="文件保存成功",
        )
    except PlatformError:
        raise
    except Exception as e:
        logger.error(f"[Voice] TTS 文件保存失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="文件保存失败,请稍后重试") from e
def _normalize_voice_id(name: str | None) -> str:
"""
将用户输入的名称规范化为 Vidu 合法的 voice_id。
Vidu 要求:8~256 字符,首字符必须是字母。
"""
if not name:
return f"vidu_{uuid.uuid4().hex[:8]}"
# 只保留字母、数字、下划线
cleaned = re.sub(r"[^a-zA-Z0-9_]", "", name)
# 确保首字符是字母
if cleaned and not cleaned[0].isalpha():
cleaned = "v" + cleaned
elif not cleaned:
cleaned = "voice"
# 长度不足 8,补足随机字符
if len(cleaned) < 8:
cleaned = cleaned + uuid.uuid4().hex[: (8 - len(cleaned))]
# 长度超过 256,截断
if len(cleaned) > 256:
cleaned = cleaned[:256]
return cleaned
@router.post("/clone/submit", response_model=ApiResponse[VoiceCloneTaskResponse])
async def submit_clone_task(
    request: VoiceCloneSubmitRequest,
    service: ViduService = Depends(get_vidu_service),
    db: AsyncSession = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """
    Submit a voice-clone job (Vidu).

    The clone call is awaited directly and its result is returned with status
    "succeeded". After a successful clone the user is charged; billing is
    best-effort and a billing failure is only logged.

    Args:
        request: Clone parameters. Only ``source_audio_url`` and ``voice_name``
            are used by this handler.
        service: Clone service dependency.
        db: Database session used for the points transaction.
        current_user: The authenticated user being charged.

    Returns:
        A success response wrapping ``VoiceCloneTaskResponse``.

    Raises:
        PlatformError: Propagated unchanged from the clone service.
        HTTPException: 400 when a ``ValueError`` signals invalid parameters;
            500 for any other failure.
    """
    try:
        voice_id = _normalize_voice_id(request.voice_name)
        # NOTE(review): a missing source_audio_url is forwarded as an empty
        # string; source_video_url and video_id are not forwarded at all.
        result = await service.clone_voice(
            audio_url=request.source_audio_url or "",
            voice_id=voice_id,
        )

        # Charge for the clone.
        try:
            points = ps._calculate_cost("voice_clone")
            await ps.consume(
                db,
                user_id=current_user.id,
                points=points,
                source_type="voice_clone",
                source_id=result.get("voice_id", "unknown"),
                description="【声音克隆】",
            )
            await db.commit()
        except Exception as e:
            # Deliberate best-effort: a billing failure must not fail the clone.
            logger.error(f"[Voice] 克隆扣费失败: {e}", exc_info=True)

        # The result is already final, so the status is reported as succeeded.
        return success_response(
            data=VoiceCloneTaskResponse(
                task_id=result.get("task_id", ""),
                status="succeeded",
                voice_id=result.get("voice_id"),
                trial_url=result.get("demo_audio"),
            ),
            message="克隆成功",
        )
    except PlatformError:
        raise
    except ValueError as e:
        logger.error(f"[Voice] 提交克隆任务失败: {e}", exc_info=True)
        # Invalid parameters are a client error, so report 400 rather than 500.
        raise HTTPException(status_code=400, detail=f"参数错误: {e}") from e
    except Exception as e:
        logger.error(f"[Voice] 提交克隆任务失败: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"任务提交失败: {e}") from e
@router.get("/clone/query/{task_id}", response_model=ApiResponse[VoiceCloneTaskResponse])
async def query_clone_task(task_id: str, blocking: bool = False):
    """
    Report the status of a voice-clone task (Vidu).

    Kept as a compatibility endpoint: it performs no lookup and always reports
    the given task as succeeded.

    Args:
        task_id: Task identifier, echoed back in the response.
        blocking: Accepted but not used by this handler.

    Returns:
        A success response wrapping a ``VoiceCloneTaskResponse`` with status
        "succeeded".
    """
    finished = VoiceCloneTaskResponse(task_id=task_id, status="succeeded")
    return success_response(data=finished, message="克隆已完成")
@router.post("/clone/clone-and-wait", response_model=ApiResponse[VoiceCloneTaskResponse])
async def clone_and_wait(
    request: VoiceCloneSubmitRequest,
    service: ViduService = Depends(get_vidu_service),
    poll_interval: float = 5.0,
):
    """
    Clone a voice and return the final result in a single call.

    Args:
        request: Clone parameters. Only ``source_audio_url`` and ``voice_name``
            are used by this handler.
        service: Clone service dependency.
        poll_interval: Accepted but not used by this handler.

    Returns:
        A success response wrapping ``VoiceCloneTaskResponse`` with status
        "succeeded".

    Raises:
        PlatformError: Propagated unchanged from the clone service.
        HTTPException: 500 for any other failure.
    """
    # NOTE(review): unlike /clone/submit, this handler declares no user or
    # database dependency and charges no points. Confirm that is intended.
    try:
        normalized_id = _normalize_voice_id(request.voice_name)
        # A missing source URL is forwarded as an empty string.
        source_url = request.source_audio_url or ""
        clone_result = await service.clone_voice(
            audio_url=source_url,
            voice_id=normalized_id,
        )
        payload = VoiceCloneTaskResponse(
            task_id=clone_result.get("task_id", ""),
            status="succeeded",
            voice_id=clone_result.get("voice_id"),
            trial_url=clone_result.get("demo_audio"),
        )
        return success_response(data=payload, message="克隆成功")
    except PlatformError:
        raise
    except Exception as e:
        logger.error(f"[Voice] 克隆失败: {e}")
        raise HTTPException(status_code=500, detail="声音克隆失败,请稍后重试")