diff --git a/docs/mvp-lip-sync-replacement.md b/docs/mvp-lip-sync-replacement.md new file mode 100644 index 0000000..616ce68 --- /dev/null +++ b/docs/mvp-lip-sync-replacement.md @@ -0,0 +1,122 @@ +# MVP 实验:对口型视频替换空镜片段 + +## 目标 +通过音频打轴定位时间戳,用对口型后的人物视频片段替换空镜视频的对应片段。 + +## 完整流程 + +### 1. 音频打轴(Whisper) +对配音音频进行语音识别,生成带时间轴的 SRT 字幕文件。 + +```bash +cd /Users/0fun/work/meijiaka-zj/python-api +source .venv/bin/activate +whisper "/Users/0fun/Documents/Meijiaka-zj/projects/.../audios/voice_xxx.mp3" \ + --model base --language zh --output_format srt +``` + +**输出**:56 句话,总时长 ~75s,生成 `voice_xxx.srt`。 + +### 2. 文案定位 +输入目标文案,在 SRT 中匹配对应句子,得到时间范围。 + +**示例**: +- 文案:`"新房装修这七个时间,你必须在场盯着"` +- 匹配结果:时间范围 `0.000s ~ 4.120s` +- 涉及片段:segment 1~2 + +### 3. 片段截取 + +#### 人物视频(静音画面 + 对口型音频) +```bash +# 画面:0~4.12s +ffmpeg -y -ss 0 -t 4.12 -i video.mp4 -c:v libx264 -an clip_video.mp4 + +# 音频:0~4.12s +ffmpeg -y -ss 0 -t 4.12 -i video.mp4 -vn -c:a copy clip_audio.mp3 +``` + +**音频检测**: +- mean_volume: -20.8 dB +- max_volume: -3.8 dB +- 格式:24000 Hz / mono / 69 kb/s + +### 4. 画面替换(FFmpeg overlay) +将人物视频画面覆盖到 composed 视频对应时间段。 + +```bash +ffmpeg -y \ + -i composed.mp4 \ + -i video.mp4 \ + -filter_complex \ + "[1:v]setpts=PTS-STARTPTS[clip]; + [0:v][clip]overlay=enable='between(t\,0\,4.12)':x=0:y=0[v]" \ + -map "[v]" -map 0:a \ + -c:v libx264 -crf 18 -preset fast \ + -c:a copy \ + composed_overlay.mp4 +``` + +### 5. 
音频拼接(关键步骤) +将对口型音频插入到 0~4.12s,原 composed 音频接在后面。 + +#### ❌ 第一次尝试(失败) +```bash +[a_rep]atrim=start=0:end=4.12...; +[a_tail]atrim=start=4.12...; +[a_rep][a_tail]concat=n=2:v=0:a=1[a] +``` + +**问题**:concat 要求所有输入流的**采样率和声道数一致**。 +- 对口型音频:24000 Hz / mono +- composed 音频:44100 Hz / stereo + +结果:拼接后开头 4 秒无声。 + +#### ✅ 修复方案 +在 concat 前统一音频格式: + +```bash +[1:a]aresample=44100,pan=stereo|c0=c0|c1=c0,atrim=start=0:end=4.12,asetpts=PTS-STARTPTS[a_rep]; +[0:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo,atrim=start=4.12,asetpts=PTS-STARTPTS[a_tail]; +[a_rep][a_tail]concat=n=2:v=0:a=1[a] +``` + +### 6. 完整替换命令(最终版) + +```bash +ffmpeg -y \ + -i "composed.mp4" \ + -i "video.mp4" \ + -filter_complex \ + "[1:v]setpts=PTS-STARTPTS[clip]; + [0:v][clip]overlay=enable='between(t\,0\,4.12)':x=0:y=0[v]; + [1:a]aresample=44100,pan=stereo|c0=c0|c1=c0,atrim=start=0:end=4.12,asetpts=PTS-STARTPTS[a_rep]; + [0:a]aformat=sample_fmts=fltp:sample_rates=44100:channel_layouts=stereo,atrim=start=4.12,asetpts=PTS-STARTPTS[a_tail]; + [a_rep][a_tail]concat=n=2:v=0:a=1[a]" \ + -map "[v]" \ + -map "[a]" \ + -c:v libx264 -crf 18 -preset fast \ + -c:a aac -b:a 128k \ + "composed_replaced.mp4" +``` + +**输出验证**: +- 视频:H.264 / 1080x1920 / 29.82 fps +- 音频:AAC / 44100 Hz / stereo / 37 kb/s +- 前 4.12s 音量:mean -20.9 dB / max -3.7 dB(正常有声) +- 总时长:74.27 秒 + +## 踩坑记录 + +| 问题 | 原因 | 解决方案 | +|------|------|----------| +| 音频拼接后开头无声 | concat 输入流采样率/声道不一致 | 用 `aresample` + `pan` 统一为 44100 Hz stereo | +| 视频画面不同步 | overlay 时间范围写错 | 确认 `between(t,0,4.12)` 与文案定位时间一致 | +| 音频格式差异 | 对口型音频 24000Hz mono,composed 44100Hz stereo | 拼接前强制格式统一 | + +## 后续可优化 + +1. **音频过渡**:concat 是硬切,可在拼接处加 `acrossfade` 实现淡入淡出 +2. **音量平衡**:对口型音频与 composed 音频音量差异大,可用 `volume` 滤镜统一 +3. 
**自动化**:将以上步骤封装为 Python/Rust 函数,输入文案自动完成定位→截取→替换 diff --git a/python-api/app/ai/prompts/polish/scene_empty_shot.txt b/python-api/app/ai/prompts/polish/scene_empty_shot.txt deleted file mode 100644 index 3a9b217..0000000 --- a/python-api/app/ai/prompts/polish/scene_empty_shot.txt +++ /dev/null @@ -1,13 +0,0 @@ -你是一位口播短视频专家。请润色以下空镜画面描述,使其更适合AI视频生成: - -【原文】 -{content} - -【要求】 -- 保持原意,优化细节 -- 重点强调场景环境、空间氛围、光影效果、材质质感 -- 可以描述静态景物、装修细节、空间布局 -- 不要有"镜头""特写""机位"等摄影术语 -- 控制好字数,字数不能与原文差距超过20个字 - -直接输出润色后的描述,不要添加任何说明: diff --git a/python-api/app/ai/prompts/polish/scene_segment.txt b/python-api/app/ai/prompts/polish/scene_segment.txt deleted file mode 100644 index 5f7a4d3..0000000 --- a/python-api/app/ai/prompts/polish/scene_segment.txt +++ /dev/null @@ -1,13 +0,0 @@ -你是一位【口播短视频】专家。请润色以下分镜画面描述,使其更适合AI视频生成: - -【原文】 -{content} - -【要求】 -- 保持原意,优化细节 -- 重点强调人物神态、表情、动作、姿态 -- 描述人物与镜头前观众的互动 -- 不要有"镜头""特写""机位"等摄影术语 -- 控制好字数,字数不能与原文差距超过20个字 - -直接输出润色后的描述,不要添加任何说明: diff --git a/python-api/app/ai/prompts/system/bk/jg/1.txt b/python-api/app/ai/prompts/system/bk/jg/1.txt new file mode 100644 index 0000000..76f1f77 --- /dev/null +++ b/python-api/app/ai/prompts/system/bk/jg/1.txt @@ -0,0 +1,71 @@ +你是一位专业的【口播类短视频】脚本创作专家,专注于家装/装修领域的抖音/视频号口播内容创作。 +【平台适配要求】 +1. 竖屏拍摄(9:16比例),画面构图以人物为主体 +2. 台词口语化、接地气,像跟朋友聊天,避免"综上所述""研究表明"等书面语 +3. 语速稍快有节奏感,每秒4个纯文字(不含标点),每句15-25字(对应3.75-6.25秒),一口气说完不换气,不拖沓 +4. 避免专业术语堆砌,用业主听得懂的大白话 +5. 
符合新媒体用户观看习惯:3秒定生死,节奏紧凑 +【文案要求】 +请严格按照以下固定结构,生成装修现场监工类口播文案,要求语言口语化、有警示性,贴合装修业主视角,结构严格不变,内容围绕“新房装修一定要在场的7个时间”展开,每部分内容完整,总文案包含标点符号不得超过450字: +开篇总起:明确核心警示——新房装修一定要在场的7个时间,尤其最后一个,直接关系家里是不是甲醛房,提醒认真看完,避免后期返工、踩坑受害,语气直接、有紧迫感。 +分点阐述(7点,严格遵循此顺序和格式): +每点均按照“监工场景+必做核查事项+不盯工的核心隐患”撰写,语言接地气,有劝诫感,避免生硬说教: +第1点:砸墙时必须在场,盯紧师傅封好下水口,避免管道堵塞,后期还要跑楼下疏通 +第2点:封窗时一定要在场,监督师傅做好防水斜坡,防止下雨天雨水往屋里倒灌 +第3点:水电验收必须在场,核对插座点位、检查强弱电包裹情况,记得拍照留存,避免后期返工 +第4点:防水瓷砖验收必在场,做闭水试验检查是否漏水,核对瓷砖型号,防止色差导致重铺 +第5点:贴砖时要在场,检查瓷砖平整度、空鼓率,确保阴阳角方正、缝隙均匀合格 +第6点:木工吊顶必在场,要求拐角用整板、接缝做V型槽,杜绝后期乳胶漆开裂 +第7点:刮腻子一定要在场,严禁师傅往腻子中加胶水,避免甲醛超标,变成毒气房 +结尾引流:补充提示——准备装修的朋友,我整理了避坑手册,评论区回避坑直接领取参考,帮你装修少踩坑,语气亲切贴合业主需求。 +提示:文案整体风格通俗好记,有代入感,符合普通装修业主的认知,避免专业术语过多,每部分内容饱满,不遗漏核心要点,严格匹配上述结构,不新增、不删减板块。 +【素材库标题】 +网红开篇 +铺砖施工 +吊顶施工 +美缝施工 +水电施工 +壁纸施工 +刮腻子 +木工施工 +柜子安装 +乳胶漆 +签合同 +背景墙 +【分镜结构】 +开篇的分镜为:网红开头+人物出镜3秒+空镜补充 +分点阐述全部用空镜 +结尾人物出镜3秒+空镜补充 +每个分镜时长不得少于3秒,且不得高于8秒,可以是一位小数,如3.1秒 +且每个分镜配音文案的文字数量对应每秒4-5个纯文字(不含标点) +总分镜时长为:文案总字数/4 +"segment"(主播口播出镜)对应"人物出镜",且时长为3秒(对应12字左右纯文字) +人物出镜画面的内容,可以不用完整的句子,句子可以延伸到下一个画面 +"empty_shot"(空镜补充)对应"素材库标题" +配音文案必须要有标点符号断句,避免大长句,如:水电装错毁一生,错一个,返工就要好几万。 +【输出格式要求】 +输出的内容必须包含以下两部分 +一、分镜内容 +- id:1 +- type:"segment"(主播口播出镜)或 "empty_shot"(空镜补充) +- scene:"人物出镜"或"素材库标题" +- voiceover: 配音文案(必填,口语化15-25字/句,对应4-6秒) +- duration: 时长(如 "5s",根据字数生成,严格按每秒4字、不含标点,可保留2位小数,如12个字3.00s,17个字4.25s,19个字4.75s) +注意:只输出纯 JSON,不要包含 markdown 代码块或其他说明文字。 +【示例】 +[ + { + "id": 1, + "type": "empty_shot", + "scene": "网红开篇", + "voiceover": "装修签合同别踩坑!固定模板千万别直接签!", + "duration": 3 + }, + { + "id": 2, + "type": "segment", + "scene": "人物出镜", + "voiceover": "这8条内容,必须白纸黑字写进合同里!", + "duration": 3 + } +] \ No newline at end of file diff --git a/python-api/app/ai/providers/vidu_provider.py b/python-api/app/ai/providers/vidu_provider.py index 112e38b..678dbea 100644 --- a/python-api/app/ai/providers/vidu_provider.py +++ b/python-api/app/ai/providers/vidu_provider.py @@ -74,12 +74,14 @@ class ViduProvider: if payload: body["payload"] = payload + logger.info(f"[Vidu TTS] 请求参数: 
text_length={len(text)}, body={body}") + async with aiohttp.ClientSession() as session: async with session.post(url, json=body, headers=self._get_headers()) as resp: data = await resp.json() if resp.status != 200 or data.get("state") == "failed": msg = data.get("err_code") or data.get("message") or f"HTTP {resp.status}" - logger.error(f"[Vidu TTS] 请求失败: url={url}, status={resp.status}, headers={self._get_headers()}, body={body}, response={data}") + logger.error(f"[Vidu TTS] 请求失败: url={url}, status={resp.status}, response={data}") raise Exception(f"Vidu TTS error: {msg}") return data diff --git a/python-api/app/api/v1/system.py b/python-api/app/api/v1/system.py index 254b140..7f6241c 100644 --- a/python-api/app/api/v1/system.py +++ b/python-api/app/api/v1/system.py @@ -3,8 +3,10 @@ ============ """ -from fastapi import APIRouter +from fastapi import APIRouter, status +from fastapi.responses import JSONResponse +from app.core.health_checker import check_database, check_redis from app.schemas.common import ApiResponse, success_response router = APIRouter() @@ -13,16 +15,34 @@ router = APIRouter() @router.get("/health", response_model=ApiResponse[dict]) async def system_health(): """系统健康检查(详细版)""" - return success_response( - data={ - "status": "healthy", - "services": { - "api": "up", - "database": "unknown", # TODO: 检查数据库连接 - "redis": "unknown", # TODO: 检查 Redis 连接 + db_ok, db_msg = await check_database() + redis_ok, redis_msg = await check_redis() + + services = { + "api": "up", + "database": "connected" if db_ok else db_msg, + "redis": "connected" if redis_ok else redis_msg, + } + + if not db_ok: + return JSONResponse( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + content={ + "code": status.HTTP_503_SERVICE_UNAVAILABLE, + "message": "数据库连接异常", + "data": {"status": "unhealthy", "services": services}, }, - }, + ) + + if not redis_ok: + return success_response( + message="Redis 连接异常,服务降级", + data={"status": "degraded", "services": services}, + ) + + return 
success_response( message="系统运行正常", + data={"status": "healthy", "services": services}, ) diff --git a/python-api/app/api/v1/upload.py b/python-api/app/api/v1/upload.py index 1216b51..b641637 100644 --- a/python-api/app/api/v1/upload.py +++ b/python-api/app/api/v1/upload.py @@ -187,3 +187,87 @@ async def upload_image( except Exception as e: logger.error(f"[Upload] 图片上传失败: {e}") raise HTTPException(status_code=500, detail=f"上传失败: {e}") + + +@router.post("/audio", response_model=ApiResponse[UploadResponse]) +async def upload_audio( + file: UploadFile = File(..., description="音频文件"), +): + """ + 上传音频到七牛云 + + 支持格式:mp3, wav, aac, m4a, ogg, flac + """ + try: + allowed_types = { + "audio/mpeg", + "audio/mp3", + "audio/wav", + "audio/x-wav", + "audio/aac", + "audio/mp4", + "audio/ogg", + "audio/flac", + "audio/x-flac", + } + content_type = file.content_type or "" + + if not content_type: + ext = Path(file.filename or "").suffix.lower() + ext_to_mime = { + ".mp3": "audio/mpeg", + ".wav": "audio/wav", + ".aac": "audio/aac", + ".m4a": "audio/mp4", + ".ogg": "audio/ogg", + ".flac": "audio/flac", + } + content_type = ext_to_mime.get(ext, "") + + if content_type not in allowed_types: + raise HTTPException( + status_code=400, + detail=f"不支持的音频格式: {content_type},请上传 mp3/wav/aac/m4a/ogg/flac", + ) + + content = await file.read() + if not content: + raise HTTPException(status_code=400, detail="文件内容为空") + + ext = Path(file.filename or "audio.mp3").suffix or ".mp3" + unique_name = f"{uuid.uuid4().hex[:16]}{ext}" + + qiniu = get_qiniu_service() + # 复用视频 bucket(或根据配置使用音频 bucket) + bucket, domain = qiniu._get_bucket_and_domain("video") + key = qiniu.generate_key("audio", unique_name) + stream = io.BytesIO(content) + result = qiniu.upload_stream( + stream=stream, + key=key, + mime_type=content_type or "audio/mpeg", + bucket=bucket, + domain=domain, + ) + + url = result.get("url") + key = result.get("key") + + if not url: + raise HTTPException(status_code=500, detail="上传到七牛云失败:未返回 URL") + 
+ logger.info(f"[Upload] 音频上传成功: {url[:80]}..., size={len(content)}") + + return success_response( + data=UploadResponse( + url=url, + key=key or unique_name, + size=len(content), + ) + ) + + except HTTPException: + raise + except Exception as e: + logger.error(f"[Upload] 音频上传失败: {e}") + raise HTTPException(status_code=500, detail=f"上传失败: {e}") diff --git a/python-api/app/core/health_checker.py b/python-api/app/core/health_checker.py new file mode 100644 index 0000000..c48ac3d --- /dev/null +++ b/python-api/app/core/health_checker.py @@ -0,0 +1,34 @@ +""" +Health checks +============= +""" + +import asyncio + +from sqlalchemy import text + +from app.core.redis_client import get_redis_client +from app.db.session import async_engine + + +async def check_database(timeout: float = 2.0) -> tuple[bool, str]: + """Run SELECT 1 on the async engine within `timeout` seconds; return (ok, message).""" + try: + async with asyncio.timeout(timeout): + async with async_engine.connect() as conn: + await conn.execute(text("SELECT 1")) + await conn.commit() + return True, "connected" + except Exception as e: + return False, str(e) or type(e).__name__ + + +async def check_redis(timeout: float = 2.0) -> tuple[bool, str]: + """Ping Redis within `timeout` seconds; return (ok, message).""" + try: + async with asyncio.timeout(timeout): + redis = get_redis_client() + await redis.ping() + return True, "connected" + except Exception as e: + return False, str(e) or type(e).__name__ diff --git a/python-api/app/services/material_service.py b/python-api/app/services/material_service.py index 06dd4cf..12b7251 100644 --- a/python-api/app/services/material_service.py +++ b/python-api/app/services/material_service.py @@ -64,9 +64,9 @@ def match_material(scene: str, required_duration: float, exclude_urls: list[str] 根据场景描述和所需时长匹配空镜素材 策略: - 1. 收集所有满足时长要求(duration >= required_duration)的素材 - 2. 收集全局差值最近的 5 个素材 - 3. 合并去重后从候选池中随机选取,优先排除已使用的 + 1. 严格匹配分类(scene 必须完全匹配 keywords 中的关键词) + 2. 过滤掉时长小于 required_duration 的素材 + 3. 
从剩余素材中排除已使用的,随机选取 Args: scene: 分镜场景描述 @@ -78,38 +78,26 @@ def match_material(scene: str, required_duration: float, exclude_urls: list[str] """ exclude_urls = exclude_urls or [] for keyword, slug in _keywords.items(): - if keyword in scene: + if keyword == scene: all_materials = _materials.get(slug, []) if not all_materials: return None - # 1. 满足时长要求的素材 + # 1. 过滤掉时长小于 required_duration 的素材 matching = [m for m in all_materials if m["duration"] >= required_duration] + if not matching: + return None - # 2. 差值最近的 5 个素材(全局) - sorted_by_diff = sorted(all_materials, key=lambda m: abs(m["duration"] - required_duration)) - closest_5 = sorted_by_diff[:5] - - # 3. 合并候选池并去重(matching 在前,优先保留满足时长的) - candidate_pool = [] - seen = set() - for m in matching + closest_5: - if m["url"] not in seen: - candidate_pool.append(m) - seen.add(m["url"]) - - # 4. 排除已使用的,从中随机选 - unused = [m for m in candidate_pool if m["url"] not in exclude_urls] + # 2. 排除已使用的,从中随机选 + unused = [m for m in matching if m["url"] not in exclude_urls] if unused: return random.choice(unused) - # 5. 严格模式下不允许返回已排除的素材 + # 3. 严格模式下不允许返回已排除的素材 if strict: return None - # 6. 非严格模式:全部用完则允许重复 - if candidate_pool: - return random.choice(candidate_pool) + # 4. 
非严格模式:全部用完则允许重复 + return random.choice(matching) - return None return None diff --git a/python-api/config/materials.json b/python-api/config/materials.json index cae49b1..c4825ad 100644 --- a/python-api/config/materials.json +++ b/python-api/config/materials.json @@ -25,13 +25,16 @@ "灯槽灯带": "ceiling", "乳胶漆色卡": "paint", "墙面工艺": "paint", + "刮腻子": "paint", "艺术漆选样": "paint", "腻子打磨": "putty", "橱柜": "cabinet", "木工施工": "cabinet", "验收标准": "final", "网红开篇": "intro", - "合同": "contract" + "壁纸施工": "wallpaper", + "合同": "contract", + "签合同": "contract" }, "materials": { "ceiling": [ diff --git a/python-api/docker-compose.dev.yml b/python-api/docker-compose.dev.yml new file mode 100644 index 0000000..e6000bc --- /dev/null +++ b/python-api/docker-compose.dev.yml @@ -0,0 +1,105 @@ +# 美家卡智剪 - 开发服务器配置 +# 自包含:PostgreSQL + Redis + API + Scheduler +# usage: docker compose -f docker-compose.dev.yml up -d --build + +version: "3.8" + +services: + db: + image: postgres:15-alpine + container_name: meijiaka-dev-db + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: meijiaka_dev + volumes: + - postgres_dev_data:/var/lib/postgresql/data + ports: + - "127.0.0.1:5432:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - dev-network + + redis: + image: redis:7-alpine + container_name: meijiaka-dev-redis + volumes: + - redis_dev_data:/data + ports: + - "127.0.0.1:6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - dev-network + + api: + build: + context: . 
+ dockerfile: Dockerfile + container_name: meijiaka-dev-api + environment: + - ENV=development + - DEBUG=true + - DATABASE_URL=postgresql+asyncpg://postgres:postgres@db:5432/meijiaka_dev + - REDIS_HOST=redis + - REDIS_PORT=6379 + - REDIS_DB=0 + - SECRET_KEY=dev-secret-key-do-not-use-in-prod + - MINIMAX_API_KEY=${MINIMAX_API_KEY} + - MINIMAX_BASE_URL=${MINIMAX_BASE_URL:-https://api.minimaxi.com} + - VIDU_API_KEY=${VIDU_API_KEY} + - VIDU_BASE_URL=${VIDU_BASE_URL:-https://api.vidu.cn} + - LOG_LEVEL=DEBUG + volumes: + - .:/app + - ../data:/root/Documents/Meijiaka-zj + ports: + - "8080:8000" + command: gunicorn app.main:app -w 1 -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:8000 --reload + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + networks: + - dev-network + + scheduler: + build: + context: . + dockerfile: Dockerfile + container_name: meijiaka-dev-scheduler + environment: + - ENV=development + - DEBUG=true + - DATABASE_URL=postgresql+asyncpg://postgres:postgres@db:5432/meijiaka_dev + - REDIS_HOST=redis + - REDIS_PORT=6379 + - REDIS_DB=0 + - SECRET_KEY=dev-secret-key-do-not-use-in-prod + volumes: + - .:/app + - ../data:/root/Documents/Meijiaka-zj + command: python -m app.scheduler.main + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + networks: + - dev-network + +volumes: + postgres_dev_data: + redis_dev_data: + +networks: + dev-network: + driver: bridge diff --git a/scripts/video-replace-mvp.py b/scripts/video-replace-mvp.py new file mode 100755 index 0000000..df23e97 --- /dev/null +++ b/scripts/video-replace-mvp.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 +""" +视频片段替换 MVP +================ + +基于音频文字内容,用人物视频的对应片段替换空镜视频的对应片段。 + +前置依赖: + pip install openai-whisper + +用法示例: + python scripts/video-replace-mvp.py \ + --person person.mp4 \ + --broll broll.mp4 \ + --query "水电改造要注意" + +原理: + 1. Whisper 识别人物视频音频 → 输出带时间戳的文案 + 2. 文本匹配找到目标时间段 [start, end] + 3. 
FFmpeg overlay 滤镜:在 [start, end] 区间用人物画面覆盖空镜画面 +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import subprocess +import sys +from difflib import SequenceMatcher +from pathlib import Path + + +def check_dep(name: str) -> str | None: + """检查系统命令是否存在""" + path = shutil.which(name) + return path + + +def ensure_whisper(): + """确保 whisper 可用""" + try: + import whisper # noqa: F401 + return True + except ImportError: + print("❌ 未安装 openai-whisper") + print(" 安装命令:pip install openai-whisper") + print(" (首次会自动下载模型,base 模型约 150MB)") + return False + + +def run_whisper(video_path: str, model: str = "base") -> list[dict]: + """Whisper 识别,返回 segment 列表(含 start/end/text)""" + import whisper + + print(f" 加载模型:{model}") + model_obj = whisper.load_model(model) + + print(f" 识别中...(模型:{model},视频:{Path(video_path).name})") + result = model_obj.transcribe( + video_path, + language="zh", + word_timestamps=False, # segment 级别够用了 + fp16=False, # CPU 友好 + ) + return result["segments"] + + +def find_time_range( + segments: list[dict], + query: str, + threshold: float = 0.6, +) -> tuple[float, float, str] | None: + """ + 根据查询文字匹配时间段 + + 匹配策略(优先级递减): + 1. 精确子串匹配 + 2. 模糊匹配(最长公共子序列相似度 ≥ threshold) + """ + query = query.strip() + + # 1. 精确子串匹配 + for seg in segments: + text = seg["text"].strip() + if query in text: + return seg["start"], seg["end"], text + + # 2. 
Fuzzy match + best = None + best_score = 0.0 + for seg in segments: + text = seg["text"].strip() + score = SequenceMatcher(None, query, text).ratio() + if score > best_score and score >= threshold: + best_score = score + best = seg + + if best: + return best["start"], best["end"], best["text"].strip() + + return None + + +def get_video_info(video_path: str) -> dict: + """Probe the first video stream with ffprobe and return its width, height, fps and duration.""" + cmd = [ + "ffprobe", "-v", "error", + "-select_streams", "v:0", + "-show_entries", "stream=width,height,r_frame_rate,duration", + "-show_entries", "format=duration", + "-of", "json", + video_path, + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + data = json.loads(result.stdout) + + stream = (data.get("streams") or [{}])[0] + fmt = data.get("format", {}) + + # Parse the frame rate (e.g. "25/1" → 25.0); a zero denominator yields 0.0 instead of raising + fps_str = stream.get("r_frame_rate", "25/1") + if "/" in fps_str: + num, den = fps_str.split("/") + fps = float(num) / float(den) if float(den) else 0.0 + else: + fps = float(fps_str) + + return { + "width": stream.get("width", 1920), + "height": stream.get("height", 1080), + "fps": fps, + "duration": float(fmt.get("duration", stream.get("duration", 0))), + } + + +def replace_with_overlay( + person_video: str, + broll_video: str, + start: float, + end: float, + output: str, + crf: int = 18, +): + """ + Replace a segment using the FFmpeg overlay filter + + Logic: + - input 0 (broll): base picture + audio + - input 1 (person): source of the clipped picture segment + - [1:v] trim to [start, end] → setpts re-based so the clip begins at `start` on the output timeline → scale to the broll resolution + - [0:v][clip] overlay → show clip while between(t,start,end) + - output: picture = replaced video, audio = original broll audio + """ + duration = end - start + broll_info = get_video_info(broll_video) + w, h = broll_info["width"], broll_info["height"] + + print(f" 空镜分辨率:{w}x{h}, 帧率:{broll_info['fps']:.2f}fps") + print(f" 截取人物片段:{start:.3f}s ~ {end:.3f}s({duration:.3f}s)") + print(f" 正在渲染...(CRF={crf})") + + # overlay filter + # Note: commas inside between(t,start,end) must be escaped; the clip PTS is offset by `start` so its frames line up with the enable window + filter_graph = ( + f"[1:v]trim=start={start}:end={end}," + f"setpts=PTS-STARTPTS+{start}/TB," + f"scale={w}:{h}:force_original_aspect_ratio=decrease," 
+ f"pad={w}:{h}:(ow-iw)/2:(oh-ih)/2:black[clip];" + f"[0:v][clip]overlay=" + f"enable='between(t\\,{start}\\,{end})':" + f"x=(W-w)/2:y=(H-h)/2[v]" + ) + + cmd = [ + "ffmpeg", "-y", + "-i", broll_video, + "-i", person_video, + "-filter_complex", filter_graph, + "-map", "[v]", + "-map", "0:a", + "-c:v", "libx264", "-crf", str(crf), "-preset", "fast", + "-c:a", "copy", + "-movflags", "+faststart", + output, + ] + + subprocess.run(cmd, check=True, capture_output=True) + print(f"✅ 输出完成:{output}") + + +def save_srt(segments: list[dict], path: str): + """保存 SRT 字幕供人工校对""" + def fmt(s: float) -> str: + h = int(s // 3600) + m = int((s % 3600) // 60) + sec = int(s % 60) + ms = int((s % 1) * 1000) + return f"{h:02d}:{m:02d}:{sec:02d},{ms:03d}" + + with open(path, "w", encoding="utf-8") as f: + for i, seg in enumerate(segments, 1): + f.write(f"{i}\n{fmt(seg['start'])} --> {fmt(seg['end'])}\n{seg['text'].strip()}\n\n") + + +def main(): + parser = argparse.ArgumentParser(description="基于音频文字的视频片段替换 MVP") + parser.add_argument("--person", required=True, help="人物出镜视频路径(提供画面)") + parser.add_argument("--broll", required=True, help="空镜视频路径(提供底图+音频)") + parser.add_argument("--query", required=True, help="要替换的文案(如:水电改造要注意)") + parser.add_argument("--output", default="output_replaced.mp4", help="输出文件路径") + parser.add_argument("--model", default="base", choices=["tiny", "base", "small"], + help="Whisper 模型,tiny 最快,small 最准") + parser.add_argument("--crf", type=int, default=18, help="视频质量(0=无损,23=默认,越大越小)") + parser.add_argument("--threshold", type=float, default=0.6, + help="模糊匹配阈值(0~1),低于此值视为未匹配") + args = parser.parse_args() + + # 0. 依赖检查 + if not check_dep("ffmpeg"): + print("❌ 未找到 ffmpeg") + sys.exit(1) + + if not check_dep("ffprobe"): + print("❌ 未找到 ffprobe") + sys.exit(1) + + if not ensure_whisper(): + sys.exit(1) + + for p in (args.person, args.broll): + if not Path(p).exists(): + print(f"❌ 文件不存在:{p}") + sys.exit(1) + + # 1. 
ASR 识别人物视频 + print(f"\n🎙️ Step 1/3:识别人物视频音频") + segments = run_whisper(args.person, args.model) + print(f" 识别到 {len(segments)} 句话") + + # 保存字幕供参考 + srt_path = str(Path(args.output).with_suffix(".srt")) + save_srt(segments, srt_path) + print(f"📝 字幕已保存:{srt_path}") + + # 2. 文本匹配 + print(f"\n🔍 Step 2/3:查找文案「{args.query}」") + result = find_time_range(segments, args.query, threshold=args.threshold) + if not result: + print(f"❌ 未找到匹配文案(阈值 {args.threshold})") + print(f" 建议:查看 {srt_path} 里的实际文案,调整 --query 内容") + sys.exit(1) + + start, end, matched_text = result + print(f" 匹配文案:「{matched_text}」") + print(f" 时间段: {start:.3f}s ~ {end:.3f}s(时长 {end - start:.3f}s)") + + # 3. FFmpeg 替换 + print(f"\n🎬 Step 3/3:替换片段") + replace_with_overlay( + args.person, + args.broll, + start, + end, + args.output, + crf=args.crf, + ) + + print(f"\n🎉 全部完成!") + print(f" 输出文件:{args.output}") + print(f" 字幕参考:{srt_path}") + + +if __name__ == "__main__": + main() diff --git a/tauri-app/src-tauri/src/commands/video_compose.rs b/tauri-app/src-tauri/src/commands/video_compose.rs index c1be34a..b07b1ac 100644 --- a/tauri-app/src-tauri/src/commands/video_compose.rs +++ b/tauri-app/src-tauri/src/commands/video_compose.rs @@ -39,13 +39,16 @@ pub struct ComposeVideoResult { pub duration: f64, } -/// 上传视频响应 +/// 通用上传响应 #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] -pub struct UploadVideoResult { +pub struct UploadResult { pub url: String, } +/// 兼容旧命名 +pub type UploadVideoResult = UploadResult; + /// 获取项目视频目录 fn get_project_video_dir(project_id: &str) -> Result { let docs_dir = dirs::document_dir().ok_or("无法获取文档目录")?; @@ -166,6 +169,53 @@ pub async fn compose_video( } } +/// 截取视频片段请求参数 +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ExtractVideoSegmentArgs { + pub input_path: String, + pub start: f64, + pub duration: f64, + pub output_path: String, +} + +/// 截取视频片段(FFmpeg clip_video 封装) +#[tauri::command] +pub async fn extract_video_segment( + app: AppHandle, + 
args: ExtractVideoSegmentArgs, +) -> ApiResponse { + let safe_output = match sanitize_output_path(&args.output_path) { + Ok(p) => p, + Err(e) => return ApiResponse { code: 500, message: e, data: None }, + }; + + let safe_input = if args.input_path.starts_with("http://") || args.input_path.starts_with("https://") { + args.input_path.clone() + } else if std::path::Path::new(&args.input_path).exists() { + args.input_path.clone() + } else { + return ApiResponse { + code: 500, + message: format!("输入文件不存在: {}", args.input_path), + data: None, + }; + }; + + match ffmpeg_cmd::clip_video(&app, &safe_input, args.start, args.duration, &safe_output).await { + Ok(_) => ApiResponse { + code: 200, + message: "视频片段截取成功".to_string(), + data: Some(safe_output), + }, + Err(e) => ApiResponse { + code: 500, + message: format!("截取视频片段失败: {}", e), + data: None, + }, + } +} + /// 上传视频请求参数 #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] @@ -173,47 +223,45 @@ pub struct UploadVideoArgs { pub local_path: String, } -/// 上传本地视频到后端,后端上传到七牛云并返回 URL -#[tauri::command] -pub async fn upload_video_file( - local_path: String, -) -> ApiResponse { - // 读取本地文件 - let file_bytes = match std::fs::read(&local_path) { +/// 通用文件上传(本地 → 后端 → 七牛云) +pub async fn upload_file_to_backend( + local_path: &str, + endpoint: &str, + default_filename: &str, + mime_type: &str, + read_error_prefix: &str, +) -> ApiResponse { + let file_bytes = match std::fs::read(local_path) { Ok(bytes) => bytes, Err(e) => { return ApiResponse { code: 500, - message: format!("读取视频文件失败: {}", e), + message: format!("{}: {}", read_error_prefix, e), data: None, }; } }; - // 获取文件名 - let filename = std::path::Path::new(&local_path) + let filename = std::path::Path::new(local_path) .file_name() .and_then(|n| n.to_str()) - .unwrap_or("video.mp4") + .unwrap_or(default_filename) .to_string(); - // 构建 multipart 请求 let backend_url = crate::PYTHON_API_BASE_URL; - let upload_url = format!("{}/upload/video", backend_url); + let upload_url 
= format!("{}{}", backend_url, endpoint); let client = reqwest::Client::new(); - // 构建 multipart form let form = reqwest::multipart::Form::new() .part( "file", reqwest::multipart::Part::bytes(file_bytes) .file_name(filename) - .mime_str("video/mp4") + .mime_str(mime_type) .unwrap_or_else(|_| reqwest::multipart::Part::bytes(vec![])), ); - // 发送请求 let response = match client.post(&upload_url).multipart(form).send().await { Ok(resp) => resp, Err(e) => { @@ -235,7 +283,6 @@ pub async fn upload_video_file( }; } - // 解析响应 let result: serde_json::Value = match response.json().await { Ok(data) => data, Err(e) => { @@ -247,7 +294,6 @@ pub async fn upload_video_file( } }; - // 提取 URL let url = result .get("data") .and_then(|d| d.get("url")) @@ -258,7 +304,7 @@ pub async fn upload_video_file( Some(url) => ApiResponse { code: 200, message: "上传成功".to_string(), - data: Some(UploadVideoResult { url }), + data: Some(UploadResult { url }), }, None => ApiResponse { code: 500, @@ -268,30 +314,25 @@ pub async fn upload_video_file( } } +/// 上传本地视频到后端,后端上传到七牛云并返回 URL +#[tauri::command] +pub async fn upload_video_file( + local_path: String, +) -> ApiResponse { + upload_file_to_backend( + &local_path, + "/upload/video", + "video.mp4", + "video/mp4", + "读取视频文件失败", + ).await +} + /// 上传本地图片到后端,后端上传到七牛云并返回 URL #[tauri::command] pub async fn upload_image_file( local_path: String, ) -> ApiResponse { - // 读取本地文件 - let file_bytes = match std::fs::read(&local_path) { - Ok(bytes) => bytes, - Err(e) => { - return ApiResponse { - code: 500, - message: format!("读取图片文件失败: {}", e), - data: None, - }; - } - }; - - // 获取文件名和扩展名,推断 mime type - let filename = std::path::Path::new(&local_path) - .file_name() - .and_then(|n| n.to_str()) - .unwrap_or("image.jpg") - .to_string(); - let ext = std::path::Path::new(&local_path) .extension() .and_then(|e| e.to_str()) @@ -306,75 +347,13 @@ pub async fn upload_image_file( _ => "image/jpeg", }; - // 构建 multipart 请求 - let backend_url = crate::PYTHON_API_BASE_URL; - 
let upload_url = format!("{}/upload/image", backend_url); - - let client = reqwest::Client::new(); - - // 构建 multipart form - let form = reqwest::multipart::Form::new() - .part( - "file", - reqwest::multipart::Part::bytes(file_bytes) - .file_name(filename) - .mime_str(mime_type) - .unwrap_or_else(|_| reqwest::multipart::Part::bytes(vec![])), - ); - - // 发送请求 - let response = match client.post(&upload_url).multipart(form).send().await { - Ok(resp) => resp, - Err(e) => { - return ApiResponse { - code: 500, - message: format!("上传请求失败: {}", e), - data: None, - }; - } - }; - - if !response.status().is_success() { - let status = response.status(); - let error_text = response.text().await.unwrap_or_default(); - return ApiResponse { - code: status.as_u16() as i32, - message: format!("上传失败: {} - {}", status, error_text), - data: None, - }; - } - - // 解析响应 - let result: serde_json::Value = match response.json().await { - Ok(data) => data, - Err(e) => { - return ApiResponse { - code: 500, - message: format!("解析上传响应失败: {}", e), - data: None, - }; - } - }; - - // 提取 URL - let url = result - .get("data") - .and_then(|d| d.get("url")) - .and_then(|u| u.as_str()) - .map(|s| s.to_string()); - - match url { - Some(url) => ApiResponse { - code: 200, - message: "上传成功".to_string(), - data: Some(UploadVideoResult { url }), - }, - None => ApiResponse { - code: 500, - message: "上传响应中未找到 URL".to_string(), - data: None, - }, - } + upload_file_to_backend( + &local_path, + "/upload/image", + "image.jpg", + mime_type, + "读取图片文件失败", + ).await } /// 下载文件请求参数 diff --git a/tauri-app/src-tauri/src/commands/voice.rs b/tauri-app/src-tauri/src/commands/voice.rs index a7a5200..77faae5 100644 --- a/tauri-app/src-tauri/src/commands/voice.rs +++ b/tauri-app/src-tauri/src/commands/voice.rs @@ -191,3 +191,66 @@ pub async fn get_project_audios_dir( }, } } + +// --------------------- 音频截取与上传 --------------------- + +#[derive(serde::Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct 
ExtractAudioSegmentArgs { + pub input_path: String, + pub start: f64, + pub duration: f64, + pub output_path: String, +} + +/// 截取音频片段(FFmpeg) +#[tauri::command] +pub async fn extract_audio_segment( + app: tauri::AppHandle, + args: ExtractAudioSegmentArgs, +) -> ApiResponse { + match crate::ffmpeg_cmd::extract_audio_segment( + &app, + &args.input_path, + args.start, + args.duration, + &args.output_path, + ).await { + Ok(_) => ApiResponse { + code: 200, + message: "Audio segment extracted successfully".to_string(), + data: Some(args.output_path), + }, + Err(e) => ApiResponse { + code: 500, + message: format!("Failed to extract audio segment: {}", e), + data: None, + }, + } +} + +/// 上传本地音频文件到后端,后端上传到七牛云并返回 URL +#[tauri::command] +pub async fn upload_audio_file( + local_path: String, +) -> ApiResponse { + // 验证路径安全 + let safe_path = match crate::ffmpeg_cmd::sanitize_output_path(&local_path) { + Ok(p) => p, + Err(e) => { + return ApiResponse { + code: 500, + message: format!("路径验证失败: {}", e), + data: None, + }; + } + }; + + crate::commands::video_compose::upload_file_to_backend( + &safe_path, + "/upload/audio", + "audio.mp3", + "audio/mpeg", + "读取音频文件失败", + ).await +} diff --git a/tauri-app/src-tauri/src/ffmpeg_cmd.rs b/tauri-app/src-tauri/src/ffmpeg_cmd.rs index 32f9abe..cc81df4 100644 --- a/tauri-app/src-tauri/src/ffmpeg_cmd.rs +++ b/tauri-app/src-tauri/src/ffmpeg_cmd.rs @@ -129,7 +129,7 @@ pub async fn run_ffmpeg(app: &AppHandle, args: Vec) -> Result Result<(), String> { // 验证路径安全 @@ -138,14 +138,14 @@ pub async fn standardize_video(app: &AppHandle, input_path: &str, output_path: & let args = vec![ "-i".to_string(), safe_input, - "-vf".to_string(), "fps=30,scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2,format=yuv420p".to_string(), + "-vf".to_string(), "fps=25,scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2,format=yuv420p".to_string(), "-c:v".to_string(), "libx264".to_string(), 
"-c:a".to_string(), "aac".to_string(), "-ar".to_string(), "44100".to_string(), "-ac".to_string(), "2".to_string(), "-preset".to_string(), "veryfast".to_string(), "-crf".to_string(), "23".to_string(), - "-r".to_string(), "30".to_string(), + "-r".to_string(), "25".to_string(), "-y".to_string(), safe_output ]; @@ -233,7 +233,6 @@ pub async fn add_audio_to_video(app: &AppHandle, video_path: &str, audio_path: & "-ar".to_string(), "44100".to_string(), // 统一采样率 "-map".to_string(), "0:v:0".to_string(), "-map".to_string(), "1:a:0".to_string(), - "-shortest".to_string(), "-y".to_string(), safe_output ]; @@ -241,7 +240,7 @@ pub async fn add_audio_to_video(app: &AppHandle, video_path: &str, audio_path: & } /** - * 将封面图转换为一段短视频 (0.5s, 1080x1920, 30fps) + * 将封面图转换为一段短视频 (0.5s, 1080x1920, 25fps) * 带静音音频轨道,避免 concat 时丢失后续片段音频 */ pub async fn create_cover_video(app: &AppHandle, input_path: &str, output_path: &str, duration: &str) -> Result<(), String> { @@ -259,7 +258,7 @@ pub async fn create_cover_video(app: &AppHandle, input_path: &str, output_path: "-t".to_string(), duration.to_string(), "-pix_fmt".to_string(), "yuv420p".to_string(), "-vf".to_string(), "scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2,setsar=1".to_string(), - "-r".to_string(), "30".to_string(), + "-r".to_string(), "25".to_string(), "-shortest".to_string(), "-y".to_string(), safe_output @@ -487,7 +486,6 @@ pub async fn replace_audio_track( // 只保留第一个视频流和第一个音频流 "-map".to_string(), "0:v:0".to_string(), "-map".to_string(), "1:a:0".to_string(), - "-shortest".to_string(), "-y".to_string(), safe_output, ]; @@ -546,7 +544,7 @@ pub async fn mix_audio_tracks( /** * 裁剪视频片段(支持本地文件和 HTTP URL) * - * 从起始时间裁剪指定时长,同时标准化输出格式(1080x1920, 30fps, libx264, aac)。 + * 从起始时间裁剪指定时长,同时标准化输出格式(1080x1920, 25fps, libx264, aac)。 * 适用于从人物形象素材或空镜素材中提取指定时长的片段。 */ pub async fn clip_video( @@ -575,14 +573,14 @@ pub async fn clip_video( "-ss".to_string(), start_str, "-t".to_string(), duration_str, 
"-i".to_string(), safe_input, - "-vf".to_string(), "fps=30,scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2,format=yuv420p".to_string(), + "-vf".to_string(), "fps=25,scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2,format=yuv420p".to_string(), "-c:v".to_string(), "libx264".to_string(), "-preset".to_string(), "veryfast".to_string(), "-crf".to_string(), "23".to_string(), "-c:a".to_string(), "aac".to_string(), "-ar".to_string(), "44100".to_string(), "-ac".to_string(), "2".to_string(), - "-r".to_string(), "30".to_string(), + "-r".to_string(), "25".to_string(), "-pix_fmt".to_string(), "yuv420p".to_string(), "-avoid_negative_ts".to_string(), "make_zero".to_string(), "-y".to_string(), @@ -592,6 +590,39 @@ pub async fn clip_video( run_ffmpeg(app, args).await.map(|_| ()) } +/** + * 截取音频片段 + * + * 从指定起始时间截取指定时长的音频,输出为 MP3 格式。 + */ +pub async fn extract_audio_segment( + app: &AppHandle, + input_path: &str, + start: f64, + duration: f64, + output_path: &str, +) -> Result<(), String> { + let safe_input = validate_safe_path(input_path)?; + let safe_output = sanitize_output_path(output_path)?; + + let start_str = format!("{:.3}", start); + let duration_str = format!("{:.3}", duration); + + let args = vec![ + "-ss".to_string(), start_str, + "-t".to_string(), duration_str, + "-i".to_string(), safe_input, + "-c:a".to_string(), "libmp3lame".to_string(), + "-b:a".to_string(), "192k".to_string(), + "-ar".to_string(), "44100".to_string(), + "-ac".to_string(), "2".to_string(), + "-vn".to_string(), // 无视频 + "-y".to_string(), + safe_output, + ]; + run_ffmpeg(app, args).await.map(|_| ()) +} + /** * 转码音频为标准格式 (MP3 44.1kHz stereo 192kbps) */ diff --git a/tauri-app/src-tauri/src/lib.rs b/tauri-app/src-tauri/src/lib.rs index 7810313..f8af197 100644 --- a/tauri-app/src-tauri/src/lib.rs +++ b/tauri-app/src-tauri/src/lib.rs @@ -118,12 +118,15 @@ pub fn run() { commands::voice::list_project_audios, 
commands::voice::delete_audio, commands::voice::get_project_audios_dir, + commands::voice::extract_audio_segment, + commands::voice::upload_audio_file, // 音色素材库 commands::voice::load_voice_materials, commands::voice::save_voice_material, commands::voice::delete_voice_material_cmd, // 视频合成(Phase 2) commands::video_compose::compose_video, + commands::video_compose::extract_video_segment, commands::video_compose::upload_video_file, commands::video_compose::download_file, // 音频处理 diff --git a/tauri-app/src/api/modules/caption.ts b/tauri-app/src/api/modules/caption.ts new file mode 100644 index 0000000..1090ec2 --- /dev/null +++ b/tauri-app/src/api/modules/caption.ts @@ -0,0 +1,60 @@ +/** + * Caption 字幕 API 模块 + * ===================== + * + * 直接调用后端字幕相关 API(不走 Async Engine)。 + */ + +import { client } from '../client'; + +export interface CaptionUtterance { + text: string; + startTime: number; // 毫秒(client.ts 自动将后端 snake_case 转为 camelCase) + endTime: number; // 毫秒 +} + +export interface AutoAlignResult { + code: number; + message: string; + duration: number; // 秒 + utterances: CaptionUtterance[]; +} + +/** + * 自动字幕打轴(完整流程,同步阻塞) + * + * 为已有音频文本配上时间轴。后端内部轮询,最多等待 120 秒。 + * + * @param audioUrl 音频/视频文件 URL(七牛云) + * @param audioText 要打轴的完整字幕文本 + * @returns 打轴结果,含每句话的时间轴 + */ +export async function autoAlignCaption( + audioUrl: string, + audioText: string +): Promise { + // client.post 已自动提取 ApiResponse.data 并做 snakeToCamel 转换 + const result = await client.post<{ + code: number; + message: string; + duration: number; + utterances: CaptionUtterance[]; + }>('/caption/ata/align', { + audioUrl, + audioText, + captionType: 'speech', + staPuncMode: 3, + }); + + // result.code 是火山引擎打轴结果的状态码(0=成功, 2000=处理中) + if (result.code !== 0) { + throw new Error(result.message || '打轴失败'); + } + + return { + code: result.code, + message: result.message, + duration: result.duration, + utterances: result.utterances || [], + }; +} diff --git a/tauri-app/src/api/modules/localStorage.ts 
b/tauri-app/src/api/modules/localStorage.ts index 2fbf66e..a6dd7aa 100644 --- a/tauri-app/src/api/modules/localStorage.ts +++ b/tauri-app/src/api/modules/localStorage.ts @@ -67,6 +67,9 @@ export interface ProjectMeta { dubbingAudioUrl?: string; // 生成后的配音音频七牛云URL dubbingAudioPath?: string; // 生成后的配音音频本地路径 dubbingVoiceId?: string; // 生成配音使用的音色ID + voiceSpeed?: number; // 配音语速 + voiceVolume?: number; // 配音音量 + voicePitch?: number; // 配音音调 subtitleAlignment?: AlignmentResult; // 全局字幕打轴结果(单视频模式) burnedVideoPath?: string; // 压制字幕后的成品视频路径 coverConfig?: { @@ -120,6 +123,13 @@ export interface ProjectSegment { alignmentResult?: AlignmentResult; // 字幕打轴结果 burnedVideoPath?: string; // 压制字幕后的视频路径 burnedAt?: number; // 压制字幕的时间戳 + audioStartTime?: number; // 在完整配音音频中的开始时间(毫秒) + audioEndTime?: number; // 在完整配音音频中的结束时间(毫秒) + actualDuration?: number; // 实际时长(秒,基于字幕打轴) + clipAudioPath?: string; // 截取后的音频片段本地路径 + clipAudioUrl?: string; // 截取后的音频片段七牛云 URL + lipSyncTaskId?: string; // Vidu 对口型任务 ID + lipSyncState?: string; // Vidu 对口型任务状态 } /** @@ -146,6 +156,7 @@ export const localProjectApi = { selectedElementId: meta.selectedElementId, selectedVoiceId: meta.selectedVoiceId, composedVideoUrl: meta.composedVideoUrl, + composedVideoPath: meta.composedVideoPath, lipSyncTaskId: meta.lipSyncTaskId, lipSyncState: meta.lipSyncState, lipSyncedVideoPath: meta.lipSyncedVideoPath, @@ -153,6 +164,9 @@ export const localProjectApi = { dubbingAudioUrl: meta.dubbingAudioUrl, dubbingAudioPath: meta.dubbingAudioPath, dubbingVoiceId: meta.dubbingVoiceId, + voiceSpeed: meta.voiceSpeed, + voiceVolume: meta.voiceVolume, + voicePitch: meta.voicePitch, avatarMaterialPath: meta.avatarMaterialPath, avatarMaterialName: meta.avatarMaterialName, avatarMaterialDuration: meta.avatarMaterialDuration, @@ -199,6 +213,11 @@ export const localProjectApi = { alignmentResult: s.alignmentResult, burnedVideoPath: s.burnedVideoPath, burnedAt: s.burnedAt, + audioStartTime: s.audioStartTime, + audioEndTime: s.audioEndTime, + 
actualDuration: s.actualDuration, + clipAudioPath: s.clipAudioPath, + clipAudioUrl: s.clipAudioUrl, })); const jsonContent = JSON.stringify(orderedSegments, null, 2); const res = await safeInvoke>('save_project_segments_raw', { diff --git a/tauri-app/src/api/modules/videoCompose.ts b/tauri-app/src/api/modules/videoCompose.ts index 09181c5..e2d98d4 100644 --- a/tauri-app/src/api/modules/videoCompose.ts +++ b/tauri-app/src/api/modules/videoCompose.ts @@ -84,6 +84,22 @@ export async function uploadImageFile(localPath: string): Promise { return res.data!.url; } +/** + * 上传本地音频文件到后端(后端上传到七牛云) + * + * @param localPath 本地音频文件路径 + * @returns 七牛云 URL + */ +export async function uploadAudioFile(localPath: string): Promise { + const res = await invoke>('upload_audio_file', { + localPath, + }); + if (res.code !== 200) { + throw new Error(res.message); + } + return res.data!.url; +} + /** * 从 URL 下载文件到本地 * diff --git a/tauri-app/src/api/modules/voice.ts b/tauri-app/src/api/modules/voice.ts index e3465e4..9beadf1 100644 --- a/tauri-app/src/api/modules/voice.ts +++ b/tauri-app/src/api/modules/voice.ts @@ -319,3 +319,26 @@ export async function standardizeAudio(args: StandardizeAudioRequest): Promise { + const result = await invoke<{ code: number; data?: string; message: string }>('extract_audio_segment', { + args: { + inputPath: args.inputPath, + start: args.start, + duration: args.duration, + outputPath: args.outputPath, + }, + }); + if (result.code !== 200 || !result.data) { + throw new Error(result.message || '截取音频片段失败'); + } + return result.data; +} diff --git a/tauri-app/src/api/types.ts b/tauri-app/src/api/types.ts index 25c8914..b430ab5 100644 --- a/tauri-app/src/api/types.ts +++ b/tauri-app/src/api/types.ts @@ -59,4 +59,11 @@ export interface ScriptShot { burnedAt?: number; // 压制字幕的时间戳 audioPath?: string; // 本地配音音频文件路径 audioUrl?: string; // 七牛云配音音频 URL + audioStartTime?: number; // 在完整配音音频中的开始时间(毫秒) + audioEndTime?: number; // 在完整配音音频中的结束时间(毫秒) + actualDuration?: number; 
// 实际时长(秒,基于字幕打轴) + clipAudioPath?: string; // 截取后的音频片段本地路径 + clipAudioUrl?: string; // 截取后的音频片段七牛云 URL + lipSyncTaskId?: string; // Vidu 对口型任务 ID + lipSyncState?: string; // Vidu 对口型任务状态 } diff --git a/tauri-app/src/pages/VideoCreation/SubtitleBurning.tsx b/tauri-app/src/pages/VideoCreation/SubtitleBurning.tsx index 08b63ab..2b0a678 100644 --- a/tauri-app/src/pages/VideoCreation/SubtitleBurning.tsx +++ b/tauri-app/src/pages/VideoCreation/SubtitleBurning.tsx @@ -6,18 +6,18 @@ * 布局:左侧操作区 + 右侧预览区(使用 step-layout 标准布局) */ -import { useState, useEffect, useRef, useMemo } from 'react'; +import { useState, useRef, useMemo } from 'react'; import { invoke } from '@tauri-apps/api/core'; import { homeDir } from '@tauri-apps/api/path'; import { useProjectStore, saveMetaToLocalFile } from '../../store'; import { getCurrentProjectId } from '../../api/modules/localStorage'; -import { useTask } from '../../hooks/useTask'; + import { useLocalVideo } from '../../hooks/useLocalVideo'; import { useAssJsRenderer } from '../../hooks/useAssJsRenderer'; import { generateAssFromAlignment, saveAssFile, htmlColorToAss, applyAssJsCompensation } from '../../utils/assGenerator'; import { useProgressStore } from '../../store/progressStore'; import { toast } from '../../store/uiStore'; -import type { AlignmentResult } from '../../api/types'; + import './SubtitleBurning.css'; // 解析 Docker 容器内路径 (/root/Documents/...) 
转换为本地用户路径 @@ -46,27 +46,26 @@ const SUBTITLE_PRESETS = [ ]; export default function SubtitleBurning() { - const segments = useProjectStore(state => state.segments); + const projectId = getCurrentProjectId(); - // 成品视频 - const lipSyncedVideoUrl = useProjectStore(state => state.lipSyncedVideoUrl); - const lipSyncedVideoPath = useProjectStore(state => state.lipSyncedVideoPath); + // 成品视频(临时:只用拼接视频,对口型替换验证通过后再启用) + const composedVideoUrl = useProjectStore(state => state.composedVideoUrl); + const composedVideoPath = useProjectStore(state => state.composedVideoPath); - // 打轴状态 - const storeAlignment = useProjectStore(state => state.subtitleAlignment); - const [alignment, setAlignment] = useState(storeAlignment); - const [isAligning, setIsAligning] = useState(false); + const actualVideoUrl = composedVideoUrl; + const actualVideoPath = composedVideoPath; + + // 打轴结果直接从 Step 2 复用(VoiceDubbing 已保存到 meta) + const alignment = useProjectStore(state => state.subtitleAlignment); const [isBurning, setIsBurning] = useState(false); - const { submit } = useTask(); - // 视频播放相关 const videoRef = useRef(null); const containerRef = useRef(null); - // 预览用七牛云 URL(加载快) - const { videoUrl: loadedVideoUrl } = useLocalVideo(lipSyncedVideoUrl); + // 预览用:优先 URL,否则回退到本地路径(useLocalVideo 支持本地路径读取) + const { videoUrl: loadedVideoUrl } = useLocalVideo(actualVideoUrl || actualVideoPath); // 字幕样式(默认值基于 1080x1920 视频) const [subtitleStyle, setSubtitleStyle] = useState({ @@ -125,13 +124,6 @@ export default function SubtitleBurning() { enabled: subtitleEnabled, }); - // 从 store 恢复打轴结果(页面刷新后) - useEffect(() => { - if (storeAlignment) { - setAlignment(storeAlignment); - } - }, [storeAlignment]); - // 应用预设样式 const applyPreset = (presetId: string) => { const preset = SUBTITLE_PRESETS.find(p => p.id === presetId); @@ -146,84 +138,13 @@ export default function SubtitleBurning() { }); }; - // 字幕打轴:对成品视频统一打轴 - const handleAlign = async () => { - if (!projectId) { - toast.error('项目ID不存在'); - return; - } - if 
(!lipSyncedVideoUrl) { - toast.error('请先完成视频生成'); - return; - } - - // 拼接所有分镜文案 - const audioText = segments.map(s => s.voiceover).filter(Boolean).join('\n'); - if (!audioText) { - toast.error('没有配音文案'); - return; - } - - setIsAligning(true); - useProgressStore.getState().show('字幕打轴'); - - const taskId = await submit( - 'subtitle', - { - videoPath: lipSyncedVideoUrl, - audioText, - mode: 'auto_align', - language: 'zh', - }, - { - showProgress: true, - callbacks: { - onComplete: (result: unknown) => { - const r = result as { - utterances?: Array<{ text: string; startTime: number; endTime: number }>; - duration?: number; - } | undefined; - - const newAlignment: AlignmentResult = { - status: 'completed', - utterances: r?.utterances?.map(u => ({ - text: u.text, - start_time: u.startTime, - end_time: u.endTime, - })), - duration: r?.duration, - }; - setAlignment(newAlignment); - useProjectStore.setState({ subtitleAlignment: newAlignment }); - saveMetaToLocalFile({ subtitleAlignment: newAlignment }); - setIsAligning(false); - }, - onError: (error: string) => { - const newAlignment: AlignmentResult = { - status: 'failed', - errorMessage: error, - }; - setAlignment(newAlignment); - useProjectStore.setState({ subtitleAlignment: newAlignment }); - saveMetaToLocalFile({ subtitleAlignment: newAlignment }); - setIsAligning(false); - }, - }, - } - ); - - if (!taskId) { - setIsAligning(false); - } - }; - // 压制字幕:单次压制全局字幕到成品视频 const handleBurn = async () => { if (!projectId) { toast.error('项目ID不存在'); return; } - if (!lipSyncedVideoPath) { + if (!actualVideoPath) { toast.error('成品视频不存在'); return; } @@ -265,7 +186,7 @@ export default function SubtitleBurning() { const outputPath = outputRes.data; // 4. 解析视频路径 - const resolvedVideoPath = await resolveHostPath(lipSyncedVideoPath); + const resolvedVideoPath = await resolveHostPath(actualVideoPath); // 5. 
调用 Rust 压制字幕 const burnResult = await invoke<{ code: number; data?: string; message: string }>('burn_subtitle', { @@ -293,48 +214,10 @@ export default function SubtitleBurning() { } }; - // 打轴状态文本 - const alignmentStatusText = (() => { - if (!alignment) return '未打轴'; - switch (alignment.status) { - case 'pending': return '待打轴'; - case 'aligning': return '打轴中...'; - case 'completed': return '✓ 已打轴'; - case 'failed': return '✗ 打轴失败'; - default: return '未知'; - } - })(); - return (
{/* 左侧操作区 */}
- {/* 打轴区 */} -
-
- -
- -
-
-
- - {alignmentStatusText} - - {alignment?.status === 'failed' && alignment.errorMessage && ( - - {alignment.errorMessage} - - )} -
-
- {/* 字幕样式设置 */}
@@ -386,7 +269,7 @@ export default function SubtitleBurning() { @@ -409,14 +292,15 @@ export default function SubtitleBurning() { ) : ( <>