From abf005f2252805aa40bb917c1a0a2cfa5cb54812 Mon Sep 17 00:00:00 2001 From: Kevin Wong Date: Sat, 28 Feb 2026 17:49:32 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Docs/COSYVOICE3_DEPLOY.md | 12 +++ Docs/DevLogs/Day29.md | 81 +++++++++++++++++- Docs/FRONTEND_README.md | 1 + Docs/task_complete.md | 19 ++++- README.md | 2 +- .../app/modules/generated_audios/schemas.py | 1 + .../app/modules/generated_audios/service.py | 1 + backend/app/services/voice_clone_service.py | 18 ++-- .../features/home/model/useGeneratedAudios.ts | 1 + .../features/home/model/useHomeController.ts | 15 ++++ .../features/home/model/useHomePersistence.ts | 14 ++++ .../features/home/ui/GeneratedAudiosPanel.tsx | 83 ++++++++++++++++++- frontend/src/features/home/ui/HomePage.tsx | 4 + models/CosyVoice/cosyvoice_server.py | 31 +++++-- 14 files changed, 261 insertions(+), 22 deletions(-) diff --git a/Docs/COSYVOICE3_DEPLOY.md b/Docs/COSYVOICE3_DEPLOY.md index 4318a53..8b8834d 100644 --- a/Docs/COSYVOICE3_DEPLOY.md +++ b/Docs/COSYVOICE3_DEPLOY.md @@ -70,6 +70,18 @@ run_cosyvoice.sh # PM2 启动脚本 | ref_text | string | 是 | 参考音频的转写文字 | | language | string | 否 | 语言 (默认 "Chinese",CosyVoice 自动检测) | | speed | float | 否 | 语速 (默认 1.0,范围 0.5-2.0,建议 0.8-1.2) | +| instruct_text | string | 否 | 语气指令 (默认 "",非空时切换为 `inference_instruct2` 模式) | + +**推理模式分支:** +- `instruct_text` 为空 → `inference_zero_shot(text, prompt_text, ref_audio)` — 纯声音克隆 +- `instruct_text` 非空 → `inference_instruct2(text, instruct_text, ref_audio)` — 带语气/情绪控制的声音克隆 + +**支持的语气指令示例:** +``` +"You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>" +"You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>" +"You are a helpful assistant. 
请非常生气地说一句话。<|endofprompt|>" +``` **返回:** WAV 音频文件 diff --git a/Docs/DevLogs/Day29.md b/Docs/DevLogs/Day29.md index 38d9bae..224e863 100644 --- a/Docs/DevLogs/Day29.md +++ b/Docs/DevLogs/Day29.md @@ -1,8 +1,8 @@ -## 字幕同步修复 + 嘴型参数调优 + 视频流水线全面优化 + 预览背景修复 (Day 29) +## 字幕同步修复 + 嘴型参数调优 + 视频流水线全面优化 + 预览背景修复 + CosyVoice 语气控制 (Day 29) ### 概述 -本轮对视频生成流水线做全面审查优化:修复字幕与语音不同步问题(Whisper 时间戳平滑 + 原文节奏映射)、调优 LatentSync 嘴型参数、compose 流复制省去冗余重编码、FFmpeg 超时保护、全局并发限制、Redis 任务 TTL、临时文件清理、死代码移除。同时修复因前端域名迁移(`vigent.hbyrkj.top` → `ipagent.ai-labz.cn`)导致的样式预览背景 CORS 失效问题。 +本轮对视频生成流水线做全面审查优化:修复字幕与语音不同步问题(Whisper 时间戳平滑 + 原文节奏映射)、调优 LatentSync 嘴型参数、compose 流复制省去冗余重编码、FFmpeg 超时保护、全局并发限制、Redis 任务 TTL、临时文件清理、死代码移除。修复因前端域名迁移导致的样式预览背景 CORS 失效问题。新增 CosyVoice 语气控制功能,声音克隆模式下支持开心/伤心/生气等情绪表达(基于 `inference_instruct2`)。 --- @@ -194,6 +194,79 @@ ALIPAY_RETURN_URL=https://ipagent.ai-labz.cn/pay --- +### 12. CosyVoice 语气控制功能 + +- **功能**: 声音克隆模式下新增"语气"下拉菜单(正常/欢快/低沉/严肃),利用 CosyVoice3 的 `inference_instruct2()` 方法通过自然语言指令控制语气情绪 +- **默认行为不变**: 选择"正常"时仍走 `inference_zero_shot()`,与改动前完全一致 + +#### 数据流 + +``` +用户选择语气 → setEmotion("happy") → localStorage 持久化 + → 生成配音 → emotion 映射为 instruct_text + → POST /api/generated-audios/generate { instruct_text } + → voice_clone_service → POST localhost:8010/generate { instruct_text } + → instruct_text 非空 ? 
inference_instruct2() : inference_zero_shot() +``` + +#### CosyVoice 服务 — `cosyvoice_server.py` + +- `/generate` 端点新增 `instruct_text: str = Form("")` 参数 +- 推理分支:空 → `inference_zero_shot()`,非空 → `inference_instruct2(text, instruct_text, ref_audio_path, ...)` +- `inference_instruct2` 不需要 `prompt_text`,直接接受 `instruct_text` + `prompt_wav` + +#### 后端透传 + +- `schemas.py`: `GenerateAudioRequest` 新增 `instruct_text: Optional[str] = None` +- `service.py`: `generate_audio_task()` voiceclone 分支传递 `instruct_text=req.instruct_text or ""` +- `voice_clone_service.py`: `_generate_once()` 和 `generate_audio()` 新增 `instruct_text` 参数 + +#### 前端 + +- `useHomeController.ts`: 新增 `emotion` state + `emotionToInstruct` 映射表 +- `useHomePersistence.ts`: 语气选择持久化到 localStorage +- `useGeneratedAudios.ts`: `generateAudio` params 新增 `instruct_text` +- `GeneratedAudiosPanel.tsx`: 语气下拉菜单(语速按钮左侧),复用语速下拉样式,仅 voiceclone 模式可见 +- `HomePage.tsx`: 透传 `emotion`/`onEmotionChange` + +#### instruct_text 格式(来自 CosyVoice3 instruct_list) + +``` +正常: ""(走 inference_zero_shot) +欢快: "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>" +低沉: "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>" +严肃: "You are a helpful assistant. 
请非常生气地说一句话。<|endofprompt|>" +``` + +--- + +## 📁 修改文件清单 + +| 文件 | 改动 | +|------|------| +| `backend/app/services/whisper_service.py` | 时间戳平滑 + 原文节奏映射 + 单字时长钳位 | +| `remotion/src/utils/captions.ts` | 新增 `getCurrentSegment` / `getCurrentWordIndex` | +| `backend/app/services/video_service.py` | compose 流复制 + FFmpeg 超时保护 | +| `backend/app/modules/videos/workflow.py` | Semaphore(2) 并发限制 + 字体清理 + Whisper 逻辑去重 | +| `backend/app/modules/videos/task_store.py` | Redis TTL + 索引过期清理 | +| `backend/app/services/lipsync_service.py` | 删除 `_preprocess_video()` 死代码 | +| `backend/app/services/remotion_service.py` | concurrency 16 → 4 | +| `remotion/render.ts` | 新增 concurrency 参数支持 | +| `backend/app/modules/materials/router.py` | 新增 `/stream/{material_id}` 同源代理端点 | +| `frontend/.../useVideoFrameCapture.ts` | 移除 crossOrigin | +| `frontend/.../useHomeController.ts` | 帧截取 URL 改用同源代理 + emotion state + emotionToInstruct 映射 | +| `backend/.env` | 嘴型参数 + 支付宝域名更新 | +| `models/CosyVoice/cosyvoice_server.py` | `/generate` 新增 `instruct_text` 参数,分支 `inference_instruct2` / `inference_zero_shot` | +| `backend/app/services/voice_clone_service.py` | `_generate_once` / `generate_audio` 新增 `instruct_text` 透传 | +| `backend/app/modules/generated_audios/schemas.py` | `GenerateAudioRequest` 新增 `instruct_text` 字段 | +| `backend/app/modules/generated_audios/service.py` | voiceclone 分支传递 `instruct_text` | +| `frontend/.../useGeneratedAudios.ts` | `generateAudio` params 新增 `instruct_text` | +| `frontend/.../useHomePersistence.ts` | emotion 持久化 (localStorage) | +| `frontend/.../GeneratedAudiosPanel.tsx` | 语气下拉菜单 UI (embedded + standalone) | +| `frontend/.../HomePage.tsx` | 透传 emotion / onEmotionChange | + +--- + ## 🔍 验证 1. **字幕同步**: 生成视频观察逐字高亮,不应出现超前/滞后/跳空 @@ -204,3 +277,7 @@ ALIPAY_RETURN_URL=https://ipagent.ai-labz.cn/pay 6. **字体清理**: 生成视频后 temp 目录不应残留字体文件 7. **预览背景**: 选择素材 → 点击"预览样式",应显示视频第一帧(非渐变) 8. **支付宝**: 发起支付后回调和跳转地址为新域名 +9. **语气控制**: 声音克隆模式选择"欢快"/"严肃"生成配音,CosyVoice 日志出现 `🎭 Instruct mode`,音频语气有明显变化 +10. 
**语气默认**: 选择"正常"时行为与改动前完全相同(走 `inference_zero_shot`) +11. **语气持久化**: 切换语气后刷新页面,下拉菜单恢复上次选择 +12. **语气可见性**: 语气下拉仅在 voiceclone 模式显示,edgetts 模式不显示 diff --git a/Docs/FRONTEND_README.md b/Docs/FRONTEND_README.md index cad4339..ff197eb 100644 --- a/Docs/FRONTEND_README.md +++ b/Docs/FRONTEND_README.md @@ -37,6 +37,7 @@ ViGent2 的前端界面,采用 Next.js 16 + TailwindCSS 构建。 - **重新识别**: 旧参考音频可重新转写并截取 (RotateCw 按钮)。 - **一键克隆**: 选择参考音频后自动调用 CosyVoice 3.0 服务。 - **语速控制**: 声音克隆模式下支持 5 档语速 (0.8-1.2),选择持久化 (Day 23)。 +- **语气控制**: 声音克隆模式下支持 4 种语气 (正常/欢快/低沉/严肃),基于 CosyVoice3 `inference_instruct2`,选择持久化 (Day 29)。 - **多语言支持**: EdgeTTS 10 语言声音列表,声音克隆 language 透传 (Day 22)。 ### 4. 配音前置 + 时间轴编排 [Day 23 新增] diff --git a/Docs/task_complete.md b/Docs/task_complete.md index 0d4a85d..dbf67d3 100644 --- a/Docs/task_complete.md +++ b/Docs/task_complete.md @@ -1,8 +1,8 @@ # ViGent2 开发任务清单 (Task Log) **项目**: ViGent2 数字人口播视频生成系统 -**进度**: 100% (Day 28 - CosyVoice FP16 加速 + 文档全面更新) -**更新时间**: 2026-02-27 +**进度**: 100% (Day 29 - 视频流水线优化 + CosyVoice 语气控制) +**更新时间**: 2026-02-28 --- @@ -10,7 +10,18 @@ > 这里记录了每一天的核心开发内容与 milestone。 -### Day 28: CosyVoice FP16 加速 + 文档全面更新 (Current) +### Day 29: 视频流水线优化 + CosyVoice 语气控制 (Current) +- [x] **字幕同步修复**: Whisper 时间戳三步平滑(单调递增+重叠消除+间隙填补)+ 原文节奏映射(线性插值 + 单字时长钳位)。 +- [x] **LatentSync 嘴型参数调优**: inference_steps 16→20, guidance_scale 2.0, DeepCache 启用, Remotion concurrency 16→4。 +- [x] **compose 流复制**: 不循环时 `-c:v copy` 替代 libx264 重编码,compose 耗时从分钟级降到秒级。 +- [x] **FFmpeg 超时保护**: `_run_ffmpeg()` timeout=600, `_get_duration()` timeout=30。 +- [x] **全局并发限制**: `asyncio.Semaphore(2)` 控制同时运行的生成任务数。 +- [x] **Redis 任务 TTL**: create 24h, completed/failed 2h, list 自动清理过期索引。 +- [x] **临时字体清理**: 字体文件加入 temp_files 清理列表。 +- [x] **预览背景 CORS 修复**: 素材同源代理 `/api/materials/stream/{id}` 彻底绕开跨域。 +- [x] **CosyVoice 语气控制**: 声音克隆模式新增语气下拉(正常/欢快/低沉/严肃),基于 `inference_instruct2()` 自然语言指令控制情绪,全链路透传 instruct_text,默认"正常"行为不变。 + +### Day 28: CosyVoice FP16 加速 + 文档全面更新 - [x] **CosyVoice FP16 半精度加速**: `AutoModel()` 开启 
`fp16=True`,LLM 推理和 Flow Matching 自动混合精度运行,预估提速 30-40%、显存降低 ~30%。 - [x] **文档全面更新**: README.md / DEPLOY_MANUAL.md / SUBTITLE_DEPLOY.md / BACKEND_README.md 补充 MuseTalk 混合唇形同步方案、性能优化、Remotion 并发渲染等内容。 @@ -258,7 +269,7 @@ | **核心 API** | 100% | ✅ 稳定 | | **Web UI** | 100% | ✅ 稳定 (移动端适配) | | **唇形同步** | 100% | ✅ LatentSync 1.6 | -| **TTS 配音** | 100% | ✅ EdgeTTS + CosyVoice 3.0 + 配音前置 + 时间轴编排 + 自动转写 + 语速控制 | +| **TTS 配音** | 100% | ✅ EdgeTTS + CosyVoice 3.0 + 配音前置 + 时间轴编排 + 自动转写 + 语速控制 + 语气控制 | | **自动发布** | 100% | ✅ 抖音/微信视频号/B站/小红书 | | **用户认证** | 100% | ✅ 手机号 + JWT | | **付费会员** | 100% | ✅ 支付宝电脑网站支付 + 自动激活 | diff --git a/README.md b/README.md index 0e65d65..e652bdb 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ### 核心能力 - 🎬 **高清唇形同步** - 混合方案:短视频 (<120s) 用 LatentSync 1.6 (高质量 Latent Diffusion),长视频 (>=120s) 用 MuseTalk 1.5 (实时级单步推理),自动路由 + 回退。 -- 🎙️ **多模态配音** - 支持 **EdgeTTS** (微软超自然语音, 10 语言) 和 **CosyVoice 3.0** (3秒极速声音克隆, 9语言+18方言, 语速可调)。上传参考音频自动 Whisper 转写 + 智能截取。配音前置工作流:先生成配音 → 选素材 → 生成视频。 +- 🎙️ **多模态配音** - 支持 **EdgeTTS** (微软超自然语音, 10 语言) 和 **CosyVoice 3.0** (3秒极速声音克隆, 9语言+18方言, 语速/语气可调)。上传参考音频自动 Whisper 转写 + 智能截取。配音前置工作流:先生成配音 → 选素材 → 生成视频。 - 📝 **智能字幕** - 集成 faster-whisper + Remotion,自动生成逐字高亮 (卡拉OK效果) 字幕。 - 🎨 **样式预设** - 12 种标题 + 8 种字幕样式预设,支持预览 + 字号调节 + 自定义字体库。CSS 原生描边渲染,清晰无重影。 - 🏷️ **标题显示模式** - 片头标题支持 `短暂显示` / `常驻显示`,默认短暂显示(4秒),用户偏好自动持久化。 diff --git a/backend/app/modules/generated_audios/schemas.py b/backend/app/modules/generated_audios/schemas.py index 30311f5..8f3c29f 100644 --- a/backend/app/modules/generated_audios/schemas.py +++ b/backend/app/modules/generated_audios/schemas.py @@ -10,6 +10,7 @@ class GenerateAudioRequest(BaseModel): ref_text: Optional[str] = None language: str = "zh-CN" speed: float = 1.0 + instruct_text: Optional[str] = None class RenameAudioRequest(BaseModel): diff --git a/backend/app/modules/generated_audios/service.py b/backend/app/modules/generated_audios/service.py index 38263de..dc6e88a 100644 --- 
a/backend/app/modules/generated_audios/service.py +++ b/backend/app/modules/generated_audios/service.py @@ -81,6 +81,7 @@ async def generate_audio_task(task_id: str, req: GenerateAudioRequest, user_id: output_path=audio_path, language=_locale_to_tts_lang(req.language), speed=req.speed, + instruct_text=req.instruct_text or "", ) finally: if os.path.exists(ref_local): diff --git a/backend/app/services/voice_clone_service.py b/backend/app/services/voice_clone_service.py index 9dbb19b..a376450 100644 --- a/backend/app/services/voice_clone_service.py +++ b/backend/app/services/voice_clone_service.py @@ -32,6 +32,7 @@ class VoiceCloneService: ref_text: str, language: str, speed: float = 1.0, + instruct_text: str = "", max_retries: int = 4, ) -> bytes: timeout = httpx.Timeout(240.0) @@ -39,15 +40,18 @@ class VoiceCloneService: for attempt in range(max_retries): try: async with httpx.AsyncClient(timeout=timeout) as client: + data = { + "text": text, + "ref_text": ref_text, + "language": language, + "speed": str(speed), + } + if instruct_text: + data["instruct_text"] = instruct_text response = await client.post( f"{self.base_url}/generate", files={"ref_audio": ("ref.wav", ref_audio_data, "audio/wav")}, - data={ - "text": text, - "ref_text": ref_text, - "language": language, - "speed": str(speed), - }, + data=data, ) retryable = False @@ -99,6 +103,7 @@ class VoiceCloneService: output_path: str, language: str = "Chinese", speed: float = 1.0, + instruct_text: str = "", ) -> str: """ 使用声音克隆生成语音 @@ -132,6 +137,7 @@ class VoiceCloneService: ref_text=ref_text, language=language, speed=speed, + instruct_text=instruct_text, ) with open(output_path, "wb") as f: f.write(audio_bytes) diff --git a/frontend/src/features/home/model/useGeneratedAudios.ts b/frontend/src/features/home/model/useGeneratedAudios.ts index 27189e6..c37211c 100644 --- a/frontend/src/features/home/model/useGeneratedAudios.ts +++ b/frontend/src/features/home/model/useGeneratedAudios.ts @@ -127,6 +127,7 @@ export 
const useGeneratedAudios = ({ ref_text?: string; language: string; speed?: number; + instruct_text?: string; }) => { setIsGeneratingAudio(true); setAudioTask({ status: "pending", progress: 0, message: "正在提交..." }); diff --git a/frontend/src/features/home/model/useHomeController.ts b/frontend/src/features/home/model/useHomeController.ts index 328aff7..7d58b0f 100644 --- a/frontend/src/features/home/model/useHomeController.ts +++ b/frontend/src/features/home/model/useHomeController.ts @@ -182,6 +182,9 @@ export const useHomeController = () => { // 语速控制 const [speed, setSpeed] = useState(1.0); + // 语气控制(仅声音克隆模式) + const [emotion, setEmotion] = useState("normal"); + // ClipTrimmer 模态框状态 const [clipTrimmerOpen, setClipTrimmerOpen] = useState(false); const [clipTrimmerSegmentId, setClipTrimmerSegmentId] = useState(null); @@ -502,6 +505,8 @@ export const useHomeController = () => { setSelectedAudioId, speed, setSpeed, + emotion, + setEmotion, }); const { savedScripts, saveScript, deleteScript: deleteSavedScript } = useSavedScripts(storageKey); @@ -876,6 +881,13 @@ export const useHomeController = () => { return; } + const emotionToInstruct: Record = { + normal: "", + happy: "You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>", + sad: "You are a helpful assistant. 请非常伤心地说一句话。<|endofprompt|>", + angry: "You are a helpful assistant. 请非常生气地说一句话。<|endofprompt|>", + }; + const params = { text: text.trim(), tts_mode: ttsMode, @@ -884,6 +896,7 @@ export const useHomeController = () => { ref_text: ttsMode === "voiceclone" ? refText : undefined, language: textLang, speed: ttsMode === "voiceclone" ? speed : undefined, + instruct_text: ttsMode === "voiceclone" ? 
emotionToInstruct[emotion] || "" : undefined, }; await generateAudio(params); }; @@ -1215,6 +1228,8 @@ export const useHomeController = () => { selectAudio, speed, setSpeed, + emotion, + setEmotion, timelineSegments, reorderSegments, setSourceRange, diff --git a/frontend/src/features/home/model/useHomePersistence.ts b/frontend/src/features/home/model/useHomePersistence.ts index 9fde59c..b2ba41f 100644 --- a/frontend/src/features/home/model/useHomePersistence.ts +++ b/frontend/src/features/home/model/useHomePersistence.ts @@ -65,6 +65,8 @@ interface UseHomePersistenceOptions { setSelectedAudioId: React.Dispatch>; speed: number; setSpeed: React.Dispatch>; + emotion: string; + setEmotion: React.Dispatch>; } export const useHomePersistence = ({ @@ -122,6 +124,8 @@ export const useHomePersistence = ({ setSelectedAudioId, speed, setSpeed, + emotion, + setEmotion, }: UseHomePersistenceOptions) => { const [isRestored, setIsRestored] = useState(false); @@ -153,6 +157,7 @@ export const useHomePersistence = ({ const savedSubtitleBottomMargin = localStorage.getItem(`vigent_${storageKey}_subtitleBottomMargin`); const savedOutputAspectRatio = localStorage.getItem(`vigent_${storageKey}_outputAspectRatio`); const savedSpeed = localStorage.getItem(`vigent_${storageKey}_speed`); + const savedEmotion = localStorage.getItem(`vigent_${storageKey}_emotion`); setText(savedText || "大家好,欢迎来到我的频道,今天给大家分享一些有趣的内容。"); setVideoTitle(savedTitle ? 
clampTitle(savedTitle) : ""); @@ -235,6 +240,8 @@ export const useHomePersistence = ({ if (!Number.isNaN(parsed)) setSpeed(parsed); } + if (savedEmotion) setEmotion(savedEmotion); + // eslint-disable-next-line react-hooks/set-state-in-effect setIsRestored(true); }, [ @@ -249,6 +256,7 @@ export const useHomePersistence = ({ setSelectedVideoId, setSelectedAudioId, setSpeed, + setEmotion, setSubtitleFontSize, setSubtitleSizeLocked, setText, @@ -427,5 +435,11 @@ export const useHomePersistence = ({ } }, [speed, storageKey, isRestored]); + useEffect(() => { + if (isRestored) { + localStorage.setItem(`vigent_${storageKey}_emotion`, emotion); + } + }, [emotion, storageKey, isRestored]); + return { isRestored }; }; diff --git a/frontend/src/features/home/ui/GeneratedAudiosPanel.tsx b/frontend/src/features/home/ui/GeneratedAudiosPanel.tsx index c2e901a..5b52171 100644 --- a/frontend/src/features/home/ui/GeneratedAudiosPanel.tsx +++ b/frontend/src/features/home/ui/GeneratedAudiosPanel.tsx @@ -23,6 +23,8 @@ interface GeneratedAudiosPanelProps { speed: number; onSpeedChange: (speed: number) => void; ttsMode: string; + emotion: string; + onEmotionChange: (e: string) => void; embedded?: boolean; } @@ -41,14 +43,18 @@ export function GeneratedAudiosPanel({ speed, onSpeedChange, ttsMode, + emotion, + onEmotionChange, embedded = false, }: GeneratedAudiosPanelProps) { const [editingId, setEditingId] = useState(null); const [editName, setEditName] = useState(""); const [playingId, setPlayingId] = useState(null); const [speedOpen, setSpeedOpen] = useState(false); + const [emotionOpen, setEmotionOpen] = useState(false); const audioRef = useRef(null); const speedRef = useRef(null); + const emotionRef = useRef(null); const stopPlaying = useCallback(() => { if (audioRef.current) { @@ -80,6 +86,17 @@ export function GeneratedAudiosPanel({ return () => document.removeEventListener("mousedown", handler); }, [speedOpen]); + // Close emotion dropdown on click outside + useEffect(() => { + 
const handler = (e: MouseEvent) => { + if (emotionRef.current && !emotionRef.current.contains(e.target as Node)) { + setEmotionOpen(false); + } + }; + if (emotionOpen) document.addEventListener("mousedown", handler); + return () => document.removeEventListener("mousedown", handler); + }, [emotionOpen]); + const togglePlay = (audio: GeneratedAudio, e: React.MouseEvent) => { e.stopPropagation(); if (playingId === audio.id) { @@ -125,12 +142,48 @@ export function GeneratedAudiosPanel({ ] as const; const currentSpeedLabel = speedOptions.find((o) => o.value === speed)?.label ?? "正常"; + const emotionOptions = [ + { value: "normal", label: "正常" }, + { value: "happy", label: "欢快" }, + { value: "sad", label: "低沉" }, + { value: "angry", label: "严肃" }, + ] as const; + const currentEmotionLabel = emotionOptions.find((o) => o.value === emotion)?.label ?? "正常"; + const content = ( <> {embedded ? ( <> - {/* Row 1: 语速 + 生成配音 (right-aligned) */} + {/* Row 1: 语气 + 语速 + 生成配音 (right-aligned) */}
+ {ttsMode === "voiceclone" && ( +
+ + {emotionOpen && ( +
+ {emotionOptions.map((opt) => ( + + ))} +
+ )} +
+ )} {ttsMode === "voiceclone" && (
+ {emotionOpen && ( +
+ {emotionOptions.map((opt) => ( + + ))} +
+ )} +
+ )} {ttsMode === "voiceclone" && (
diff --git a/models/CosyVoice/cosyvoice_server.py b/models/CosyVoice/cosyvoice_server.py index ce2425f..168bfcc 100644 --- a/models/CosyVoice/cosyvoice_server.py +++ b/models/CosyVoice/cosyvoice_server.py @@ -174,6 +174,7 @@ async def generate( ref_text: str = Form(...), language: str = Form("Chinese"), speed: float = Form(1.0), + instruct_text: str = Form(""), ): """ 声音克隆生成 @@ -236,16 +237,30 @@ async def generate( # CosyVoice3 的 prompt_text 格式 prompt_text = f"You are a helpful assistant.<|endofprompt|>{ref_text}" + use_instruct = bool(instruct_text.strip()) + if use_instruct: + print(f"🎭 Instruct mode: {instruct_text[:60]}...") + def _do_inference(): """在线程池中执行推理""" - results = list(_model.inference_zero_shot( - text, - prompt_text, - ref_audio_path, - stream=False, - speed=speed, - text_frontend=True, - )) + if use_instruct: + results = list(_model.inference_instruct2( + text, + instruct_text, + ref_audio_path, + stream=False, + speed=speed, + text_frontend=True, + )) + else: + results = list(_model.inference_zero_shot( + text, + prompt_text, + ref_audio_path, + stream=False, + speed=speed, + text_frontend=True, + )) if not results: raise RuntimeError("CosyVoice returned empty results")