Files
NaviGlassServer/ai_voice_pipeline.py
2025-12-31 15:42:30 +08:00

155 lines
3.9 KiB
Python

# ai_voice_pipeline.py
# -*- coding: utf-8 -*-
"""
AI 语音交互管道 - Day 21
整合 SenseVoice + GLM-4.5-Flash + EdgeTTS
流程:
1. 客户端 VAD 检测语音结束
2. 发送完整音频到服务器
3. SenseVoice 识别 → GLM 生成回复 → EdgeTTS 合成语音
4. 流式返回 PCM 音频
"""
import asyncio
from typing import Optional, Callable, AsyncGenerator
# 导入各模块
from sensevoice_asr import recognize as asr_recognize, init_sensevoice
from glm_client import chat as llm_chat, chat_stream as llm_chat_stream
from edge_tts_client import (
text_to_speech_pcm_stream,
text_to_speech_pcm,
DEFAULT_VOICE,
)
async def init_pipeline():
    """Initialize the AI voice pipeline (invoked once at server startup).

    Loads the SenseVoice ASR model before the server accepts traffic.
    """
    await init_sensevoice()
    print("[AI Pipeline] 初始化完成")
async def process_voice(
    pcm_audio: bytes,
    image_base64: Optional[str] = None,
    on_text: Optional[Callable[[str], None]] = None,
    on_audio: Optional[Callable[[bytes], None]] = None,
) -> str:
    """Run one complete voice interaction: ASR -> LLM -> TTS.

    Args:
        pcm_audio: Raw PCM16 audio (16 kHz, mono) from the client.
        image_base64: Optional base64-encoded image for multimodal prompts.
        on_text: Optional callback receiving display text for the UI.
        on_audio: Optional callback receiving each synthesized PCM chunk.

    Returns:
        The AI reply text, or "" when recognition or generation yields nothing.
    """
    # Step 1: speech recognition.
    user_text = await asr_recognize(pcm_audio)
    if not user_text:
        print("[AI Pipeline] 未识别到有效语音")
        return ""
    print(f"[AI Pipeline] 用户说: {user_text}")
    if on_text is not None:
        on_text(f"用户: {user_text}")

    # Step 2: generate the reply with the LLM.
    ai_response = await llm_chat(user_text, image_base64)
    if not ai_response:
        print("[AI Pipeline] AI 无回复")
        return ""
    print(f"[AI Pipeline] AI 回复: {ai_response}")
    if on_text is not None:
        on_text(f"AI: {ai_response}")

    # Step 3: synthesize speech and stream chunks back for playback.
    if on_audio is not None:
        async for audio_chunk in text_to_speech_pcm_stream(ai_response):
            on_audio(audio_chunk)
    return ai_response
async def process_voice_stream(
    pcm_audio: bytes,
    image_base64: Optional[str] = None,
) -> AsyncGenerator[tuple, None]:
    """Streamed variant of the voice pipeline.

    Args:
        pcm_audio: Raw PCM16 audio (16 kHz, mono).
        image_base64: Optional base64-encoded image for multimodal prompts.

    Yields:
        ("user_text", str): the recognized user utterance (emitted once).
        ("ai_text", str): each incremental LLM text chunk.
        ("audio", bytes): synthesized PCM chunks, interleaved with the text.
    """
    # Step 1: speech recognition; end the stream silently on an empty result.
    user_text = await asr_recognize(pcm_audio)
    if not user_text:
        return
    yield ("user_text", user_text)

    # Steps 2+3: stream LLM text while buffering it; flush the buffer to TTS
    # whenever it ends on a punctuation mark (full- or half-width).
    boundary_chars = set("。,!?;:,.!?;:")
    pending = ""
    async for text_chunk in llm_chat_stream(user_text, image_base64):
        yield ("ai_text", text_chunk)
        pending += text_chunk
        if pending and pending[-1] in boundary_chars:
            async for audio_chunk in text_to_speech_pcm_stream(pending):
                yield ("audio", audio_chunk)
            pending = ""

    # Flush trailing text that never reached a punctuation boundary.
    if pending.strip():
        async for audio_chunk in text_to_speech_pcm_stream(pending):
            yield ("audio", audio_chunk)
async def text_to_voice(text: str) -> bytes:
    """Synthesize text to a single PCM16 buffer (e.g. navigation prompts).

    Args:
        text: Text to synthesize.

    Returns:
        PCM16 audio bytes.
    """
    pcm = await text_to_speech_pcm(text)
    return pcm
async def text_to_voice_stream(text: str) -> AsyncGenerator[bytes, None]:
    """Synthesize text to speech, yielding PCM16 chunks as they arrive.

    Args:
        text: Text to synthesize.

    Yields:
        PCM16 audio chunks.
    """
    async for pcm_chunk in text_to_speech_pcm_stream(text):
        yield pcm_chunk