# ai_voice_pipeline.py
# -*- coding: utf-8 -*-
"""
AI 语音交互管道 - Day 21

整合 SenseVoice + GLM-4.5-Flash + EdgeTTS

流程:
1. 客户端 VAD 检测语音结束
2. 发送完整音频到服务器
3. SenseVoice 识别 → GLM 生成回复 → EdgeTTS 合成语音
4. 流式返回 PCM 音频
"""

import asyncio
from typing import Optional, Callable, AsyncGenerator

# Project-local pipeline stages
from sensevoice_asr import recognize as asr_recognize, init_sensevoice
from glm_client import chat as llm_chat, chat_stream as llm_chat_stream
from edge_tts_client import (
    text_to_speech_pcm_stream,
    text_to_speech_pcm,
    DEFAULT_VOICE,
)

async def init_pipeline():
    """Initialize the AI pipeline; call once at server startup.

    Loads the SenseVoice ASR model so the first voice request does not
    pay the model-loading cost.
    """
    await init_sensevoice()
    print("[AI Pipeline] 初始化完成")

|
async def process_voice(
    pcm_audio: bytes,
    image_base64: Optional[str] = None,
    on_text: Optional[Callable[[str], None]] = None,
    on_audio: Optional[Callable[[bytes], None]] = None,
) -> str:
    """Run one full voice turn: ASR -> LLM -> TTS.

    Args:
        pcm_audio: PCM16 audio data (16 kHz, mono).
        image_base64: Optional base64-encoded image for multimodal input.
        on_text: Callback fired with display text for the UI.
        on_audio: Callback fired with each synthesized PCM chunk for
            streaming playback.

    Returns:
        The AI reply text; "" when nothing was recognized or generated.
    """
    # Step 1: speech recognition.
    transcript = await asr_recognize(pcm_audio)
    if not transcript:
        print("[AI Pipeline] 未识别到有效语音")
        return ""

    print(f"[AI Pipeline] 用户说: {transcript}")
    if on_text:
        # Notify the UI of what the user said.
        on_text(f"用户: {transcript}")

    # Step 2: LLM reply generation.
    reply = await llm_chat(transcript, image_base64)
    if not reply:
        print("[AI Pipeline] AI 无回复")
        return ""

    print(f"[AI Pipeline] AI 回复: {reply}")
    if on_text:
        # Notify the UI of the AI reply.
        on_text(f"AI: {reply}")

    # Step 3: TTS synthesis, streamed to the audio callback when provided.
    if on_audio:
        async for chunk in text_to_speech_pcm_stream(reply):
            on_audio(chunk)

    return reply

|
async def process_voice_stream(
    pcm_audio: bytes,
    image_base64: Optional[str] = None,
) -> AsyncGenerator[tuple, None]:
    """Streaming voice turn: yield text and audio fragments as they arrive.

    Args:
        pcm_audio: PCM16 audio data.
        image_base64: Optional base64-encoded image.

    Yields:
        ("user_text", str) - the recognized user utterance (once)
        ("ai_text", str)   - incremental LLM text fragments
        ("audio", bytes)   - synthesized PCM audio fragments
    """
    # Step 1: speech recognition.
    transcript = await asr_recognize(pcm_audio)
    if not transcript:
        return

    yield ("user_text", transcript)

    # Steps 2+3 interleaved: stream LLM text, and buffer it until a
    # sentence boundary before handing it to TTS so the synthesized
    # audio sounds natural.
    pending = ""
    # Both full-width (CJK) and ASCII sentence punctuation.
    boundaries = tuple("。,!?;:,.!?;:")

    async for piece in llm_chat_stream(transcript, image_base64):
        yield ("ai_text", piece)
        pending += piece

        # Flush the buffer to TTS whenever it ends on punctuation.
        if pending.endswith(boundaries):
            async for chunk in text_to_speech_pcm_stream(pending):
                yield ("audio", chunk)
            pending = ""

    # Synthesize whatever trailing text never hit a punctuation mark.
    if pending.strip():
        async for chunk in text_to_speech_pcm_stream(pending):
            yield ("audio", chunk)

|
async def text_to_voice(text: str) -> bytes:
    """Synthesize text to speech in one shot (e.g. navigation prompts).

    Args:
        text: Text to synthesize.

    Returns:
        PCM16 audio data.
    """
    pcm = await text_to_speech_pcm(text)
    return pcm

|
async def text_to_voice_stream(text: str) -> AsyncGenerator[bytes, None]:
    """Synthesize text to speech as a stream.

    Args:
        text: Text to synthesize.

    Yields:
        PCM16 audio chunks.
    """
    stream = text_to_speech_pcm_stream(text)
    async for pcm_chunk in stream:
        yield pcm_chunk