# ai_voice_pipeline.py
# -*- coding: utf-8 -*-
"""
AI 语音交互管道 - Day 21

整合 SenseVoice + GLM-4.5-Flash + EdgeTTS

流程:
1. 客户端 VAD 检测语音结束
2. 发送完整音频到服务器
3. SenseVoice 识别 → GLM 生成回复 → EdgeTTS 合成语音
4. 流式返回 PCM 音频
"""

import asyncio
from typing import Optional, Callable, AsyncGenerator

# Project-local pipeline stages
from sensevoice_asr import recognize as asr_recognize, init_sensevoice
from glm_client import chat as llm_chat, chat_stream as llm_chat_stream
from edge_tts_client import (
    text_to_speech_pcm_stream,
    text_to_speech_pcm,
    DEFAULT_VOICE,
)

async def init_pipeline():
    """Initialize the AI pipeline; call once at server startup.

    Loads the SenseVoice ASR model so the first voice request does not
    pay the model-loading cost.
    """
    await init_sensevoice()
    print("[AI Pipeline] 初始化完成")

|
async def process_voice(
    pcm_audio: bytes,
    image_base64: Optional[str] = None,
    on_text: Optional[Callable[[str], None]] = None,
    on_audio: Optional[Callable[[bytes], None]] = None,
) -> str:
    """Run one full voice turn: ASR -> LLM -> TTS.

    Args:
        pcm_audio: PCM16 audio data (16 kHz, mono).
        image_base64: Optional base64-encoded image for multimodal input.
        on_text: Callback fired with display text for the UI.
        on_audio: Callback fired with each synthesized PCM chunk for
            streaming playback.

    Returns:
        The AI reply text; "" when nothing was recognized or generated.
    """
    # Step 1: speech recognition.
    transcript = await asr_recognize(pcm_audio)
    if not transcript:
        print("[AI Pipeline] 未识别到有效语音")
        return ""

    print(f"[AI Pipeline] 用户说: {transcript}")
    if on_text:
        # Notify the UI of what the user said.
        on_text(f"用户: {transcript}")

    # Step 2: LLM reply generation.
    reply = await llm_chat(transcript, image_base64)
    if not reply:
        print("[AI Pipeline] AI 无回复")
        return ""

    print(f"[AI Pipeline] AI 回复: {reply}")
    if on_text:
        # Notify the UI of the AI reply.
        on_text(f"AI: {reply}")

    # Step 3: TTS synthesis, streamed to the audio callback when provided.
    if on_audio:
        async for chunk in text_to_speech_pcm_stream(reply):
            on_audio(chunk)

    return reply

|
async def process_voice_stream(
    pcm_audio: bytes,
    image_base64: Optional[str] = None,
) -> AsyncGenerator[tuple, None]:
    """Streaming voice turn: yield text and audio fragments as they arrive.

    Args:
        pcm_audio: PCM16 audio data.
        image_base64: Optional base64-encoded image.

    Yields:
        ("user_text", str) - the recognized user utterance (once)
        ("ai_text", str)   - incremental LLM text fragments
        ("audio", bytes)   - synthesized PCM audio fragments
    """
    # Step 1: speech recognition.
    transcript = await asr_recognize(pcm_audio)
    if not transcript:
        return

    yield ("user_text", transcript)

    # Steps 2+3 interleaved: stream LLM text, and buffer it until a
    # sentence boundary before handing it to TTS so the synthesized
    # audio sounds natural.
    pending = ""
    # Both full-width (CJK) and ASCII sentence punctuation.
    boundaries = tuple("。,!?;:,.!?;:")

    async for piece in llm_chat_stream(transcript, image_base64):
        yield ("ai_text", piece)
        pending += piece

        # Flush the buffer to TTS whenever it ends on punctuation.
        if pending.endswith(boundaries):
            async for chunk in text_to_speech_pcm_stream(pending):
                yield ("audio", chunk)
            pending = ""

    # Synthesize whatever trailing text never hit a punctuation mark.
    if pending.strip():
        async for chunk in text_to_speech_pcm_stream(pending):
            yield ("audio", chunk)

|
async def text_to_voice(text: str) -> bytes:
    """Synthesize text to speech in one shot (e.g. navigation prompts).

    Args:
        text: Text to synthesize.

    Returns:
        PCM16 audio data.
    """
    pcm = await text_to_speech_pcm(text)
    return pcm

|
async def text_to_voice_stream(text: str) -> AsyncGenerator[bytes, None]:
    """Synthesize text to speech as a stream.

    Args:
        text: Text to synthesize.

    Yields:
        PCM16 audio chunks.
    """
    stream = text_to_speech_pcm_stream(text)
    async for pcm_chunk in stream:
        yield pcm_chunk