# edge_tts_client.py
# -*- coding: utf-8 -*-
"""
EdgeTTS streaming speech-synthesis client - Day 21

Features:
- Completely free
- Streaming output (audio can be played while synthesis is still running)
- Low latency
"""

import asyncio
import os
from typing import AsyncGenerator, Optional

import edge_tts

# Default voice
DEFAULT_VOICE = os.getenv("EDGE_TTS_VOICE", "zh-CN-XiaoxiaoNeural")

# Speaking-rate adjustment ("+0%", "+10%", "-10%", ...)
DEFAULT_RATE = os.getenv("EDGE_TTS_RATE", "+0%")

# Volume adjustment
DEFAULT_VOLUME = os.getenv("EDGE_TTS_VOLUME", "+0%")


async def text_to_speech_stream(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    volume: str = DEFAULT_VOLUME,
) -> AsyncGenerator[bytes, None]:
    """
    Stream text-to-speech audio.

    Empty or whitespace-only text yields nothing. Synthesis errors are
    reported with ``print`` and end the stream; they are not re-raised.

    Args:
        text: Text to synthesize
        voice: Voice name
        rate: Speaking rate
        volume: Volume

    Yields:
        MP3 audio data chunks
    """
    if not text or not text.strip():
        return

    try:
        communicate = edge_tts.Communicate(
            text=text,
            voice=voice,
            rate=rate,
            volume=volume,
        )

        async for chunk in communicate.stream():
            # The stream also carries non-audio chunks; forward audio only.
            if chunk["type"] == "audio":
                yield chunk["data"]

    except Exception as e:
        # Best-effort: report the failure and stop yielding.
        print(f"[EdgeTTS] 合成失败: {e}")


async def text_to_speech(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    volume: str = DEFAULT_VOLUME,
) -> bytes:
    """
    Synthesize the whole text and return the complete audio.

    Because the underlying stream reports errors instead of raising, a failed
    synthesis returns whatever audio was collected before the failure
    (possibly ``b""``).

    Args:
        text: Text to synthesize
        voice: Voice name
        rate: Speaking rate
        volume: Volume

    Returns:
        MP3 audio data
    """
    audio_chunks = [
        chunk
        async for chunk in text_to_speech_stream(text, voice, rate, volume)
    ]
    return b"".join(audio_chunks)


async def text_to_speech_pcm(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    target_sample_rate: int = 16000,
    volume: str = DEFAULT_VOLUME,
) -> bytes:
    """
    Convert text to PCM16 audio (for direct playback).

    Args:
        text: Text to synthesize
        voice: Voice name
        rate: Speaking rate
        target_sample_rate: Target sample rate
        volume: Volume (placed last so existing positional calls keep working)

    Returns:
        Mono 16-bit PCM audio data, or ``b""`` when synthesis produced no
        audio or the MP3 -> PCM conversion failed.
    """
    import io
    # Imported outside the try block so a missing pydub still raises
    # ImportError instead of being reported as a conversion failure.
    from pydub import AudioSegment

    # Fetch the MP3 data
    mp3_data = await text_to_speech(text, voice, rate, volume)

    if not mp3_data:
        return b""

    def _decode() -> bytes:
        # MP3 -> PCM conversion
        audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
        # Set sample rate and channel layout
        audio = audio.set_frame_rate(target_sample_rate)
        audio = audio.set_channels(1)  # mono
        audio = audio.set_sample_width(2)  # 16-bit
        return audio.raw_data

    try:
        # Decoding is blocking work; run it in the default executor so the
        # event loop stays responsive while it runs.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _decode)
    except Exception as e:
        print(f"[EdgeTTS] PCM 转换失败: {e}")
        return b""
# Characters that end a synthesis segment (full-width and ASCII forms).
_SEGMENT_PUNCTUATION = "。,!?;:,.!?;:"

# ASCII separators that belong to a number when they sit between two digits
# ("3.14", "1,000", "12:30") and therefore must not split a segment.
_NUMERIC_SEPARATORS = ".,:"


def _split_text_into_segments(text: str) -> list:
    """Split *text* into punctuation-terminated segments for synthesis.

    Each returned segment is whitespace-stripped and keeps its trailing
    punctuation mark. Separators between two digits do not split, and
    segments made up only of punctuation are dropped because they contain
    nothing to speak.

    Args:
        text: Text to split; empty or ``None`` gives an empty list

    Returns:
        List of non-empty segment strings in original order
    """
    if not text:
        return []

    pieces = []
    start = 0
    last_index = len(text) - 1
    for index, char in enumerate(text):
        if char not in _SEGMENT_PUNCTUATION:
            continue
        inside_number = (
            char in _NUMERIC_SEPARATORS
            and 0 < index < last_index
            and text[index - 1].isdigit()
            and text[index + 1].isdigit()
        )
        if inside_number:
            continue
        pieces.append(text[start:index + 1])
        start = index + 1
    # Trailing text without closing punctuation
    pieces.append(text[start:])

    segments = []
    for piece in pieces:
        piece = piece.strip()
        # Keep only pieces that still have content once punctuation is removed.
        if piece.strip(_SEGMENT_PUNCTUATION).strip():
            segments.append(piece)
    return segments


async def text_to_speech_pcm_stream(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    target_sample_rate: int = 16000,
    volume: str = DEFAULT_VOLUME,
) -> AsyncGenerator[bytes, None]:
    """
    Stream text as PCM16 audio.

    Note: because the MP3 output has to be decoded, synthesis is done
    segment by segment; the text is split at punctuation marks and each
    segment is synthesized and decoded on its own.

    A segment that fails is reported with ``print`` and skipped; the
    remaining segments are still processed.

    Args:
        text: Text to synthesize
        voice: Voice name
        rate: Speaking rate
        target_sample_rate: Target sample rate
        volume: Volume (placed last so existing positional calls keep working)

    Yields:
        Mono 16-bit PCM audio data, one chunk per synthesized segment
    """
    import asyncio
    import io
    from pydub import AudioSegment

    def _decode(mp3_data: bytes) -> bytes:
        # MP3 -> mono 16-bit PCM at the requested sample rate
        audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
        audio = audio.set_frame_rate(target_sample_rate)
        audio = audio.set_channels(1)
        audio = audio.set_sample_width(2)
        return audio.raw_data

    loop = asyncio.get_running_loop()

    # Synthesize segment by segment
    for segment in _split_text_into_segments(text):
        try:
            mp3_data = await text_to_speech(segment, voice, rate, volume)
            if mp3_data:
                # Decoding is blocking work; run it in the default executor
                # so the event loop stays responsive.
                yield await loop.run_in_executor(None, _decode, mp3_data)
        except Exception as e:
            print(f"[EdgeTTS] 分段合成失败: {e}")


# Voice list (commonly used Chinese voices)
CHINESE_VOICES = [
    "zh-CN-XiaoxiaoNeural",  # female, natural
    "zh-CN-YunxiNeural",  # male, natural
    "zh-CN-XiaoyiNeural",  # female, lively
    "zh-CN-YunjianNeural",  # male, newscast style
    "zh-CN-XiaochenNeural",  # female, gentle
]


async def list_voices() -> list:
    """Return the available voices whose ``Locale`` starts with ``"zh"``.

    The full list is fetched with ``edge_tts.list_voices()`` and filtered,
    so voices for other locales are not included in the result.
    """
    voices = await edge_tts.list_voices()
    return [v for v in voices if v["Locale"].startswith("zh")]