203 lines
5.0 KiB
Python
203 lines
5.0 KiB
Python
# edge_tts_client.py
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
EdgeTTS 流式语音合成客户端 - Day 21
|
||
|
||
特点:
|
||
- 完全免费
|
||
- 流式输出(边合成边播放)
|
||
- 低延迟
|
||
"""
|
||
|
||
import os
|
||
import asyncio
|
||
import edge_tts
|
||
from typing import AsyncGenerator, Optional
|
||
|
||
# Default voice; can be overridden with the EDGE_TTS_VOICE environment variable.
DEFAULT_VOICE = os.environ.get("EDGE_TTS_VOICE", "zh-CN-XiaoxiaoNeural")

# Default speaking-rate adjustment ("+0%", "+10%", "-10%", ...);
# can be overridden with the EDGE_TTS_RATE environment variable.
DEFAULT_RATE = os.environ.get("EDGE_TTS_RATE", "+0%")

# Default volume adjustment; can be overridden with the EDGE_TTS_VOLUME
# environment variable.
DEFAULT_VOLUME = os.environ.get("EDGE_TTS_VOLUME", "+0%")
|
||
|
||
|
||
async def text_to_speech_stream(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    volume: str = DEFAULT_VOLUME,
) -> AsyncGenerator[bytes, None]:
    """
    Synthesize ``text`` and yield the audio incrementally.

    Args:
        text: Text to synthesize. Empty or whitespace-only text yields nothing.
        voice: Voice name passed to ``edge_tts.Communicate``.
        rate: Speaking-rate adjustment passed to ``edge_tts.Communicate``.
        volume: Volume adjustment passed to ``edge_tts.Communicate``.

    Yields:
        MP3 audio data chunks, in the order the stream delivers them.

    Note:
        This is best-effort: any ``Exception`` raised while synthesizing is
        printed and the generator simply ends; it is not re-raised.
    """
    # Nothing to say: finish without contacting the service.
    if not (text and text.strip()):
        return

    try:
        synthesizer = edge_tts.Communicate(
            text=text,
            voice=voice,
            rate=rate,
            volume=volume,
        )

        async for event in synthesizer.stream():
            # The stream carries several event types; only audio is forwarded.
            if event["type"] != "audio":
                continue
            yield event["data"]

    except Exception as e:
        print(f"[EdgeTTS] 合成失败: {e}")
|
||
|
||
|
||
async def text_to_speech(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    volume: str = DEFAULT_VOLUME,
) -> bytes:
    """
    Synthesize ``text`` and return the whole audio as a single bytes object.

    Args:
        text: Text to synthesize.
        voice: Voice name.
        rate: Speaking-rate adjustment.
        volume: Volume adjustment.

    Returns:
        All chunks produced by ``text_to_speech_stream`` concatenated (MP3
        data); ``b""`` when the stream produced nothing.
    """
    stream = text_to_speech_stream(text, voice, rate, volume)
    # Drain the stream completely, then join once.
    return b"".join([piece async for piece in stream])
|
||
|
||
|
||
async def text_to_speech_pcm(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    target_sample_rate: int = 16000,
    volume: str = DEFAULT_VOLUME,
) -> bytes:
    """
    Synthesize ``text`` and return it as raw PCM16 audio (for direct playback).

    Args:
        text: Text to synthesize.
        voice: Voice name.
        rate: Speaking-rate adjustment.
        target_sample_rate: Sample rate of the returned PCM data, in Hz.
        volume: Volume adjustment. Added last so existing positional callers
            are unaffected; defaults to the module-wide default volume.

    Returns:
        Mono, 16-bit PCM data at ``target_sample_rate``; ``b""`` when the
        synthesis produced no audio or the MP3 -> PCM conversion failed
        (the conversion error is printed, not raised).
    """
    import io

    from pydub import AudioSegment

    # Fetch the complete MP3 first; it has to be decoded as a whole.
    mp3_data = await text_to_speech(text, voice, rate, volume)

    if not mp3_data:
        return b""

    def _decode() -> bytes:
        """Decode the MP3 bytes and normalise them to mono 16-bit PCM."""
        audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
        audio = audio.set_frame_rate(target_sample_rate)
        audio = audio.set_channels(1)  # mono
        audio = audio.set_sample_width(2)  # 16-bit samples
        return audio.raw_data

    try:
        # The pydub calls are synchronous (decoding presumably shells out to
        # an external decoder); run them in the default executor so other
        # coroutines keep running while the audio is converted.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _decode)

    except Exception as e:
        print(f"[EdgeTTS] PCM 转换失败: {e}")
        return b""
|
||
|
||
|
||
async def text_to_speech_pcm_stream(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    target_sample_rate: int = 16000,
    volume: str = DEFAULT_VOLUME,
) -> AsyncGenerator[bytes, None]:
    """
    Synthesize ``text`` segment by segment and yield PCM16 audio.

    MP3 has to be decoded before it can be played as PCM, so the text is cut
    at punctuation marks and every segment is synthesized and decoded on its
    own; playback can start once the first segment is ready.

    Args:
        text: Text to synthesize. Empty text yields nothing.
        voice: Voice name.
        rate: Speaking-rate adjustment.
        target_sample_rate: Sample rate of the yielded PCM data, in Hz.
        volume: Volume adjustment. Added last so existing positional callers
            are unaffected; defaults to the module-wide default volume.

    Yields:
        One block of mono, 16-bit PCM data per successfully synthesized
        segment. A segment that fails is reported with ``print`` and skipped.
    """
    import io

    from pydub import AudioSegment

    def _decode(mp3_data: bytes) -> bytes:
        """Decode MP3 bytes and normalise them to mono 16-bit PCM."""
        audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
        audio = audio.set_frame_rate(target_sample_rate)
        audio = audio.set_channels(1)  # mono
        audio = audio.set_sample_width(2)  # 16-bit samples
        return audio.raw_data

    loop = asyncio.get_running_loop()

    for segment in _split_into_segments(text):
        try:
            mp3_data = await text_to_speech(segment, voice, rate, volume)
            if not mp3_data:
                continue
            # The pydub calls are synchronous; run them in the default
            # executor so the event loop is not stalled while decoding.
            pcm_data = await loop.run_in_executor(None, _decode, mp3_data)
        except Exception as e:
            print(f"[EdgeTTS] 分段合成失败: {e}")
            continue

        # Yield outside the try block so that exceptions raised by the
        # consumer are not mistaken for synthesis failures and swallowed.
        yield pcm_data


# Characters that end a segment (full-width and ASCII forms).
_SEGMENT_BREAKS = frozenset("。,!?;:,.!?;:")

# ASCII marks that also occur inside numbers ("3.14", "1,000", "12:30").
_NUMERIC_SEPARATORS = frozenset(".,:")


def _split_into_segments(text: str) -> list:
    """Cut ``text`` after punctuation marks into non-empty, speakable segments.

    Compared with a plain split on every punctuation character:

    * a run of marks ("?!", "...") stays attached to the preceding text
      instead of becoming separate punctuation-only segments;
    * ".", "," and ":" between two digits do not split, so numbers and
      times stay in one segment;
    * segments made only of punctuation/whitespace are dropped, since they
      contain nothing to speak and would only cost a synthesis request.
    """
    if not text:
        return []

    pieces = []
    start = 0
    length = len(text)

    for index, char in enumerate(text):
        if char not in _SEGMENT_BREAKS:
            continue

        following = text[index + 1] if index + 1 < length else ""

        # Still inside a run of punctuation: break after its last mark.
        if following in _SEGMENT_BREAKS:
            continue

        # Separator inside a number, e.g. the "." in "3.14".
        if (
            char in _NUMERIC_SEPARATORS
            and index > 0
            and text[index - 1].isdigit()
            and following.isdigit()
        ):
            continue

        pieces.append(text[start:index + 1])
        start = index + 1

    # Whatever follows the last break (possibly empty).
    pieces.append(text[start:])

    segments = []
    for piece in pieces:
        piece = piece.strip()
        has_content = any(
            ch not in _SEGMENT_BREAKS and not ch.isspace() for ch in piece
        )
        if has_content:
            segments.append(piece)
    return segments
|
||
|
||
|
||
# Commonly used Chinese voices.
CHINESE_VOICES = [
    "zh-CN-XiaoxiaoNeural",  # female, natural
    "zh-CN-YunxiNeural",  # male, natural
    "zh-CN-XiaoyiNeural",  # female, lively
    "zh-CN-YunjianNeural",  # male, broadcast style
    "zh-CN-XiaochenNeural",  # female, gentle
]
|
||
|
||
|
||
async def list_voices() -> list:
    """Return the voices reported by edge-tts whose ``Locale`` starts with "zh".

    Returns:
        The matching voice entries, in the order ``edge_tts.list_voices()``
        returned them.
    """
    catalog = await edge_tts.list_voices()

    chinese_voices = []
    for entry in catalog:
        # Keep only Chinese locales (anything beginning with "zh").
        if entry["Locale"].startswith("zh"):
            chinese_voices.append(entry)
    return chinese_voices
|