Files
NaviGlassServer/edge_tts_client.py
2025-12-31 15:42:30 +08:00

203 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# edge_tts_client.py
# -*- coding: utf-8 -*-
"""
EdgeTTS 流式语音合成客户端 - Day 21
特点:
- 完全免费
- 流式输出(边合成边播放)
- 低延迟
"""
import os
import asyncio
import edge_tts
from typing import AsyncGenerator, Optional
# Default voice; can be overridden with the EDGE_TTS_VOICE environment variable.
DEFAULT_VOICE = os.environ.get("EDGE_TTS_VOICE", "zh-CN-XiaoxiaoNeural")
# Speaking-rate adjustment ("+0%", "+10%", "-10%", ...); overridable via EDGE_TTS_RATE.
DEFAULT_RATE = os.environ.get("EDGE_TTS_RATE", "+0%")
# Volume adjustment; overridable via EDGE_TTS_VOLUME.
DEFAULT_VOLUME = os.environ.get("EDGE_TTS_VOLUME", "+0%")
async def text_to_speech_stream(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    volume: str = DEFAULT_VOLUME,
) -> AsyncGenerator[bytes, None]:
    """Synthesize ``text`` with Edge TTS and yield the audio as it arrives.

    Args:
        text: Text to synthesize. Empty or whitespace-only text yields nothing.
        voice: Voice name passed to ``edge_tts.Communicate``.
        rate: Speaking-rate adjustment string (e.g. "+0%").
        volume: Volume adjustment string (e.g. "+0%").

    Yields:
        The ``data`` payload of every stream chunk whose ``type`` is
        ``"audio"`` (MP3 audio data); other chunk types are skipped.

    Note:
        This is best-effort: any ``Exception`` raised while creating the
        communicator or streaming is printed and ends the stream instead of
        propagating to the caller.
    """
    has_content = bool(text) and bool(text.strip())
    if not has_content:
        return

    try:
        synthesizer = edge_tts.Communicate(
            text=text,
            voice=voice,
            rate=rate,
            volume=volume,
        )
        async for event in synthesizer.stream():
            # Only audio events carry payload bytes we want to forward.
            if event["type"] != "audio":
                continue
            yield event["data"]
    except Exception as e:
        print(f"[EdgeTTS] 合成失败: {e}")
async def text_to_speech(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    volume: str = DEFAULT_VOLUME,
) -> bytes:
    """Synthesize ``text`` and return the whole audio as one bytes object.

    Drains ``text_to_speech_stream`` and concatenates everything it yields.

    Args:
        text: Text to synthesize.
        voice: Voice name.
        rate: Speaking-rate adjustment string.
        volume: Volume adjustment string.

    Returns:
        The concatenated MP3 audio data; ``b""`` when the stream yields
        nothing.
    """
    stream = text_to_speech_stream(text, voice, rate, volume)
    pieces = [piece async for piece in stream]
    return b"".join(pieces)
async def text_to_speech_pcm(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    target_sample_rate: int = 16000,
    volume: str = DEFAULT_VOLUME,
) -> bytes:
    """Synthesize ``text`` and return it as mono 16-bit PCM for direct playback.

    Args:
        text: Text to synthesize.
        voice: Voice name.
        rate: Speaking-rate adjustment string.
        target_sample_rate: Sample rate (Hz) of the returned PCM data.
        volume: Volume adjustment string. Added as a trailing parameter so
            the PCM path offers the same control as ``text_to_speech``
            without breaking existing positional callers.

    Returns:
        Raw PCM16 mono audio data, or ``b""`` when synthesis produced no
        audio or the MP3 -> PCM conversion failed (the failure is printed).

    Raises:
        ImportError: If ``pydub`` is not installed.
    """
    # Imported lazily so pydub is only required when PCM output is used.
    import io

    from pydub import AudioSegment

    # Fetch the complete MP3 first; it has to be decoded as a whole.
    mp3_data = await text_to_speech(text, voice, rate, volume)
    if not mp3_data:
        return b""

    def _decode() -> bytes:
        """Decode the MP3 and convert it to mono 16-bit PCM."""
        audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
        audio = audio.set_frame_rate(target_sample_rate)
        audio = audio.set_channels(1)  # mono
        audio = audio.set_sample_width(2)  # 16-bit
        return audio.raw_data

    try:
        # The decode is synchronous; run it in the default executor so it
        # does not stall the event loop while other coroutines are waiting.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _decode)
    except Exception as e:
        print(f"[EdgeTTS] PCM 转换失败: {e}")
        return b""
# Characters that end a segment: the fullwidth (CJK) full stop, comma,
# exclamation mark, question mark, semicolon and colon, plus their ASCII
# counterparts. Written as escapes so the fullwidth forms cannot be silently
# normalised to ASCII look-alikes.
_SEGMENT_BREAKS = frozenset(
    "\u3002\uff0c\uff01\uff1f\uff1b\uff1a"
    ",.!?;:"
)


def _split_into_segments(text: str) -> list:
    """Split ``text`` after each punctuation mark into speakable segments.

    A "." or "," sitting between two digits (e.g. "3.5", "1,000") is treated
    as part of the number rather than as a break. Segments are stripped, and
    segments made up only of punctuation/whitespace are dropped because
    there is nothing in them to synthesize.
    """
    pieces = []
    start = 0
    last_index = len(text) - 1
    for index, char in enumerate(text):
        if char not in _SEGMENT_BREAKS:
            continue
        inside_number = (
            char in ".,"
            and 0 < index < last_index
            and text[index - 1].isdigit()
            and text[index + 1].isdigit()
        )
        if inside_number:
            continue
        pieces.append(text[start:index + 1])
        start = index + 1
    pieces.append(text[start:])

    segments = []
    for piece in pieces:
        piece = piece.strip()
        if any(c not in _SEGMENT_BREAKS and not c.isspace() for c in piece):
            segments.append(piece)
    return segments


async def text_to_speech_pcm_stream(
    text: str,
    voice: str = DEFAULT_VOICE,
    rate: str = DEFAULT_RATE,
    target_sample_rate: int = 16000,
    volume: str = DEFAULT_VOLUME,
) -> AsyncGenerator[bytes, None]:
    """Synthesize ``text`` segment by segment, yielding mono 16-bit PCM.

    Because the MP3 output has to be decoded, the text is split at
    punctuation marks and each segment is synthesized and decoded on its own,
    so the first audio is available before the whole text is processed.

    Args:
        text: Text to synthesize. Empty or ``None`` yields nothing.
        voice: Voice name.
        rate: Speaking-rate adjustment string.
        target_sample_rate: Sample rate (Hz) of the yielded PCM data.
        volume: Volume adjustment string. Added as a trailing parameter so
            existing positional callers keep working.

    Yields:
        Raw PCM16 mono audio data, one block per text segment. A segment
        whose synthesis or decoding fails is reported with a printed message
        and skipped; the remaining segments are still processed.

    Raises:
        ImportError: If ``pydub`` is not installed.
    """
    # Imported lazily so pydub is only required when PCM output is used.
    import io

    from pydub import AudioSegment

    if not text:
        return

    def _decode(mp3_data: bytes) -> bytes:
        """Decode one MP3 blob and convert it to mono 16-bit PCM."""
        audio = AudioSegment.from_mp3(io.BytesIO(mp3_data))
        audio = audio.set_frame_rate(target_sample_rate)
        audio = audio.set_channels(1)
        audio = audio.set_sample_width(2)
        return audio.raw_data

    loop = asyncio.get_running_loop()
    for segment in _split_into_segments(text):
        try:
            mp3_data = await text_to_speech(segment, voice, rate, volume)
            if not mp3_data:
                continue
            # The decode is synchronous; keep it off the event loop.
            pcm = await loop.run_in_executor(None, _decode, mp3_data)
        except Exception as e:
            print(f"[EdgeTTS] 分段合成失败: {e}")
            continue
        # Yield outside the try so errors raised by the consumer are not
        # mistaken for synthesis failures.
        yield pcm
# Voice list (commonly used Chinese voices)
CHINESE_VOICES = [
    "zh-CN-XiaoxiaoNeural", # female, natural
    "zh-CN-YunxiNeural", # male, natural
    "zh-CN-XiaoyiNeural", # female, lively
    "zh-CN-YunjianNeural", # male, broadcast style
    "zh-CN-XiaochenNeural", # female, gentle
]
async def list_voices() -> list:
    """Return the available Edge TTS voices whose locale starts with "zh".

    Fetches the full list from ``edge_tts.list_voices()`` and keeps only the
    entries whose ``"Locale"`` value begins with ``"zh"``.
    """
    chinese_voices = []
    for voice_info in await edge_tts.list_voices():
        if voice_info["Locale"].startswith("zh"):
            chinese_voices.append(voice_info)
    return chinese_voices