fix: 修复流式播放无声音问题(SSE 行缓冲 + 解析层级)

This commit is contained in:
2026-05-09 04:06:47 +08:00
parent ceeb6a3c31
commit d34ebd147e
15 changed files with 679 additions and 458 deletions

View File

@@ -150,13 +150,15 @@ async fn run(cli: Cli) -> Result<()> {
// 守护进程模式(由 daemon start -d 自动调用)
daemon::start_daemon(port).await
}
Some(Commands::Send { text, voice, format, style, port }) => {
Some(Commands::Send { text, voice, format, style, stream, port }) => {
// 发送文本到守护进程
let stream_opt = if stream { Some(true) } else { None };
client::send_to_daemon(
&text,
voice.as_deref(),
format.as_deref(),
style.as_deref(),
stream_opt,
port,
)
.await
@@ -178,7 +180,21 @@ async fn run(cli: Cli) -> Result<()> {
));
}
// 执行语音合成
// 流式播放走独立路径:边下边播,不等待全量下载
if cli.stream && cli.play {
ui::show_playback_start();
handle_stream_play(
cli.text,
cli.file,
&cli.voice,
cli.style.as_deref(),
)
.await?;
ui::show_playback_complete();
return Ok(());
}
// 执行语音合成(非流式播放场景:全量下载后输出/播放)
let audio_data = synthesize(
cli.text,
cli.file,
@@ -191,15 +207,7 @@ async fn run(cli: Cli) -> Result<()> {
// 根据参数决定处理方式
if cli.play {
// 播放音频(流式数据需要封装成 WAV 格式)
ui::show_playback_start();
if cli.stream {
// 流式返回的是 PCM16 原始数据,需要添加 WAV 头
let wav_data = pcm16_to_wav(&audio_data);
play_audio(&wav_data)?;
} else {
play_audio(&audio_data)?;
}
play_audio(&audio_data)?;
ui::show_playback_complete();
} else if let Some(output_path) = cli.output {
// 保存到文件
@@ -488,6 +496,128 @@ fn play_audio(data: &[u8]) -> Result<()> {
Ok(())
}
/// 流式播放:边下载 PCM16 chunk 边播放
///
/// 先缓冲约 1 秒数据再开始播放,抗网络抖动
/// 每收到一块就解码为 i16 采样,追加到 rodio Sink 队列
async fn handle_stream_play(
text: Option<String>,
file: Option<std::path::PathBuf>,
voice: &str,
style: Option<&str>,
) -> Result<()> {
let content = if let Some(t) = text {
tone::apply_tone(&t)
} else if let Some(f) = file {
let mut file = fs::File::open(&f)
.with_context(|| format!("无法打开文件: {:?}", f))?;
let mut content = String::new();
file.read_to_string(&mut content)
.with_context(|| format!("无法读取文件: {:?}", f))?;
tone::apply_tone(&content)
} else {
return Err(anyhow::anyhow!("没有提供文本内容"));
};
let validated_voice = validate_voice(voice);
let config_manager = ConfigManager::new()
.context("无法加载配置")?;
let config = config_manager.get_config();
if config.api_key.is_empty() {
return Err(anyhow::anyhow!(
"API Key 未设置\n请使用: mimo-tts config set --api-key <YOUR_API_KEY>"
));
}
let client = api::TtsClient::builder()
.base_url(config.base_url.clone())
.api_key(config.api_key.clone())
.build()
.context("无法创建 TTS 客户端")?;
let mut builder = api::TtsRequest::builder()
.audio(api::AudioConfig {
format: "pcm16".to_string(),
voice: validated_voice,
});
if let Some(s) = style {
builder = builder.add_message(api::Message {
role: "user".to_string(),
content: s.to_string(),
});
}
builder = builder.add_message(api::Message {
role: "assistant".to_string(),
content,
});
builder = builder.stream(true);
let request = builder.build();
let (_stream, stream_handle) = rodio::OutputStream::try_default()
.context("无法创建音频输出流")?;
let sink = std::sync::Arc::new(rodio::Sink::try_new(&stream_handle)
.context("无法创建音频播放器")?);
let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<Vec<u8>>();
// 接收任务:先缓冲 ~1 秒 PCM16 再开始播放,后续逐块追加
let play_sink = sink.clone();
let play_handle = tokio::spawn(async move {
let mut buffer = Vec::new();
// 24000Hz * 16bit * 1ch = 48000 字节/秒
let threshold = 48000;
let mut started = false;
while let Some(chunk) = rx.recv().await {
if !started {
buffer.extend_from_slice(&chunk);
if buffer.len() >= threshold {
let samples: Vec<i16> = buffer.chunks(2)
.filter(|c| c.len() == 2)
.map(|c| i16::from_le_bytes([c[0], c[1]]))
.collect();
if !samples.is_empty() {
play_sink.append(rodio::buffer::SamplesBuffer::new(1, 24000, samples));
}
buffer.clear();
started = true;
}
} else {
let samples: Vec<i16> = chunk.chunks(2)
.filter(|c| c.len() == 2)
.map(|c| i16::from_le_bytes([c[0], c[1]]))
.collect();
if !samples.is_empty() {
play_sink.append(rodio::buffer::SamplesBuffer::new(1, 24000, samples));
}
}
}
// 刷新剩余缓冲(文本较短下次未达到阈值)
if !buffer.is_empty() {
let samples: Vec<i16> = buffer.chunks(2)
.filter(|c| c.len() == 2)
.map(|c| i16::from_le_bytes([c[0], c[1]]))
.collect();
if !samples.is_empty() {
play_sink.append(rodio::buffer::SamplesBuffer::new(1, 24000, samples));
}
}
});
client.synthesize_stream_to_channel(&request, tx).await?;
play_handle.await.context("流式播放任务异常")?;
sink.sleep_until_end();
Ok(())
}
/// 将 PCM16 原始数据转换为 WAV 格式
///
/// # 参数
@@ -495,6 +625,7 @@ fn play_audio(data: &[u8]) -> Result<()> {
///
/// # 返回
/// 完整的 WAV 格式数据(包含 44 字节头部)
#[allow(dead_code)]
fn pcm16_to_wav(pcm_data: &[u8]) -> Vec<u8> {
let sample_rate: u32 = 24000; // Mimo-TTS PCM16 输出通常是 24kHz
let bits_per_sample: u16 = 16;