diff --git a/src/agents/voice/result.py b/src/agents/voice/result.py
index 511c8e6e7d..c41f3684dc 100644
--- a/src/agents/voice/result.py
+++ b/src/agents/voice/result.py
@@ -201,7 +201,7 @@ async def _add_text(self, text: str):
 
             combined_sentences, self._text_buffer = self.tts_settings.text_splitter(self._text_buffer)
 
-            if len(combined_sentences) >= 20:
+            if combined_sentences:
                 local_queue: asyncio.Queue[VoiceStreamEvent | None] = asyncio.Queue()
                 self._ordered_tasks.append(local_queue)
                 self._tasks.append(
@@ -220,6 +220,10 @@ async def _turn_done(self):
                 )
             )
             self._text_buffer = ""
+        elif self._started_processing_turn:
+            local_queue = asyncio.Queue()
+            self._ordered_tasks.append(local_queue)
+            await local_queue.put(VoiceStreamEventLifecycle(event="turn_ended"))
         self._done_processing = True
         if self._dispatcher_task is None:
             self._dispatcher_task = asyncio.create_task(self._dispatch_audio())
diff --git a/tests/voice/test_pipeline.py b/tests/voice/test_pipeline.py
index 7bc46279ad..f92a857dba 100644
--- a/tests/voice/test_pipeline.py
+++ b/tests/voice/test_pipeline.py
@@ -82,6 +82,64 @@ async def run(self, text: str, settings: TTSModelSettings):
     assert audio_chunks == [np.array([1], dtype=np.int16).tobytes()]
 
 
+@pytest.mark.asyncio
+async def test_streamed_audio_result_synthesizes_short_custom_splitter_chunk() -> None:
+    texts: list[str] = []
+
+    class RecordingTTS(FakeTTS):
+        async def run(self, text: str, settings: TTSModelSettings):
+            texts.append(text)
+            yield np.zeros(2, dtype=np.int16).tobytes()
+
+    def split_immediately(text: str) -> tuple[str, str]:
+        return text, ""
+
+    result = StreamedAudioResult(
+        RecordingTTS(),
+        TTSModelSettings(buffer_size=1, text_splitter=split_immediately),
+        VoicePipelineConfig(),
+    )
+
+    await result._add_text("ok")
+    await result._turn_done()
+    await result._done()
+
+    events, audio_chunks = await extract_events(result)
+
+    assert texts == ["ok"]
+    assert events == ["turn_started", "audio", "turn_ended", "session_ended"]
+    assert audio_chunks == [np.zeros(2, dtype=np.int16).tobytes()]
+
+
+@pytest.mark.asyncio
+async def test_streamed_audio_result_ignores_empty_custom_splitter_chunk() -> None:
+    texts: list[str] = []
+
+    class RecordingTTS(FakeTTS):
+        async def run(self, text: str, settings: TTSModelSettings):
+            texts.append(text)
+            yield np.zeros(2, dtype=np.int16).tobytes()
+
+    def discard_text(_text: str) -> tuple[str, str]:
+        return "", ""
+
+    result = StreamedAudioResult(
+        RecordingTTS(),
+        TTSModelSettings(buffer_size=1, text_splitter=discard_text),
+        VoicePipelineConfig(),
+    )
+
+    await result._add_text("ok")
+    await result._turn_done()
+    await result._done()
+
+    events, audio_chunks = await extract_events(result)
+
+    assert texts == []
+    assert events == ["turn_started", "turn_ended", "session_ended"]
+    assert audio_chunks == []
+
+
 @pytest.mark.asyncio
 async def test_voicepipeline_run_single_turn() -> None:
     # Single turn. Should produce a single audio output, which is the TTS output for "out_1".