Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,18 +44,18 @@
public class VoiceLiveAudioFormatTests extends VoiceLiveTestBase {

static Stream<Arguments> modelAndSamplingRateProvider() {
return withApiVersions(Stream.of(Arguments.of("gpt-4o-realtime", 16000), Arguments.of("gpt-4o-realtime", 44100),
Arguments.of("gpt-4o-realtime", 8000), Arguments.of("gpt-4o", 16000), Arguments.of("gpt-4o", 44100),
return withApiVersions(Stream.of(Arguments.of("gpt-realtime", 16000), Arguments.of("gpt-realtime", 44100),
Arguments.of("gpt-realtime", 8000), Arguments.of("gpt-4o", 16000), Arguments.of("gpt-4o", 44100),
Arguments.of("gpt-4.1", 8000)), API_VERSION_GA, API_VERSION_PREVIEW);
}

static Stream<Arguments> modelAndInputAudioFormatProvider() {
return withApiVersions(Stream.of(Arguments.of("gpt-4o", "g711_ulaw", "azure_semantic_vad"),
Arguments.of("gpt-4o", "g711_alaw", "azure_semantic_vad"),
Arguments.of("gpt-4o-realtime-preview", "g711_ulaw", "azure_semantic_vad"),
Arguments.of("gpt-4o-realtime-preview", "g711_ulaw", "server_vad"),
Arguments.of("gpt-4o-realtime-preview", "g711_alaw", "azure_semantic_vad"),
Arguments.of("gpt-4o-realtime-preview", "g711_alaw", "server_vad")));
Arguments.of("gpt-realtime", "g711_ulaw", "azure_semantic_vad"),
Arguments.of("gpt-realtime", "g711_ulaw", "server_vad"),
Arguments.of("gpt-realtime", "g711_alaw", "azure_semantic_vad"),
Arguments.of("gpt-realtime", "g711_alaw", "server_vad")));
}

static Stream<Arguments> modelAndOutputAudioFormatAzureVoiceProvider() {
Expand All @@ -67,8 +67,8 @@ static Stream<Arguments> modelAndOutputAudioFormatAzureVoiceProvider() {
}

static Stream<Arguments> modelAndOutputAudioFormatOpenAIVoiceProvider() {
return withApiVersions(Stream.of(Arguments.of("gpt-4o-realtime", "pcm16"),
Arguments.of("gpt-4o-realtime", "g711_ulaw"), Arguments.of("gpt-4o-realtime", "g711_alaw")));
return withApiVersions(Stream.of(Arguments.of("gpt-realtime", "pcm16"),
Arguments.of("gpt-realtime", "g711_ulaw"), Arguments.of("gpt-realtime", "g711_alaw")));
}

@ParameterizedTest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
public class VoiceLiveAudioTests extends VoiceLiveTestBase {

static Stream<Arguments> audioParams() {
return crossProduct(new String[] { "gpt-4o-realtime-preview", "gpt-4.1" },
return crossProduct(new String[] { "gpt-realtime", "gpt-4.1" },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

Expand Down Expand Up @@ -108,7 +108,7 @@ public void testRealtimeServiceWithAudio(String model, String apiVersion) throws
}

static Stream<Arguments> audioEnhancementsParams() {
return crossProduct(new String[] { "gpt-4o-realtime-preview", "gpt-4.1" },
return crossProduct(new String[] { "gpt-realtime", "gpt-4.1" },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

Expand Down Expand Up @@ -183,77 +183,7 @@ public void testRealtimeServiceWithAudioEnhancements(String model, String apiVer
}

static Stream<Arguments> echoCancellationParams() {
return crossProduct(new String[] { "gpt-4o-realtime-preview", "gpt-4.1" },
return crossProduct(new String[] { "gpt-realtime", "gpt-4.1" },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

/**
 * Sends a recorded utterance through a session whose options enable input-audio echo
 * cancellation, then checks that speech was detected and response audio was produced.
 *
 * <p>The session options carry the transcription setting returned by
 * {@code getSpeechRecognitionSetting(model)} and a no-argument {@link AudioEchoCancellation}.
 * After {@code 4-1.wav} and the trailing-silence bytes are sent, the test waits until the
 * first response audio delta, an error event, or a stream failure releases the latch. It
 * then requires more than one speech-started event and a non-zero count of response audio
 * bytes.</p>
 *
 * @param model model name used to start the session and to choose the transcription setting
 * @param apiVersion API version handed to {@code createClient}
 * @throws InterruptedException if waiting on the response latch is interrupted
 * @throws IOException propagated from the helpers this test calls (presumably audio loading)
 */
@ParameterizedTest
@MethodSource("echoCancellationParams")
@LiveOnly
public void testRealtimeServiceWithEchoCancellation(String model, String apiVersion)
    throws InterruptedException, IOException {
    VoiceLiveAsyncClient voiceLiveClient = createClient(apiVersion);
    byte[] utterance = loadAudioFile("4-1.wav");

    // Counters are updated from the event callback; the latch is released by the first
    // audio delta, error event, or stream failure.
    AtomicInteger speechStartCount = new AtomicInteger(0);
    AtomicInteger responseAudioByteCount = new AtomicInteger(0);
    CountDownLatch firstOutcome = new CountDownLatch(1);

    VoiceLiveSessionAsyncClient liveSession = null;
    Disposable eventSubscription = null;
    try {
        VoiceLiveSessionOptions echoCancellingOptions = new VoiceLiveSessionOptions()
            .setInputAudioTranscription(getSpeechRecognitionSetting(model))
            .setInputAudioEchoCancellation(new AudioEchoCancellation());

        liveSession = voiceLiveClient.startSession(model).block(SESSION_TIMEOUT);
        Assertions.assertNotNull(liveSession, "Session should be created successfully");

        eventSubscription = liveSession.receiveEvents().subscribe(event -> {
            ServerEventType kind = event.getType();

            if (kind == ServerEventType.INPUT_AUDIO_BUFFER_SPEECH_STARTED) {
                speechStartCount.incrementAndGet();
                return;
            }
            if (kind == ServerEventType.ERROR) {
                handleError(event);
                firstOutcome.countDown();
                return;
            }
            if (kind != ServerEventType.RESPONSE_AUDIO_DELTA) {
                // Every other event type is irrelevant to this test.
                return;
            }

            if (event instanceof SessionUpdateResponseAudioDelta) {
                SessionUpdateResponseAudioDelta deltaEvent = (SessionUpdateResponseAudioDelta) event;
                if (deltaEvent.getDelta() != null) {
                    responseAudioByteCount.addAndGet(deltaEvent.getDelta().length);
                }
            }
            firstOutcome.countDown();
        }, failure -> {
            System.err.println("Error receiving events: " + failure.getMessage());
            firstOutcome.countDown();
        });

        waitForSetup();

        // Apply the echo-cancellation options before any audio is sent.
        liveSession.sendEvent(new ClientEventSessionUpdate(echoCancellingOptions)).block(SEND_TIMEOUT);

        waitForSetup();

        liveSession.sendInputAudio(utterance).block(SEND_TIMEOUT);
        liveSession.sendInputAudio(getTrailingSilenceBytes()).block(SEND_TIMEOUT);

        Assertions.assertTrue(firstOutcome.await(EVENT_TIMEOUT_SECONDS, TimeUnit.SECONDS),
            "Should receive response within timeout");
        Assertions.assertTrue(speechStartCount.get() > 1,
            "Expected more than 1 speech segment, got " + speechStartCount.get());
        Assertions.assertTrue(responseAudioByteCount.get() > 0, "Audio bytes should be greater than 0");
    } finally {
        if (eventSubscription != null) {
            eventSubscription.dispose();
        }
        closeSession(liveSession);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
public class VoiceLiveConversationTests extends VoiceLiveTestBase {

static Stream<Arguments> retrieveItemParams() {
return crossProduct(new String[] { "gpt-4o-realtime" }, new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
return crossProduct(new String[] { "gpt-realtime" }, new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

@ParameterizedTest
Expand Down Expand Up @@ -132,7 +132,7 @@ public void testRealtimeServiceRetrieveItem(String model, String apiVersion)
}

static Stream<Arguments> truncateItemParams() {
return crossProduct(new String[] { "gpt-4o-realtime" }, new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
return crossProduct(new String[] { "gpt-realtime" }, new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

@ParameterizedTest
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ public abstract class VoiceLiveTestBase extends TestProxyTestBase {

// Model constants
protected static final String MODEL_GPT_4O = "gpt-4o";
protected static final String MODEL_GPT_4O_REALTIME = "gpt-4o-realtime";
protected static final String MODEL_GPT_4O_REALTIME_PREVIEW = "gpt-4o-realtime-preview";
protected static final String MODEL_GPT_4O_REALTIME_PREVIEW_2025_06_03 = "gpt-4o-realtime-preview-2025-06-03";
protected static final String MODEL_GPT_REALTIME = "gpt-realtime";
protected static final String MODEL_GPT_41 = "gpt-4.1";
protected static final String MODEL_GPT_5 = "gpt-5";
protected static final String MODEL_GPT_5_CHAT = "gpt-5-chat";
Expand All @@ -51,7 +49,7 @@ public abstract class VoiceLiveTestBase extends TestProxyTestBase {

// Default models for non-parameterized tests
protected static final String TEST_MODEL = MODEL_GPT_4O;
protected static final String TEST_MODEL_REALTIME = MODEL_GPT_4O_REALTIME_PREVIEW;
protected static final String TEST_MODEL_REALTIME = MODEL_GPT_REALTIME;

// Timeout constants
protected static final Duration SESSION_TIMEOUT = Duration.ofSeconds(30);
Expand Down Expand Up @@ -175,10 +173,9 @@ protected void handleError(SessionUpdate event) {
}

protected AudioInputTranscriptionOptions getSpeechRecognitionSetting(String model) {
AudioInputTranscriptionOptionsModel transcriptionModel
= model.startsWith("gpt-4o-realtime") || model.startsWith("gpt-4o-mini-realtime")
? AudioInputTranscriptionOptionsModel.WHISPER_1
: AudioInputTranscriptionOptionsModel.AZURE_SPEECH;
AudioInputTranscriptionOptionsModel transcriptionModel = model.startsWith("gpt-realtime")
? AudioInputTranscriptionOptionsModel.WHISPER_1
: AudioInputTranscriptionOptionsModel.AZURE_SPEECH;
return new AudioInputTranscriptionOptions(transcriptionModel).setLanguage("en-US");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@ public class VoiceLiveToolCallTests extends VoiceLiveTestBase {
private static final String API_VERSION_2025_05_01_PREVIEW = "2025-05-01-preview";

// ===== test_realtime_service_tool_call =====
// Python: models=[gpt-4o-realtime, gpt-4o], api_versions=[2025-10-01, 2026-01-01-preview]
// Python: models=[gpt-realtime, gpt-4o], api_versions=[2025-10-01, 2026-01-01-preview]
// Uses _get_speech_recognition_setting(model), audio=4-1.wav, tool=assess_pronunciation
// Voice: AzureStandardVoice("en-US-AriaNeural")

static Stream<Arguments> toolCallParams() {
return crossProduct(new String[] { MODEL_GPT_4O_REALTIME, MODEL_GPT_4O },
return crossProduct(new String[] { MODEL_GPT_REALTIME, MODEL_GPT_4O },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
Comment thread
xitzhang marked this conversation as resolved.
}

Expand Down Expand Up @@ -92,7 +92,7 @@ private void doTestRealtimeServiceToolCall(String model, String apiVersion)
List<SessionUpdateResponseFunctionCallArgumentsDelta> functionCallResults = new ArrayList<>();
CountDownLatch firstDeltaLatch = new CountDownLatch(1);
// Track response completions so we can re-issue response.create() if VAD
// triggered a non-tool-call response first (gpt-4o-realtime race condition).
// triggered a non-tool-call response first (gpt-realtime race condition).
CountDownLatch responseDoneLatch = new CountDownLatch(1);

VoiceLiveSessionAsyncClient session = null;
Expand Down Expand Up @@ -136,7 +136,7 @@ private void doTestRealtimeServiceToolCall(String model, String apiVersion)
session.sendEvent(new ClientEventSessionUpdate(sessionOptions)).block(SEND_TIMEOUT);

// Send audio and response.create() in tight succession to beat server VAD.
// With gpt-4o-realtime, the default server VAD detects speech, auto-commits the
// With gpt-realtime, the default server VAD detects speech, auto-commits the
// buffer and triggers its own response before a delayed response.create() arrives.
session.sendInputAudio(audioData)
.then(session.sendEvent(new ClientEventResponseCreate()))
Expand Down Expand Up @@ -406,7 +406,7 @@ public void testRealtimeServiceToolCallParameter(String model, String apiVersion
// Uses azure-speech + ServerVad, audio=ask_weather.wav

static Stream<Arguments> liveSessionUpdateParams() {
return crossProduct(new String[] { MODEL_GPT_4O_REALTIME },
return crossProduct(new String[] { MODEL_GPT_REALTIME },
new String[] { API_VERSION_2025_05_01_PREVIEW, API_VERSION_PREVIEW });
}

Expand Down Expand Up @@ -569,7 +569,6 @@ public void testRealtimeServiceLiveSessionUpdate(String model, String apiVersion
// Python: @pytest.mark.skip() - skipped in Python tests

static Stream<Arguments> toolCallNoAudioOverlapParams() {
return crossProduct(new String[] { MODEL_GPT_4O_REALTIME },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
return crossProduct(new String[] { MODEL_GPT_REALTIME }, new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
public class VoiceLiveTranscriptionTests extends VoiceLiveTestBase {

static Stream<Arguments> whisperTranscriptionParams() {
return crossProduct(new String[] { "gpt-4o-realtime-preview", "gpt-4.1" },
return crossProduct(new String[] { "gpt-realtime", "gpt-4.1" },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

Expand Down Expand Up @@ -117,7 +117,7 @@ static Stream<Arguments> gpt4oTranscribeParams() {
@LiveOnly
public void testInputAudioTranscriptionWithGpt4oTranscribe(String transcriptionModel, String apiVersion)
throws InterruptedException, IOException {
String model = "gpt-4o-realtime-preview";
String model = "gpt-realtime";
VoiceLiveAsyncClient client = createClient(apiVersion);

byte[] audioData = loadAudioFile("largest_lake.wav");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,13 @@
public class VoiceLiveTurnDetectionTests extends VoiceLiveTestBase {

// ===== test_realtime_service_with_turn_detection_long_tts_vad_duration =====
// Python: models=[gpt-4o-realtime-preview, gpt-4o], api_versions=[2025-10-01, 2026-01-01-preview]
// Python: models=[gpt-realtime, gpt-4o], api_versions=[2025-10-01, 2026-01-01-preview]
// turn_detection: {"type": "azure_semantic_vad", "speech_duration_assistant_speaking_ms": 800}
// Note: speechDurationAssistantSpeakingMs not available in Java SDK;
// using speechDurationMs(800) as the closest available parameter.

static Stream<Arguments> longTtsVadDurationParams() {
return crossProduct(new String[] { MODEL_GPT_4O_REALTIME_PREVIEW, MODEL_GPT_4O },
return crossProduct(new String[] { MODEL_GPT_REALTIME, MODEL_GPT_4O },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

Expand Down Expand Up @@ -135,8 +135,7 @@ private void doTestLongTtsVadDuration(String model, String apiVersion) throws In

static Stream<Arguments> multilingualParams() {
return withApiVersions(Stream.of(
Arguments.of("gpt-4o-realtime-preview, default", MODEL_GPT_4O_REALTIME_PREVIEW,
new AzureSemanticVadTurnDetectionMultilingual()),
Arguments.of("gpt-realtime, default", MODEL_GPT_REALTIME, new AzureSemanticVadTurnDetectionMultilingual()),
Arguments.of("gpt-4o, default", MODEL_GPT_4O, new AzureSemanticVadTurnDetectionMultilingual()),
Arguments.of("gpt-4o, speechDuration200", MODEL_GPT_4O,
new AzureSemanticVadTurnDetectionMultilingual().setSpeechDurationMs(200)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
public class VoiceLiveVoicePropertiesTests extends VoiceLiveTestBase {

static Stream<Arguments> voicePropertiesParams() {
return crossProduct(new String[] { "gpt-4o-realtime", "gpt-4.1" },
return crossProduct(new String[] { "gpt-realtime", "gpt-4.1" },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

Expand Down Expand Up @@ -110,7 +110,7 @@ public void testRealtimeServiceWithVoiceProperties(String model, String apiVersi
}

static Stream<Arguments> audioTimestampAndVisemeParams() {
return crossProduct(new String[] { "gpt-4o-realtime-preview", "gpt-4.1" },
return crossProduct(new String[] { "gpt-realtime", "gpt-4.1" },
new String[] { API_VERSION_GA, API_VERSION_PREVIEW });
}

Expand Down
Loading