Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion assemblyai/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.64.16"
__version__ = "0.64.20"
1 change: 1 addition & 0 deletions assemblyai/streaming/v3/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ class SpeechModel(str, Enum):
u3_rt_pro = "u3-rt-pro"
u3_rt_pro_beta_1 = "u3-rt-pro-beta-1"
whisper_rt = "whisper-rt"
universal_3_5_pro = "universal-3-5-pro"
u3_pro = "u3-pro" # Deprecated: Use u3_rt_pro instead

def __str__(self):
Expand Down
12 changes: 8 additions & 4 deletions assemblyai/sync_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ def _error_from_response(response: httpx.Response) -> types.SyncTranscriptError:
"""
Builds a `SyncTranscriptError` from a non-200 response.

The service uses two error envelopes: `{"error_code", "message"}` for
audio/capacity/inference errors and `{"detail"}` for auth and rate-limit
errors. Parse by status code, not by assuming `error_code` is present.
The service returns an RFC 9457 problem-details envelope
(`{"status", "title", "detail"}`); `error_code` is the snake_cased
`title` (e.g. `"Audio Too Large"` -> `audio_too_large`). Older envelopes
(`{"error_code", "message"}` and `{"detail"}`) are still accepted.
"""
error_code: Optional[str] = None
message: Optional[str] = None
Expand All @@ -24,7 +25,10 @@ def _error_from_response(response: httpx.Response) -> types.SyncTranscriptError:
body = response.json()
if isinstance(body, dict):
error_code = body.get("error_code")
message = body.get("message") or body.get("detail")
title = body.get("title")
if error_code is None and isinstance(title, str) and title:
error_code = title.lower().replace(" ", "_")
message = body.get("detail") or body.get("message")
except Exception:
message = response.text or None

Expand Down
47 changes: 28 additions & 19 deletions assemblyai/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,10 @@ class SyncTranscriptError(AssemblyAIError):
"""
Error raised when a synchronous transcription request fails.

Carries the server's machine-readable `error_code` (e.g. `bad_audio`,
`audio_too_large`, `capacity_exceeded`, `inference_timeout`) when present,
and `retry_after` (seconds) for 429/503 responses that include a
Carries a machine-readable `error_code` — the snake_cased problem-details
`title` from the server (e.g. `bad_audio`, `audio_too_large`,
`capacity_exceeded`, `inference_timeout`) — when present, and
`retry_after` (seconds) for 429/503 responses that include a
`Retry-After` header.
"""

Expand Down Expand Up @@ -2981,16 +2982,21 @@ class LemurPurgeResponse(BaseModel):
"The result of the LeMUR purge request"


# Caps mirror the sync service's `config` part so an oversized request fails
# locally with a clear message instead of a 400 round trip.
# Caps mirror the sync service's `config` part. `prompt` and `word_boost`
# over their caps are rejected; `conversation_context` over its caps is
# trimmed (oldest turns first), matching the server.
_SYNC_MAX_PROMPT_LEN = 4096
_SYNC_MAX_WORD_BOOST_LEN = 2048
_SYNC_MAX_CONVERSATION_CONTEXT_TURNS = 100
_SYNC_MAX_CONVERSATION_CONTEXT_LEN = 4096


def _normalize_conversation_context(v):
"""Coerce a single string to a one-turn list, strip + drop empties, cap.
"""Coerce a single string to a one-turn list, strip + drop empties, trim.

Context over the turn-count or character caps is trimmed by dropping the
oldest turns (front of the list) first — the same trim the server applies
— so the most recent turn is kept whenever it fits on its own.

Shared by the pydantic v1 and v2 validators on ``SyncTranscriptionConfig``.
"""
Expand All @@ -2999,17 +3005,13 @@ def _normalize_conversation_context(v):
if isinstance(v, str):
v = [v]
turns = [t.strip() for t in v if t and t.strip()]
if len(turns) > _SYNC_MAX_CONVERSATION_CONTEXT_TURNS:
raise ValueError(
f"conversation_context exceeds {_SYNC_MAX_CONVERSATION_CONTEXT_TURNS} "
f"turns (got {len(turns)})"
)
total = sum(len(t) for t in turns)
if total > _SYNC_MAX_CONVERSATION_CONTEXT_LEN:
raise ValueError(
f"conversation_context exceeds {_SYNC_MAX_CONVERSATION_CONTEXT_LEN} "
f"characters (got {total})"
)
while turns and (
len(turns) > _SYNC_MAX_CONVERSATION_CONTEXT_TURNS
or total > _SYNC_MAX_CONVERSATION_CONTEXT_LEN
):
total -= len(turns[0])
turns = turns[1:]
return turns or None


Expand Down Expand Up @@ -3045,9 +3047,11 @@ class SyncTranscriptionConfig(BaseModel):
audio so it transcribes the clip with better continuity and proper-noun
consistency. Include turns from either side of the conversation (e.g. a
voice agent's replies) as separate entries; entries carry no speaker labels.
A single string is accepted and treated as one turn. Max 100 turns / 4096
characters total; when the prompt exceeds the model token budget the oldest
turns are dropped first, so put the most recent turn last."""
A single string is accepted and treated as one turn. Capped at 100 turns /
4096 characters total — over-cap context is trimmed (oldest turns dropped
first), not rejected, and the oldest turns are likewise dropped first when
the prompt exceeds the model token budget, so put the most recent turn
last."""

language_code: Optional[Union[str, List[str]]] = None
"""ISO 639-1 language code, or a list of codes for multilingual audio (e.g.
Expand Down Expand Up @@ -3118,3 +3122,8 @@ class SyncTranscriptResponse(BaseModel):

session_id: str
"Server-generated UUID for this request. Record it to correlate with support."

request_time_ms: Optional[float] = None
"""End-to-end server-side request time in milliseconds: queue wait, auth,
multipart parse, decode, inference, and serialization. ``None`` when the
server predates the field."""
29 changes: 29 additions & 0 deletions tests/unit/test_streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,35 @@ def mocked_websocket_connect(
assert "speech_model=whisper-rt" in actual_url


def test_client_connect_with_universal_3_5_pro(mocker: MockFixture):
    """Connecting with ``SpeechModel.universal_3_5_pro`` puts its wire value in the URL."""
    actual_url = None

    def mocked_websocket_connect(
        url: str, additional_headers: dict, open_timeout: float
    ):
        # Stand-in for the real connect call: only remember the requested URL.
        nonlocal actual_url
        actual_url = url

    mocker.patch(
        "assemblyai.streaming.v3.client.websocket_connect",
        new=mocked_websocket_connect,
    )
    _disable_rw_threads(mocker)

    # Given a client pointed at a fake host
    client = StreamingClient(
        StreamingClientOptions(api_key="test", api_host="api.example.com")
    )

    # When connecting with the universal-3-5-pro speech model
    client.connect(
        StreamingParameters(
            sample_rate=16000,
            speech_model=SpeechModel.universal_3_5_pro,
        )
    )

    # Then the model's string value appears as a query parameter
    assert "speech_model=universal-3-5-pro" in actual_url


def test_turn_event_with_speaker_label():
data = {
"type": "Turn",
Expand Down
105 changes: 95 additions & 10 deletions tests/unit/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"confidence": 0.92,
"audio_duration_ms": 400,
"session_id": "eb92c4ff-4bbb-429f-9b99-7279d7fe738f",
"request_time_ms": 243.7,
}


Expand All @@ -43,6 +44,24 @@ def test_transcribe_bytes_parses_response(httpx_mock: HTTPXMock):
assert result.words[0].start == 0
assert result.words[0].end == 200
assert result.words[1].text == "world"
assert result.request_time_ms == 243.7


def test_transcribe_parses_response_without_request_time(httpx_mock: HTTPXMock):
    """A response lacking ``request_time_ms`` still parses, leaving the field ``None``."""
    # Given a server response that predates the request_time_ms field
    legacy_payload = dict(_OK_RESPONSE)
    legacy_payload.pop("request_time_ms", None)
    httpx_mock.add_response(
        url=TRANSCRIBE_URL,
        method="POST",
        status_code=httpx.codes.OK,
        json=legacy_payload,
    )

    # When transcribing
    transcript = aai.SyncTranscriber().transcribe(b"RIFFfake-wav-bytes")

    # Then request_time_ms is None instead of a parse failure
    assert transcript.request_time_ms is None


def test_transcribe_sends_model_header_and_wav_part(httpx_mock: HTTPXMock):
Expand Down Expand Up @@ -124,11 +143,29 @@ def test_transcribe_coerces_conversation_context_string(httpx_mock: HTTPXMock):
assert b'"Sure, what date were you thinking?"' in body


def test_conversation_context_rejects_too_many_chars():
# Given conversation_context whose total length exceeds the cap,
# When/Then constructing the config raises a validation error
with pytest.raises(Exception):
aai.SyncTranscriptionConfig(conversation_context=["a" * 5000])
def test_conversation_context_trims_oldest_turns_over_char_cap():
    """Two 3000-character turns exceed the character cap; only the newer one is kept."""
    # Given conversation_context whose total length exceeds the 4096-char cap
    older_turn = "a" * 3000
    newer_turn = "b" * 3000
    config = aai.SyncTranscriptionConfig(
        conversation_context=[older_turn, newer_turn]
    )

    # Then the oldest turn is dropped and the most recent turn is kept
    assert config.conversation_context == [newer_turn]


def test_conversation_context_trims_oldest_turns_over_turn_cap():
    """120 short turns are cut down to the last 100, dropping from the front."""
    # Given conversation_context with more than 100 turns
    all_turns = []
    for i in range(120):
        all_turns.append(f"turn {i}")
    config = aai.SyncTranscriptionConfig(conversation_context=all_turns)

    # Then it is trimmed to the 100 most recent turns, oldest dropped first
    expected_turns = all_turns[20:]
    assert config.conversation_context == expected_turns


def test_conversation_context_empties_when_single_turn_over_char_cap():
    """A lone turn longer than the character cap yields no context at all."""
    # Given a single turn that alone exceeds the character cap
    oversized_turn = "a" * 5000
    config = aai.SyncTranscriptionConfig(conversation_context=[oversized_turn])

    # Then the context trims to nothing rather than raising
    assert config.conversation_context is None


def test_transcribe_sends_single_language_code(httpx_mock: HTTPXMock):
Expand Down Expand Up @@ -226,8 +263,32 @@ def test_word_boost_too_long_raises():
aai.SyncTranscriptionConfig(word_boost=["x" * 3000])


def test_error_envelope_maps_to_sync_transcript_error(httpx_mock: HTTPXMock):
# Given the server rejects oversized audio
def test_problem_details_envelope_maps_to_sync_transcript_error(
    httpx_mock: HTTPXMock,
):
    """A problem-details 413 body surfaces as a ``SyncTranscriptError``."""
    # Given the server rejects oversized audio with a problem-details body
    problem_body = {"status": 413, "title": "Audio Too Large", "detail": "too long"}
    httpx_mock.add_response(
        url=TRANSCRIBE_URL,
        method="POST",
        status_code=413,
        json=problem_body,
    )

    # When transcribing, Then a SyncTranscriptError carries the snake_cased
    # title as error_code, plus the status and detail
    with pytest.raises(aai.SyncTranscriptError) as raised:
        aai.SyncTranscriber().transcribe(b"RIFFfake-wav-bytes")

    failure = raised.value
    assert failure.status_code == 413
    assert failure.error_code == "audio_too_large"
    assert "too long" in str(failure)


def test_legacy_error_envelope_maps_to_sync_transcript_error(
httpx_mock: HTTPXMock,
):
# Given a server still on the pre-problem-details envelope
httpx_mock.add_response(
url=TRANSCRIBE_URL,
method="POST",
Expand All @@ -251,20 +312,44 @@ def test_rate_limit_surfaces_retry_after(httpx_mock: HTTPXMock):
url=TRANSCRIBE_URL,
method="POST",
status_code=429,
json={"detail": "Too many requests"},
json={
"status": 429,
"title": "Too Many Requests",
"detail": "Too many requests",
},
headers={"Retry-After": "5"},
)

# When transcribing, Then retry_after is parsed and error_code is absent
# When transcribing, Then retry_after and the snake_cased title are parsed
with pytest.raises(aai.SyncTranscriptError) as exc_info:
aai.SyncTranscriber().transcribe(b"RIFFfake-wav-bytes")

error = exc_info.value
assert error.status_code == 429
assert error.error_code is None
assert error.error_code == "too_many_requests"
assert error.retry_after == 5


def test_legacy_detail_only_envelope(httpx_mock: HTTPXMock):
    """A body holding only ``detail`` gives a message but no ``error_code``."""
    # Given an auth-style body with only a detail field
    detail_only_body = {"detail": "Invalid API key"}
    httpx_mock.add_response(
        url=TRANSCRIBE_URL,
        method="POST",
        status_code=401,
        json=detail_only_body,
    )

    # When transcribing, Then the detail becomes the message and error_code
    # stays absent
    with pytest.raises(aai.SyncTranscriptError) as raised:
        aai.SyncTranscriber().transcribe(b"RIFFfake-wav-bytes")

    failure = raised.value
    assert failure.status_code == 401
    assert failure.error_code is None
    assert "Invalid API key" in str(failure)


def test_default_model_is_u3_sync_pro():
# Given a default config
# When inspecting the model
Expand Down
Loading