diff --git a/.changeset/modality-aware-instructions.md b/.changeset/modality-aware-instructions.md new file mode 100644 index 000000000..17de7df2a --- /dev/null +++ b/.changeset/modality-aware-instructions.md @@ -0,0 +1,12 @@ +--- +'@livekit/agents': patch +--- + +feat(agents): add modality-aware `Instructions` with audio/text variants + +Introduce a new `Instructions` class for system prompts that adapt to the +user's input modality. The pipeline now applies the matching variant before +each LLM turn based on `SpeechHandle.inputDetails.modality`, and +`AgentSession.generateReply()` and `AgentSession.run()` expose an +`inputModality` option. `Instructions.tpl` supports JS-native prompt +composition while preserving audio/text variants. diff --git a/agents/src/llm/chat_context.test.ts b/agents/src/llm/chat_context.test.ts index 21f93620a..849469e55 100644 --- a/agents/src/llm/chat_context.test.ts +++ b/agents/src/llm/chat_context.test.ts @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 import { describe, expect, it } from 'vitest'; import { initializeLogger } from '../log.js'; +import { INSTRUCTIONS_MESSAGE_ID, applyInstructionsModality } from '../voice/generation.js'; import { FakeLLM } from '../voice/testing/fake_llm.js'; import { type AudioContent, @@ -12,7 +13,11 @@ import { FunctionCall, FunctionCallOutput, type ImageContent, + Instructions, ReadonlyChatContext, + concatInstructions, + isInstructions, + renderInstructions, } from './chat_context.js'; initializeLogger({ pretty: false, level: 'error' }); @@ -434,7 +439,7 @@ describe('ChatContext._summarize', () => { item.extra?.is_summary !== true, ); expect(rawTailMessages).toHaveLength(4); - expect(rawTailMessages.map((item) => item.textContent)).toEqual([ + expect(rawTailMessages.map((item) => (item as ChatMessage).textContent)).toEqual([ 'Order #123', 'Found your order. Let me check the warranty.', 'Thanks.', @@ -1235,3 +1240,242 @@ describe('ChatContext.isEquivalent', () => { }); }); }); + +describe('Instructions', () => { + it('constructs from an object with audio and text variants', () => { + const instr = new Instructions({ audio: 'audio variant', text: 'text variant' }); + + expect(instr.audio).toBe('audio variant'); + expect(instr.text).toBe('text variant'); + expect(instr.value).toBe('audio variant'); + }); + + it('identifies Instructions with a type guard', () => { + const instr = new Instructions({ audio: 'audio variant', text: 'text variant' }); + + expect(isInstructions(instr)).toBe(true); + expect(isInstructions('audio variant')).toBe(false); + expect(isInstructions({ type: 'instructions', audio: 'audio variant' })).toBe(false); + }); + + it('tpl propagates Instructions interpolations into audio and text variants', () => { + const instr = Instructions.tpl`persona +${new Instructions({ audio: 'audio rules', text: 'text rules' })} +extra`; + + expect(instr).toBeInstanceOf(Instructions); + expect(instr.audio).toBe('persona\naudio rules\nextra'); + expect(instr.text).toBe('persona\ntext rules\nextra'); + expect(instr.value).toBe('persona\naudio rules\nextra'); + expect(instr.asModality('text').value).toBe('persona\ntext rules\nextra'); + }); + + it('tpl preserves audio-only interpolation as audio-only output', () => { + const instr = Instructions.tpl`prefix ${new Instructions({ audio: 'same' })} suffix`; + + expect(instr.toJSON()).toEqual({ type: 'instructions', audio: 'prefix same suffix' }); + expect(instr.audio).toBe('prefix same suffix'); + expect(instr.text).toBe('prefix same suffix'); + }); + + it('tpl interpolates primitive values into both variants', () => { + const instr = Instructions.tpl`date=${'2026-05-13'} enabled=${true} count=${3}`; + + expect(instr.toJSON()).toEqual({ + type: 'instructions', + audio: 'date=2026-05-13 enabled=true count=3', + }); + expect(instr.audio).toBe('date=2026-05-13 enabled=true count=3'); + expect(instr.text).toBe('date=2026-05-13 enabled=true count=3'); + expect(instr.value).toBe('date=2026-05-13 enabled=true count=3'); + }); + + it('tpl combines multiple modality-aware interpolations', () => { + const instr = Instructions.tpl`${new Instructions({ audio: 'audio A', text: 'text A' })} / ${new Instructions({ audio: 'audio B', text: 'text B' })}`; + + expect(instr.audio).toBe('audio A / audio B'); + expect(instr.text).toBe('text A / text B'); + expect(instr.value).toBe('audio A / audio B'); + }); + + it('tpl preserves the current rendered value of resolved interpolations', () => { + const resolved = new Instructions({ audio: 'audio rules', text: 'text rules' }).asModality( + 'text', + ); + const instr = Instructions.tpl`prefix ${resolved} suffix`; + + expect(instr.audio).toBe('prefix audio rules suffix'); + expect(instr.text).toBe('prefix text rules suffix'); + expect(instr.value).toBe('prefix text rules suffix'); + }); + + it('tpl stringifies null and undefined values like template literals', () => { + const instr = Instructions.tpl`null=${null} undefined=${undefined}`; + + expect(instr.toJSON()).toEqual({ + type: 'instructions', + audio: 'null=null undefined=undefined', + }); + expect(instr.audio).toBe('null=null undefined=undefined'); + expect(instr.text).toBe('null=null undefined=undefined'); + }); + + it('serializes to a dict with both variants and round-trips through toJSON', () => { + const instr = new Instructions({ audio: 'audio variant', text: 'text variant' }); + + const ctx = new ChatContext([ChatMessage.create({ role: 'system', content: [instr] })]); + const data = ctx.toJSON(); + const items = (data.items as Record[])!; + const content = (items[0]!.content as Record[])![0]!; + + expect(content).toEqual({ + type: 'instructions', + audio: 'audio variant', + text: 'text variant', + }); + }); + + it('omits the text key in toJSON when only audio variant is provided', () => { + const instr = new Instructions({ audio: 'audio only' }); + expect(instr.toJSON()).toEqual({ type: 'instructions', audio: 'audio only' }); + }); + + it('falls back text -> audio when no text variant is provided', () => { + const instr = new Instructions({ audio: 'audio only' }); + expect(instr.audio).toBe('audio only'); + expect(instr.text).toBe('audio only'); + expect(instr.value).toBe('audio only'); + }); + + it('renderInstructions returns strings and resolved Instructions values explicitly', () => { + const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' }); + + expect(renderInstructions('plain instructions')).toBe('plain instructions'); + expect(renderInstructions(instr)).toBe('audio instructions'); + expect(renderInstructions(instr, 'audio')).toBe('audio instructions'); + expect(renderInstructions(instr, 'text')).toBe('text instructions'); + }); + + it('concatenates two Instructions, propagating both variants', () => { + const a = new Instructions({ audio: 'audio A', text: 'text A' }); + const b = new Instructions({ audio: 'audio B', text: 'text B' }); + const result = a.concat(b); + expect(result).toBeInstanceOf(Instructions); + expect(result.audio).toBe('audio Aaudio B'); + expect(result.text).toBe('text Atext B'); + }); + + it('concatenates Instructions + string, propagating both variants', () => { + const instr = new Instructions({ audio: 'audio', text: 'text' }); + const result = instr.concat(' suffix'); + expect(result.audio).toBe('audio suffix'); + expect(result.text).toBe('text suffix'); + }); + + it('concatInstructions handles string + Instructions (radd-style)', () => { + const instr = new Instructions({ audio: 'audio', text: 'text' }); + const result = concatInstructions('prefix ', instr); + expect(isInstructions(result)).toBe(true); + if (!isInstructions(result)) return; + expect(result.audio).toBe('prefix audio'); + expect(result.text).toBe('prefix text'); + }); + + it('preserves text=undefined when concatenating an audio-only instructions', () => { + const audioOnly = new Instructions({ audio: 'audio only' }); + const result = audioOnly.concat(' more'); + expect(result.toJSON()).toEqual({ type: 'instructions', audio: 'audio only more' }); + expect(result.audio).toBe('audio only more'); + expect(result.text).toBe('audio only more'); + }); + + it('when only one side has a text variant, the other contributes its audio', () => { + const a = new Instructions({ audio: 'audio A', text: 'text A' }); + const b = new Instructions({ audio: 'audio B' }); + const result = concatInstructions(a, ' ', b); + expect(isInstructions(result)).toBe(true); + if (!isInstructions(result)) return; + expect(result.audio).toBe('audio A audio B'); + expect(result.text).toBe('text A audio B'); + }); + + it('asModality returns a copy with both variants preserved', () => { + const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' }); + + let resolved = instr.asModality('audio'); + expect(resolved.value).toBe('audio instructions'); + expect(resolved.audio).toBe('audio instructions'); + expect(resolved.text).toBe('text instructions'); + + resolved = instr.asModality('text'); + expect(resolved.value).toBe('text instructions'); + expect(resolved.audio).toBe('audio instructions'); + expect(resolved.text).toBe('text instructions'); + }); + + it('can switch modality after a previous resolution', () => { + const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' }); + const resolvedText = instr.asModality('text'); + const resolvedAudio = resolvedText.asModality('audio'); + expect(resolvedAudio.value).toBe('audio instructions'); + }); + + it('asModality on audio-only Instructions returns audio for both modalities', () => { + const audioOnly = new Instructions({ audio: 'audio only' }); + expect(audioOnly.asModality('audio').value).toBe('audio only'); + expect(audioOnly.asModality('text').value).toBe('audio only'); + }); + + it('applyInstructionsModality rewrites the system message content', () => { + const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' }); + const ctx = new ChatContext([ + ChatMessage.create({ + id: INSTRUCTIONS_MESSAGE_ID, + role: 'system', + content: [instr], + }), + ]); + + applyInstructionsModality(ctx, { modality: 'audio' }); + let content = (ctx.items[0]! as ChatMessage).content[0]!; + expect(isInstructions(content) ? content.value : '').toBe('audio instructions'); + + applyInstructionsModality(ctx, { modality: 'text' }); + content = (ctx.items[0]! as ChatMessage).content[0]!; + expect(isInstructions(content) ? content.value : '').toBe('text instructions'); + }); + + it('applyInstructionsModality is a no-op when content has no Instructions', () => { + const ctx = new ChatContext([ + ChatMessage.create({ + id: INSTRUCTIONS_MESSAGE_ID, + role: 'system', + content: ['plain string instructions'], + }), + ]); + const before = (ctx.items[0]! as ChatMessage).content[0]; + applyInstructionsModality(ctx, { modality: 'text' }); + expect((ctx.items[0]! as ChatMessage).content[0]).toBe(before); + }); + + it('survives copy and lets a different modality be applied to the copy', () => { + const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' }); + const baseCtx = new ChatContext([ + ChatMessage.create({ + id: INSTRUCTIONS_MESSAGE_ID, + role: 'system', + content: [instr], + }), + ]); + const turn1 = baseCtx.copy(); + applyInstructionsModality(turn1, { modality: 'text' }); + const turn2 = turn1.copy(); + applyInstructionsModality(turn2, { modality: 'audio' }); + + const turn2Content = (turn2.items[0]! as ChatMessage).content[0]!; + expect(isInstructions(turn2Content) ? turn2Content.value : '').toBe('audio instructions'); + + // base context content is untouched (was the original instr) + expect((baseCtx.items[0]! as ChatMessage).content[0]).toBe(instr); + }); +}); diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts index 996cd12f2..743e7efb8 100644 --- a/agents/src/llm/chat_context.ts +++ b/agents/src/llm/chat_context.ts @@ -37,7 +37,197 @@ export interface AudioContent { transcript?: string; } -export type ChatContent = ImageContent | AudioContent | string; +type InstructionsOptions = { + /** The audio/voice variant of the instructions. */ + audio: string; + /** The text variant of the instructions; falls back to `audio` when omitted. */ + text?: string; + /** The currently rendered string value, used by `value`/`toString()`. */ + represent?: string; +}; + +const INSTRUCTIONS_SYMBOL = Symbol.for('livekit.agents.Instructions'); + +export function isInstructions(value: unknown): value is Instructions { + return ( + typeof value === 'object' && + value !== null && + INSTRUCTIONS_SYMBOL in value && + (value as Record)[INSTRUCTIONS_SYMBOL] === true + ); +} + +/** + * Instructions that adapt based on the user's input modality (audio vs. text). + * + * The `value` property is the rendered string providers see. By default it + * equals the `audio` variant; after {@link asModality} it equals the chosen + * variant. Both the `audio` variant and the raw `text` variant are preserved + * so {@link asModality} can be called again for a different modality (e.g., + * when the same `ChatContext` is reused across tool-call turns). + */ +export class Instructions { + readonly type = 'instructions' as const; + + private readonly _audioVariant: string; + + /** Raw text variant; falls back to {@link audio} when omitted. */ + private readonly _textVariant?: string; + + /** The currently rendered string (what providers should treat as content). */ + readonly value: string; + + /** @internal Symbol marker for type identification */ + readonly [INSTRUCTIONS_SYMBOL] = true; + + constructor(options: InstructionsOptions) { + this._audioVariant = options.audio; + this._textVariant = options.text; + this.value = options.represent ?? options.audio; + } + + static tpl( + strings: TemplateStringsArray, + ...values: Array + ): Instructions { + const render = (mode: 'audio' | 'text' | 'value') => { + let result = strings[0]!; + for (let i = 0; i < values.length; i++) { + const value = values[i]!; + if (isInstructions(value)) { + result += mode === 'audio' ? value.audio : mode === 'text' ? value.text : value.value; + } else { + result += String(value); + } + result += strings[i + 1]!; + } + return result; + }; + + const hasTextVariant = values.some( + (value) => isInstructions(value) && value._textVariant !== undefined, + ); + + return new Instructions({ + audio: render('audio'), + text: hasTextVariant ? render('text') : undefined, + represent: render('value'), + }); + } + + /** The audio (voice) variant of the instructions. */ + get audio(): string { + return this._audioVariant; + } + + /** The text variant of the instructions. Falls back to {@link audio}. */ + get text(): string { + return this._textVariant ?? this.audio; + } + + /** + * Return a copy whose {@link value} is the variant matching `modality`. + * Both `audio` and `text` variants are preserved on the result, so this can + * be called again for a different modality (e.g. across tool-call turns). + */ + asModality(modality: 'audio' | 'text'): Instructions { + return new Instructions({ + audio: this.audio, + text: this._textVariant, + represent: modality === 'audio' ? this.audio : this.text, + }); + } + + /** Concatenate, propagating both variants and the current rendered value. */ + concat(other: string | Instructions): Instructions { + if (isInstructions(other)) { + const hasText = this._textVariant !== undefined || other._textVariant !== undefined; + return new Instructions({ + audio: this.audio + other.audio, + text: hasText ? this.text + other.text : undefined, + represent: this.value + other.value, + }); + } + return new Instructions({ + audio: this.audio + other, + text: this._textVariant !== undefined ? this._textVariant + other : undefined, + represent: this.value + other, + }); + } + + toString(): string { + return this.value; + } + + toJSON(): { type: 'instructions'; audio: string; text?: string } { + const result: { type: 'instructions'; audio: string; text?: string } = { + type: 'instructions', + audio: this.audio, + }; + if (this._textVariant !== undefined) { + result.text = this._textVariant; + } + return result; + } +} + +export function renderInstructions( + instructions: string | Instructions, + modality?: 'audio' | 'text', +): string { + if (typeof instructions === 'string') return instructions; + return modality === undefined ? instructions.value : instructions.asModality(modality).value; +} + +/** + * Compare two instruction values by content. Plain strings compare by value; + * {@link Instructions} compare by their audio + text variants so that two + * distinct instances with the same content are treated as equal. + */ +export function instructionsEqual( + a: string | Instructions | undefined, + b: string | Instructions | undefined, +): boolean { + if (a === b) return true; + if (a === undefined || b === undefined) return false; + const aIsInstr = isInstructions(a); + const bIsInstr = isInstructions(b); + if (aIsInstr && bIsInstr) { + return a.audio === b.audio && a.text === b.text; + } + if (!aIsInstr && !bIsInstr) { + return a === b; + } + return false; +} + +/** + * Concatenate any mix of plain strings and {@link Instructions}, propagating + * both audio/text variants. If no argument is an {@link Instructions} the + * result is a plain string; otherwise the result is an {@link Instructions} + * preserving both variants from every contributing operand. + */ +export function concatInstructions(...parts: Array): string | Instructions { + if (parts.length === 0) return ''; + const hasInstructions = parts.some((p) => isInstructions(p)); + if (!hasInstructions) return parts.join(''); + + let acc = parts[0]!; + for (let i = 1; i < parts.length; i++) { + const next = parts[i]!; + if (isInstructions(acc)) { + acc = acc.concat(next); + } else if (isInstructions(next)) { + // string + Instructions (radd-style): prepend `acc` to both variants. + acc = new Instructions({ audio: acc }).concat(next); + } else { + acc = acc + next; + } + } + return acc; +} + +export type ChatContent = ImageContent | AudioContent | Instructions | string; export function createImageContent(params: { image: string | VideoFrame; @@ -171,7 +361,9 @@ export class ChatMessage { * lines. If no string content is present, returns `null`. */ get textContent(): string | undefined { - const parts = this.content.filter((c): c is string => typeof c === 'string'); + const parts = this.content + .filter((c): c is string | Instructions => typeof c === 'string' || isInstructions(c)) + .map((c) => (typeof c === 'string' ? c : c.value)); return parts.length > 0 ? parts.join('\n') : undefined; } @@ -179,6 +371,8 @@ export class ChatMessage { return this.content.map((c) => { if (typeof c === 'string') { return c as JSONValue; + } else if (isInstructions(c)) { + return c.toJSON() as JSONValue; } else if (c.type === 'image_content') { return { id: c.id, @@ -456,7 +650,7 @@ export class AgentConfigUpdate { readonly type = 'agent_config_update' as const; - instructions?: string; + instructions?: string | Instructions; toolsAdded?: string[]; @@ -467,7 +661,7 @@ export class AgentConfigUpdate { constructor( params: { id?: string; - instructions?: string; + instructions?: string | Instructions; toolsAdded?: string[]; toolsRemoved?: string[]; createdAt?: number; @@ -489,7 +683,7 @@ export class AgentConfigUpdate { static create(params: { id?: string; - instructions?: string; + instructions?: string | Instructions; toolsAdded?: string[]; toolsRemoved?: string[]; createdAt?: number; @@ -504,7 +698,9 @@ export class AgentConfigUpdate { }; if (this.instructions !== undefined) { - result.instructions = this.instructions; + result.instructions = isInstructions(this.instructions) + ? (this.instructions.toJSON() as JSONValue) + : this.instructions; } if (this.toolsAdded !== undefined) { result.toolsAdded = this.toolsAdded; @@ -891,6 +1087,21 @@ export class ChatContext { return false; } + if (isInstructions(contentA) && isInstructions(contentB)) { + if ( + contentA.audio !== contentB.audio || + contentA.text !== contentB.text || + contentA.value !== contentB.value + ) { + return false; + } + continue; + } + + if (isInstructions(contentA) || isInstructions(contentB)) { + return false; + } + if (typeof contentA === 'object' && typeof contentB === 'object') { if (contentA.type === 'image_content' && contentB.type === 'image_content') { if ( diff --git a/agents/src/llm/index.ts b/agents/src/llm/index.ts index ae6f75b9c..950296c9c 100644 --- a/agents/src/llm/index.ts +++ b/agents/src/llm/index.ts @@ -22,6 +22,8 @@ export { AgentConfigUpdate, ChatContext, ChatMessage, + Instructions, + concatInstructions, createAudioContent, createImageContent, FunctionCall, diff --git a/agents/src/llm/provider_format/google.test.ts b/agents/src/llm/provider_format/google.test.ts index a86d8460b..c59e52045 100644 --- a/agents/src/llm/provider_format/google.test.ts +++ b/agents/src/llm/provider_format/google.test.ts @@ -9,6 +9,7 @@ import { ChatContext, FunctionCall, FunctionCallOutput, + Instructions, } from '../chat_context.js'; import { serializeImage } from '../utils.js'; import { toChatCtx } from './google.js'; @@ -62,6 +63,23 @@ describe('Google Provider Format - toChatCtx', () => { expect(formatData.systemMessages).toEqual(['You are a helpful assistant']); }); + it('should render Instructions as their resolved value', async () => { + const ctx = ChatContext.empty(); + ctx.addMessage({ + role: 'system', + content: [ + new Instructions({ audio: 'audio instructions', text: 'text instructions' }).asModality( + 'text', + ), + ], + }); + ctx.addMessage({ role: 'user', content: 'Hello' }); + + const [, formatData] = await toChatCtx(ctx, false); + + expect(formatData.systemMessages).toEqual(['text instructions']); + }); + it('should handle multiple system messages', async () => { const ctx = ChatContext.empty(); ctx.addMessage({ role: 'system', content: 'You are a helpful assistant' }); diff --git a/agents/src/llm/provider_format/google.ts b/agents/src/llm/provider_format/google.ts index 45c68295d..fbfdc4d01 100644 --- a/agents/src/llm/provider_format/google.ts +++ b/agents/src/llm/provider_format/google.ts @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import type { ChatContext, ChatItem, ImageContent } from '../chat_context.js'; +import { isInstructions } from '../chat_context.js'; import { type SerializedImage, serializeImage } from '../utils.js'; import { groupToolCalls } from './utils.js'; @@ -57,6 +58,8 @@ export async function toChatCtx( for (const content of msg.content) { if (content && typeof content === 'string') { parts.push({ text: content }); + } else if (isInstructions(content)) { + parts.push({ text: content.value }); } else if (content && typeof content === 'object') { if (content.type === 'image_content') { parts.push(await toImagePart(content)); diff --git a/agents/src/llm/provider_format/mistralai.test.ts b/agents/src/llm/provider_format/mistralai.test.ts index 098195357..db7387637 100644 --- a/agents/src/llm/provider_format/mistralai.test.ts +++ b/agents/src/llm/provider_format/mistralai.test.ts @@ -8,6 +8,7 @@ import { ChatContext, FunctionCall, FunctionCallOutput, + Instructions, } from '../chat_context.js'; import { toChatCtx } from './mistralai.js'; @@ -39,6 +40,23 @@ describe('Mistral Provider Format - toChatCtx', () => { expect(formatData.instructions).toBe('You are a helpful assistant'); }); + it('should render Instructions as their resolved value', () => { + const ctx = ChatContext.empty(); + ctx.addMessage({ + role: 'system', + content: [ + new Instructions({ audio: 'audio instructions', text: 'text instructions' }).asModality( + 'text', + ), + ], + }); + ctx.addMessage({ role: 'user', content: 'Hello' }); + + const [, formatData] = toChatCtx(ctx); + + expect(formatData.instructions).toBe('text instructions'); + }); + it('should extract developer messages as instructions', () => { const ctx = ChatContext.empty(); ctx.addMessage({ role: 'developer', content: 'Be concise' }); diff --git a/agents/src/llm/provider_format/mistralai.ts b/agents/src/llm/provider_format/mistralai.ts index 15234a5cd..776be3085 100644 --- a/agents/src/llm/provider_format/mistralai.ts +++ b/agents/src/llm/provider_format/mistralai.ts @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import type { ChatContext } from '../chat_context.js'; +import { isInstructions } from '../chat_context.js'; export interface MistralFormatData { instructions: string; @@ -27,7 +28,10 @@ export function toChatCtx( for (const item of chatCtx.items) { if (item.type === 'message') { - const text = item.content.filter((c): c is string => typeof c === 'string').join('\n'); + const text = item.content + .filter((c) => typeof c === 'string' || isInstructions(c)) + .map((c) => (typeof c === 'string' ? c : c.value)) + .join('\n'); if (item.role === 'system' || item.role === 'developer') { instructionParts.push(text); diff --git a/agents/src/llm/provider_format/openai.test.ts b/agents/src/llm/provider_format/openai.test.ts index 97cb5dcca..6a2a8aed9 100644 --- a/agents/src/llm/provider_format/openai.test.ts +++ b/agents/src/llm/provider_format/openai.test.ts @@ -9,6 +9,7 @@ import { ChatContext, FunctionCall, FunctionCallOutput, + Instructions, } from '../chat_context.js'; import { serializeImage } from '../utils.js'; import { toChatCtx, toResponsesChatCtx } from './openai.js'; @@ -54,6 +55,23 @@ describe('toChatCtx', () => { expect(result[1]).toEqual({ role: 'user', content: 'Hello' }); }); + it('should render Instructions as their resolved value', async () => { + const ctx = ChatContext.empty(); + ctx.addMessage({ + role: 'system', + content: [ + new Instructions({ audio: 'audio instructions', text: 'text instructions' }).asModality( + 'text', + ), + ], + }); + ctx.addMessage({ role: 'user', content: 'Hello' }); + + const result = await toChatCtx(ctx); + + expect(result[0]).toEqual({ role: 'system', content: 'text instructions' }); + }); + it('should handle multi-line text content', async () => { const ctx = ChatContext.empty(); ctx.addMessage({ role: 'user', content: ['Line 1', 'Line 2', 'Line 3'] }); @@ -707,6 +725,23 @@ describe('toResponsesChatCtx', () => { expect(result[1]).toEqual({ role: 'user', content: 'Hello' }); }); + it('should render Instructions as their resolved value', async () => { + const ctx = ChatContext.empty(); + ctx.addMessage({ + role: 'system', + content: [ + new Instructions({ audio: 'audio instructions', text: 'text instructions' }).asModality( + 'text', + ), + ], + }); + ctx.addMessage({ role: 'user', content: 'Hello' }); + + const result = await toResponsesChatCtx(ctx); + + expect(result[0]).toEqual({ role: 'system', content: 'text instructions' }); + }); + it('should handle multi-line text content', async () => { const ctx = ChatContext.empty(); ctx.addMessage({ role: 'user', content: ['Line 1', 'Line 2', 'Line 3'] }); diff --git a/agents/src/llm/provider_format/openai.ts b/agents/src/llm/provider_format/openai.ts index 9d61e169d..9a67c0f31 100644 --- a/agents/src/llm/provider_format/openai.ts +++ b/agents/src/llm/provider_format/openai.ts @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 import type { ChatContext, ChatItem, ImageContent } from '../chat_context.js'; +import { isInstructions } from '../chat_context.js'; import { type SerializedImage, serializeImage } from '../utils.js'; import { groupToolCalls } from './utils.js'; @@ -69,6 +70,9 @@ async function toChatItem(item: ChatItem) { if (typeof content === 'string') { if (textContent) textContent += '\n'; textContent += content; + } else if (isInstructions(content)) { + if (textContent) textContent += '\n'; + textContent += content.value; } else if (content.type === 'image_content') { listContent.push(await toImageContent(content)); } else { @@ -229,6 +233,9 @@ async function toResponsesChatItem(item: ChatItem) { if (typeof content === 'string') { if (textContent) textContent += '\n'; textContent += content; + } else if (isInstructions(content)) { + if (textContent) textContent += '\n'; + textContent += content.value; } else if (content.type === 'image_content') { listContent.push(await toResponsesImageContent(content)); } else { diff --git a/agents/src/llm/utils.ts b/agents/src/llm/utils.ts index c5dbcd40c..0271deb2a 100644 --- a/agents/src/llm/utils.ts +++ b/agents/src/llm/utils.ts @@ -364,6 +364,10 @@ export function validateChatContextStructure(chatCtx: ChatContext): ChatContextV return; } + if (term.type === 'instructions') { + return; + } + if (term.type === 'image_content') { if (!term.id || term.image === undefined || term.image === null) { pushIssue({ @@ -514,6 +518,10 @@ function formatMessageContentPart(part: ChatContent): string { return part; } + if (part.type === 'instructions') { + return part.value; + } + if (part.type === 'image_content') { if (typeof part.image === 'string') { return `[image url=${truncateText(part.image, 120)}]`; diff --git a/agents/src/telemetry/traces.ts b/agents/src/telemetry/traces.ts index e2e4a35b9..166e4ff24 100644 --- a/agents/src/telemetry/traces.ts +++ b/agents/src/telemetry/traces.ts @@ -23,6 +23,7 @@ import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions'; import FormData from 'form-data'; import { AccessToken } from 'livekit-server-sdk'; import fs from 'node:fs/promises'; +import { isInstructions, renderInstructions } from '../llm/chat_context.js'; import type { ChatContent, ChatItem, ChatRole } from '../llm/index.js'; import { enableOtelLogging } from '../log.js'; import { filterZeroValues } from '../metrics/model_usage.js'; @@ -373,7 +374,9 @@ function chatItemToProto(item: ChatItem): ProtoChatItem { const msg: ProtoMessage = { id: item.id, role: ROLE_MAP[item.role] ?? (item.role.toUpperCase() as ProtoRole), - content: item.content.map((c: ChatContent) => ({ text: c })), + content: item.content.map((c: ChatContent) => ({ + text: isInstructions(c) ? c.value : c, + })), createdAt: toRFC3339(item.createdAt), }; @@ -456,7 +459,7 @@ function chatItemToProto(item: ChatItem): ProtoChatItem { createdAt: toRFC3339(item.createdAt), }; if (item.instructions !== undefined) { - configUpdate.instructions = item.instructions; + configUpdate.instructions = renderInstructions(item.instructions); } if (item.toolsAdded !== undefined) { configUpdate.toolsAdded = item.toolsAdded; diff --git a/agents/src/voice/agent.ts b/agents/src/voice/agent.ts index 876170636..890d7ea7d 100644 --- a/agents/src/voice/agent.ts +++ b/agents/src/voice/agent.ts @@ -12,7 +12,7 @@ import { type STTModelString, type TTSModelString, } from '../inference/index.js'; -import { ReadonlyChatContext } from '../llm/chat_context.js'; +import { type Instructions, ReadonlyChatContext } from '../llm/chat_context.js'; import type { ChatMessage, FunctionCall } from '../llm/index.js'; import { type ChatChunk, @@ -117,7 +117,7 @@ export type TTSPronunciationMap = Record; export interface AgentOptions { id?: string; - instructions: string; + instructions: string | Instructions; chatCtx?: ChatContext; tools?: ToolContext; stt?: STT | STTModelString; @@ -154,7 +154,7 @@ export class Agent { _chatCtx: ChatContext; /** @internal */ - _instructions: string; + _instructions: string | Instructions; /** @internal */ _tools?: ToolContext; @@ -264,7 +264,7 @@ export class Agent { return this._id; } - get instructions(): string { + get instructions(): string | Instructions { return this._instructions; } diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts index d77197c19..e53f39b14 100644 --- a/agents/src/voice/agent_activity.ts +++ b/agents/src/voice/agent_activity.ts @@ -17,7 +17,11 @@ import { AgentConfigUpdate, type ChatContext, ChatMessage, + type Instructions, type MetricsReport, + concatInstructions, + instructionsEqual, + renderInstructions, } from '../llm/chat_context.js'; import { type ChatItem, @@ -94,6 +98,7 @@ import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './gene import { type _AudioOut, type _TextOut, + applyInstructionsModality, performAudioForwarding, performLLMInference, performTTSInference, @@ -103,7 +108,7 @@ import { updateInstructions, } from './generation.js'; import type { TimedString } from './io.js'; -import { SpeechHandle } from './speech_handle.js'; +import { type InputDetails, SpeechHandle } from './speech_handle.js'; import { createEndpointing } from './turn_config/endpointing.js'; import { setParticipantSpanAttributes } from './utils.js'; @@ -447,10 +452,12 @@ export class AgentActivity implements RecognitionHooks { // this means the content is the same as the previous session const capabilities = this.llm.capabilities; try { - await this.realtimeSession!._updateSession( + const realtimeInstructions = !rtReused || capabilities.midSessionInstructionsUpdate - ? this.agent.instructions - : undefined, + ? renderInstructions(this.agent.instructions) + : undefined; + await this.realtimeSession!._updateSession( + realtimeInstructions, !rtReused || capabilities.midSessionChatCtxUpdate ? this.agent.chatCtx : undefined, !rtReused || capabilities.midSessionToolsUpdate ? this.tools : undefined, ); @@ -597,7 +604,7 @@ export class AgentActivity implements RecognitionHooks { reusable = reusable && (capabilities.midSessionInstructionsUpdate || - this.agent.instructions === newActivity.agent.instructions); + instructionsEqual(this.agent.instructions, newActivity.agent.instructions)); // tools update is supported or tools are the same reusable = @@ -1406,6 +1413,7 @@ export class AgentActivity implements RecognitionHooks { userMessage, chatCtx, scheduleSpeech: false, + inputDetails: { modality: 'audio' }, }); this._preemptiveGeneration = { @@ -1630,10 +1638,11 @@ export class AgentActivity implements RecognitionHooks { generateReply(options: { userMessage?: ChatMessage; chatCtx?: ChatContext; - instructions?: string; + instructions?: string | Instructions; toolChoice?: ToolChoice | null; allowInterruptions?: boolean; scheduleSpeech?: boolean; + inputDetails?: InputDetails; }): SpeechHandle { const { userMessage, @@ -1642,9 +1651,10 @@ export class AgentActivity implements RecognitionHooks { toolChoice: defaultToolChoice, allowInterruptions: defaultAllowInterruptions, scheduleSpeech = true, + inputDetails, } = options; - let instructions = defaultInstructions; + let instructions: string | Instructions | undefined = defaultInstructions; let toolChoice = defaultToolChoice; let allowInterruptions = defaultAllowInterruptions; @@ -1672,6 +1682,7 @@ export class AgentActivity implements RecognitionHooks { const handle = SpeechHandle.create({ allowInterruptions: allowInterruptions ?? this.allowInterruptions, + inputDetails, }); this.agentSession.emit( @@ -1706,7 +1717,7 @@ export class AgentActivity implements RecognitionHooks { // this matches the behavior of the Realtime API: // https://platform.openai.com/docs/api-reference/realtime-client-events/response/create if (instructions) { - instructions = `${this.agent.instructions}\n${instructions}`; + instructions = concatInstructions(this.agent.instructions, '\n', instructions); } // Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter @@ -1933,7 +1944,11 @@ export class AgentActivity implements RecognitionHooks { if (speechHandle === undefined) { // Ensure the new message is passed to generateReply // This preserves the original message id, making it easier for users to track responses - speechHandle = this.generateReply({ userMessage, chatCtx }); + speechHandle = this.generateReply({ + userMessage, + chatCtx, + inputDetails: { modality: 'audio' }, + }); } const eouMetrics: EOUMetrics = { @@ -2140,7 +2155,7 @@ export class AgentActivity implements RecognitionHooks { toolCtx: ToolContext; modelSettings: ModelSettings; replyAbortController: AbortController; - instructions?: string; + instructions?: string | Instructions; newMessage?: ChatMessage; toolsMessages?: ChatItem[]; span: Span; @@ -2150,7 +2165,7 @@ export class AgentActivity implements RecognitionHooks { span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id); if (instructions) { - span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions); + span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, renderInstructions(instructions)); } if (newMessage) { span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || ''); @@ -2189,6 +2204,9 @@ export class AgentActivity implements RecognitionHooks { } } + // apply the correct variant of the instructions for the turn's input modality + applyInstructionsModality(chatCtx, { modality: speechHandle.inputDetails.modality }); + const tasks: Array> = []; const [llmTask, llmGenData] = performLLMInference( // preserve `this` context in llmNode @@ -2628,7 +2646,7 @@ export class AgentActivity implements RecognitionHooks { toolCtx: ToolContext, modelSettings: ModelSettings, replyAbortController: AbortController, - instructions?: string, + instructions?: string | Instructions, newMessage?: ChatMessage, toolsMessages?: ChatItem[], _previousUserMetrics?: MetricsReport, @@ -3234,7 +3252,7 @@ export class AgentActivity implements RecognitionHooks { modelSettings: ModelSettings; abortController: AbortController; userInput?: string; - instructions?: string; + instructions?: string | Instructions; }): Promise { speechHandleStorage.enterWith(speechHandle); @@ -3261,7 +3279,11 @@ export class AgentActivity implements RecognitionHooks { } try { - const generationEvent = await this.realtimeSession.generateReply(instructions); + const generationEvent = await this.realtimeSession.generateReply( + instructions !== undefined + ? renderInstructions(instructions, speechHandle.inputDetails.modality) + : undefined, + ); await this.realtimeGenerationTask( speechHandle, generationEvent, diff --git a/agents/src/voice/agent_session.test.ts b/agents/src/voice/agent_session.test.ts new file mode 100644 index 000000000..acb3c4a2a --- /dev/null +++ b/agents/src/voice/agent_session.test.ts @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { describe, expect, it, vi } from 'vitest'; +import { AgentSession } from './agent_session.js'; +import { SpeechHandle } from './speech_handle.js'; + +describe('AgentSession.run', () => { + it('forwards inputModality to generateReply', async () => { + const session = new AgentSession(); + const generateReply = vi + .spyOn(session, 'generateReply') + .mockImplementation(() => SpeechHandle.create()); + + session.run({ userInput: 'hello', inputModality: 'audio' }); + + await vi.waitFor(() => { + expect(generateReply).toHaveBeenCalledWith({ + userInput: 'hello', + inputModality: 'audio', + }); + }); + }); +}); diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts index d5382562d..a495e97c7 100644 --- a/agents/src/voice/agent_session.ts +++ b/agents/src/voice/agent_session.ts @@ -22,7 +22,12 @@ import type { InterruptionDetectionError } from '../inference/interruption/error import type { OverlappingSpeechEvent } from '../inference/interruption/types.js'; import { getJobContext } from '../job.js'; import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js'; -import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js'; +import { + AgentHandoffItem, + ChatContext, + ChatMessage, + type Instructions, +} from '../llm/chat_context.js'; import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js'; import type { LLMError } from '../llm/llm.js'; import { log } from '../log.js'; @@ -752,9 +757,11 @@ export class AgentSession< generateReply(options?: { userInput?: string | ChatMessage; chatCtx?: ChatContext; - instructions?: string; + instructions?: string | Instructions; toolChoice?: ToolChoice; allowInterruptions?: boolean; + /** The input modality used for generating the reply. Defaults to `"text"`. */ + inputModality?: 'audio' | 'text'; }): SpeechHandle { if (!this.activity) { throw new Error('AgentSession is not running'); @@ -770,18 +777,20 @@ export class AgentSession< }) : undefined; + const inputDetails = { modality: options?.inputModality ?? 'text' } as const; + const doGenerateReply = (activity: AgentActivity, nextActivity?: AgentActivity) => { if (activity.schedulingPaused) { if (!nextActivity) { throw new Error('AgentSession is closing, cannot use generateReply()'); } - return nextActivity.generateReply({ userMessage, ...options }); + return nextActivity.generateReply({ userMessage, ...options, inputDetails }); } // Handoff can race with scheduling pause between the check above and generateReply(). // If that happens, retry on the next activity instead of surfacing an avoidable error. try { - return activity.generateReply({ userMessage, ...options }); + return activity.generateReply({ userMessage, ...options, inputDetails }); } catch (error) { const canFallback = nextActivity !== undefined && isSchedulingPausedError(error); if (!canFallback) { @@ -791,7 +800,7 @@ export class AgentSession< { error }, 'generateReply scheduling raced with handoff drain; retrying on next activity', ); - return nextActivity.generateReply({ userMessage, ...options }); + return nextActivity.generateReply({ userMessage, ...options, inputDetails }); } }; @@ -831,9 +840,11 @@ export class AgentSession< */ run({ userInput, + inputModality, outputType, }: { userInput: string; + inputModality?: 'audio' | 'text'; outputType?: z.ZodType; }): RunResult { if (this._globalRunState && !this._globalRunState.done()) { @@ -857,7 +868,7 @@ export class AgentSession< try { const unlock = await this.activityLock.lock(); unlock(); - this.generateReply({ userInput }); + this.generateReply({ userInput, inputModality }); } catch (e) { runState._reject(asError(e)); } diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts index bc06f368e..3d938abdc 100644 --- a/agents/src/voice/generation.ts +++ b/agents/src/voice/generation.ts @@ -7,11 +7,13 @@ import { ThrowsPromise } from '@livekit/throws-transformer/throws'; import type { Span } from '@opentelemetry/api'; import { context as otelContext } from '@opentelemetry/api'; import type { ReadableStream, ReadableStreamDefaultReader } from 'stream/web'; +import type { Instructions } from '../llm/chat_context.js'; import { type ChatContext, ChatMessage, FunctionCall, FunctionCallOutput, + isInstructions, } from '../llm/chat_context.js'; import type { ChatChunk } from '../llm/llm.js'; import { @@ -380,7 +382,7 @@ export function createToolOutput(params: { }); } -const INSTRUCTIONS_MESSAGE_ID = 'lk.agent_task.instructions'; +export const INSTRUCTIONS_MESSAGE_ID = 'lk.agent_task.instructions'; /** * Update the instruction message in the chat context or insert a new one if missing. @@ -395,7 +397,7 @@ const INSTRUCTIONS_MESSAGE_ID = 'lk.agent_task.instructions'; */ export function updateInstructions(options: { chatCtx: ChatContext; - instructions: string; + instructions: string | Instructions; addIfMissing: boolean; }) { const { chatCtx, instructions, addIfMissing } = options; @@ -425,6 +427,43 @@ export function updateInstructions(options: { } } +/** + * Apply the correct {@link Instructions} variant for the turn's input modality. + * + * Locates the instructions message (by {@link INSTRUCTIONS_MESSAGE_ID}) and, + * if its content contains any {@link Instructions} entries, rebuilds the + * message so each Instructions renders as the chosen variant. No-op when no + * modality-aware instructions are present. + */ +export function applyInstructionsModality( + chatCtx: ChatContext, + options: { modality: 'audio' | 'text' }, +) { + const { modality } = options; + const idx = chatCtx.indexById(INSTRUCTIONS_MESSAGE_ID); + if (idx === undefined) return; + + const item = chatCtx.items[idx]!; + if (item.type !== 'message') return; + + const hasModalitySpecific = item.content.some((c) => isInstructions(c)); + if (!hasModalitySpecific) return; + + // ChatContext.copy shadows the original item; create a new instance so the + // base context's content isn't mutated when the same Instructions is reused + // across turns. + chatCtx.items[idx] = ChatMessage.create({ + id: item.id, + role: item.role, + content: item.content.map((c) => (isInstructions(c) ? c.asModality(modality) : c)), + interrupted: item.interrupted, + createdAt: item.createdAt, + transcriptConfidence: item.transcriptConfidence, + metrics: item.metrics, + extra: item.extra, + }); +} + export function performLLMInference( node: LLMNode, chatCtx: ChatContext, diff --git a/agents/src/voice/remote_session.ts b/agents/src/voice/remote_session.ts index b8cb359a8..f970b064e 100644 --- a/agents/src/voice/remote_session.ts +++ b/agents/src/voice/remote_session.ts @@ -14,6 +14,7 @@ import type { FunctionCall as FCItem, FunctionCallOutput as FCOItem, } from '../llm/chat_context.js'; +import { isInstructions, renderInstructions } from '../llm/chat_context.js'; import type { ToolContext } from '../llm/tool_context.js'; import { log } from '../log.js'; import type { @@ -295,6 +296,10 @@ function chatItemToProto(item: RemoteChatItem): pb.ChatContext_ChatItem { for (const c of msg.content) { if (typeof c === 'string') { content.push(new pb.ChatMessage_ChatContent({ payload: { case: 'text', value: c } })); + } else if (isInstructions(c)) { + content.push( + new pb.ChatMessage_ChatContent({ payload: { case: 'text', value: c.value } }), + ); } } @@ -800,7 +805,7 @@ export class SessionHost { case: 'getAgentInfo', value: new pb.SessionResponse_GetAgentInfoResponse({ id: agent.id, - instructions: agent.instructions, + instructions: renderInstructions(agent.instructions), tools: toolNames(agent.toolCtx), chatCtx: chatItemsToProto(agent.chatCtx.items), }), diff --git a/agents/src/voice/speech_handle.ts b/agents/src/voice/speech_handle.ts index 3e3ccf540..645729fab 100644 --- a/agents/src/voice/speech_handle.ts +++ b/agents/src/voice/speech_handle.ts @@ -55,6 +55,17 @@ export class SpeechHandleCircularWaitError extends Error { } } +/** + * Describes how the user provided input that triggered the current turn. + * Used by modality-aware Instructions to pick the correct variant. + */ +export interface InputDetails { + modality: 'audio' | 'text'; +} + +/** Default {@link InputDetails} used when no explicit value is provided. */ +export const DEFAULT_INPUT_DETAILS: InputDetails = { modality: 'audio' }; + export class SpeechHandle { /** Priority for messages that should be played after all other messages in the queue */ static SPEECH_PRIORITY_LOW = 0; @@ -93,6 +104,7 @@ export class SpeechHandle { private _allowInterruptions: boolean, /** @internal */ public _stepIndex: number, + private _inputDetails: InputDetails = DEFAULT_INPUT_DETAILS, readonly parent?: SpeechHandle, ) { this.doneFut.await.finally(() => { @@ -105,11 +117,27 @@ export class SpeechHandle { static create(options?: { allowInterruptions?: boolean; stepIndex?: number; + inputDetails?: InputDetails; parent?: SpeechHandle; }) { - const { allowInterruptions = true, stepIndex = 0, parent } = options ?? {}; + const { + allowInterruptions = true, + stepIndex = 0, + inputDetails = DEFAULT_INPUT_DETAILS, + parent, + } = options ?? {}; + + return new SpeechHandle( + shortuuid('speech_'), + allowInterruptions, + stepIndex, + inputDetails, + parent, + ); + } - return new SpeechHandle(shortuuid('speech_'), allowInterruptions, stepIndex, parent); + get inputDetails(): InputDetails { + return this._inputDetails; } get interrupted(): boolean { diff --git a/examples/src/instructions_per_modality.ts b/examples/src/instructions_per_modality.ts new file mode 100644 index 000000000..71f2f3f04 --- /dev/null +++ b/examples/src/instructions_per_modality.ts @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + type JobContext, + type JobProcess, + ServerOptions, + cli, + defineAgent, + inference, + llm, + log, + voice, +} from '@livekit/agents'; +import * as silero from '@livekit/agents-plugin-silero'; +import { fileURLToPath } from 'node:url'; +import { z } from 'zod'; + +const BASE_INSTRUCTIONS = (modalitySpecific: string, currentDate: string) => + `You are a scheduling assistant named Alex that helps users book appointments. +${modalitySpecific} +Call \`book_appointment\` to finalise the booking. +Never invent or assume details the user did not provide — ask for them instead. +The current date is ${currentDate}. +`; + +// Voice users speak in approximate, self-correcting natural language. +// The LLM needs guidance on how to parse what was said, not how to say things back. +const AUDIO_SPECIFIC = ` +The user is speaking — their input arrives as voice transcription and may be imperfect. +When interpreting what the user said: +- Resolve relative spoken expressions to a concrete date/time: 'next Tuesday', 'tomorrow afternoon', 'the week after next around 3'. +- Spoken numbers may be ambiguous: 'three thirty' could mean 3:30 PM or the 30th of March — ask for clarification when context does not make it obvious. +- Honor verbal self-corrections: if the user says 'wait, I meant Thursday not Tuesday', update your understanding to Thursday and discard Tuesday. +- Ignore filler words and hesitations ('um', 'uh', 'like', 'I guess'). +- Always confirm the resolved date and time out loud before booking, since spoken input is inherently ambiguous. +`; + +// Text users type precise values — no need to normalise spoken patterns. +const TEXT_SPECIFIC = ` +The user is typing — take their input literally. +When interpreting what the user wrote: +- Accept exact dates and times in any common format (ISO, natural language, 12-hour or 24-hour clock). +- If the user provides a complete and unambiguous date and time, you may book immediately without asking for confirmation. +- Only ask follow-up questions for genuinely missing information. +`; + +class SchedulingAgent extends voice.Agent { + constructor() { + const now = new Date(); + const weekday = now.toLocaleDateString(undefined, { weekday: 'long' }); + const currentDate = `${now.toISOString().slice(0, 10)} ${weekday}`; + const instructions = new llm.Instructions({ + audio: BASE_INSTRUCTIONS(AUDIO_SPECIFIC, currentDate), + text: BASE_INSTRUCTIONS(TEXT_SPECIFIC, currentDate), + }); + + super({ + instructions, + tools: { + bookAppointment: llm.tool({ + description: 'Book an appointment.', + parameters: z.object({ + date: z.string().describe('The date of the appointment in the format YYYY-MM-DD'), + time: z.string().describe('The time of the appointment in the format HH:MM'), + }), + execute: async ({ date, time }) => { + log().info(`booking appointment for ${date} at ${time}`); + return `Appointment booked for ${date} at ${time}`; + }, + }), + }, + }); + } + + async onEnter(): Promise { + this.session.generateReply(); + } +} + +export default defineAgent({ + prewarm: async (proc: JobProcess) => { + proc.userData.vad = await silero.VAD.load(); + }, + entry: async (ctx: JobContext) => { + const session = new voice.AgentSession({ + vad: ctx.proc.userData.vad! as silero.VAD, + stt: new inference.STT({ model: 'deepgram/nova-3' }), + llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }), + tts: new inference.TTS({ + model: 'cartesia/sonic-3', + voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc', + }), + }); + + await session.start({ agent: new SchedulingAgent(), room: ctx.room }); + }, +}); + +cli.runApp(new ServerOptions({ agent: fileURLToPath(import.meta.url) }));