Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .changeset/modality-aware-instructions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
'@livekit/agents': patch
---

feat(agents): add modality-aware `Instructions` with audio/text variants

Introduce a new `Instructions` class for system prompts that adapt to the
user's input modality. The pipeline now applies the matching variant before
each LLM turn based on `SpeechHandle.inputDetails.modality`, and
`AgentSession.generateReply()` and `AgentSession.run()` expose an
`inputModality` option. `Instructions.tpl` supports JS-native prompt
composition while preserving audio/text variants.
246 changes: 245 additions & 1 deletion agents/src/llm/chat_context.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// SPDX-License-Identifier: Apache-2.0
import { describe, expect, it } from 'vitest';
import { initializeLogger } from '../log.js';
import { INSTRUCTIONS_MESSAGE_ID, applyInstructionsModality } from '../voice/generation.js';
import { FakeLLM } from '../voice/testing/fake_llm.js';
import {
type AudioContent,
Expand All @@ -12,7 +13,11 @@ import {
FunctionCall,
FunctionCallOutput,
type ImageContent,
Instructions,
ReadonlyChatContext,
concatInstructions,
isInstructions,
renderInstructions,
} from './chat_context.js';

initializeLogger({ pretty: false, level: 'error' });
Expand Down Expand Up @@ -434,7 +439,7 @@ describe('ChatContext._summarize', () => {
item.extra?.is_summary !== true,
);
expect(rawTailMessages).toHaveLength(4);
expect(rawTailMessages.map((item) => item.textContent)).toEqual([
expect(rawTailMessages.map((item) => (item as ChatMessage).textContent)).toEqual([
'Order #123',
'Found your order. Let me check the warranty.',
'Thanks.',
Expand Down Expand Up @@ -1235,3 +1240,242 @@ describe('ChatContext.isEquivalent', () => {
});
});
});

describe('Instructions', () => {
it('constructs from an object with audio and text variants', () => {
const instr = new Instructions({ audio: 'audio variant', text: 'text variant' });

expect(instr.audio).toBe('audio variant');
expect(instr.text).toBe('text variant');
expect(instr.value).toBe('audio variant');
});

it('identifies Instructions with a type guard', () => {
const instr = new Instructions({ audio: 'audio variant', text: 'text variant' });

expect(isInstructions(instr)).toBe(true);
expect(isInstructions('audio variant')).toBe(false);
expect(isInstructions({ type: 'instructions', audio: 'audio variant' })).toBe(false);
});

it('tpl propagates Instructions interpolations into audio and text variants', () => {
const instr = Instructions.tpl`persona
${new Instructions({ audio: 'audio rules', text: 'text rules' })}
extra`;

expect(instr).toBeInstanceOf(Instructions);
expect(instr.audio).toBe('persona\naudio rules\nextra');
expect(instr.text).toBe('persona\ntext rules\nextra');
expect(instr.value).toBe('persona\naudio rules\nextra');
expect(instr.asModality('text').value).toBe('persona\ntext rules\nextra');
});

it('tpl preserves audio-only interpolation as audio-only output', () => {
const instr = Instructions.tpl`prefix ${new Instructions({ audio: 'same' })} suffix`;

expect(instr.toJSON()).toEqual({ type: 'instructions', audio: 'prefix same suffix' });
expect(instr.audio).toBe('prefix same suffix');
expect(instr.text).toBe('prefix same suffix');
});

it('tpl interpolates primitive values into both variants', () => {
const instr = Instructions.tpl`date=${'2026-05-13'} enabled=${true} count=${3}`;

expect(instr.toJSON()).toEqual({
type: 'instructions',
audio: 'date=2026-05-13 enabled=true count=3',
});
expect(instr.audio).toBe('date=2026-05-13 enabled=true count=3');
expect(instr.text).toBe('date=2026-05-13 enabled=true count=3');
expect(instr.value).toBe('date=2026-05-13 enabled=true count=3');
});

it('tpl combines multiple modality-aware interpolations', () => {
const instr = Instructions.tpl`${new Instructions({ audio: 'audio A', text: 'text A' })} / ${new Instructions({ audio: 'audio B', text: 'text B' })}`;

expect(instr.audio).toBe('audio A / audio B');
expect(instr.text).toBe('text A / text B');
expect(instr.value).toBe('audio A / audio B');
});

it('tpl preserves the current rendered value of resolved interpolations', () => {
const resolved = new Instructions({ audio: 'audio rules', text: 'text rules' }).asModality(
'text',
);
const instr = Instructions.tpl`prefix ${resolved} suffix`;

expect(instr.audio).toBe('prefix audio rules suffix');
expect(instr.text).toBe('prefix text rules suffix');
expect(instr.value).toBe('prefix text rules suffix');
});

it('tpl stringifies null and undefined values like template literals', () => {
const instr = Instructions.tpl`null=${null} undefined=${undefined}`;

expect(instr.toJSON()).toEqual({
type: 'instructions',
audio: 'null=null undefined=undefined',
});
expect(instr.audio).toBe('null=null undefined=undefined');
expect(instr.text).toBe('null=null undefined=undefined');
});

it('serializes to a dict with both variants and round-trips through toJSON', () => {
const instr = new Instructions({ audio: 'audio variant', text: 'text variant' });

const ctx = new ChatContext([ChatMessage.create({ role: 'system', content: [instr] })]);
const data = ctx.toJSON();
const items = (data.items as Record<string, unknown>[])!;
const content = (items[0]!.content as Record<string, unknown>[])![0]!;

expect(content).toEqual({
type: 'instructions',
audio: 'audio variant',
text: 'text variant',
});
});

it('omits the text key in toJSON when only audio variant is provided', () => {
const instr = new Instructions({ audio: 'audio only' });
expect(instr.toJSON()).toEqual({ type: 'instructions', audio: 'audio only' });
});

it('falls back text -> audio when no text variant is provided', () => {
const instr = new Instructions({ audio: 'audio only' });
expect(instr.audio).toBe('audio only');
expect(instr.text).toBe('audio only');
expect(instr.value).toBe('audio only');
});

it('renderInstructions returns strings and resolved Instructions values explicitly', () => {
const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });

expect(renderInstructions('plain instructions')).toBe('plain instructions');
expect(renderInstructions(instr)).toBe('audio instructions');
expect(renderInstructions(instr, 'audio')).toBe('audio instructions');
expect(renderInstructions(instr, 'text')).toBe('text instructions');
});

it('concatenates two Instructions, propagating both variants', () => {
const a = new Instructions({ audio: 'audio A', text: 'text A' });
const b = new Instructions({ audio: 'audio B', text: 'text B' });
const result = a.concat(b);
expect(result).toBeInstanceOf(Instructions);
expect(result.audio).toBe('audio Aaudio B');
expect(result.text).toBe('text Atext B');
});

it('concatenates Instructions + string, propagating both variants', () => {
const instr = new Instructions({ audio: 'audio', text: 'text' });
const result = instr.concat(' suffix');
expect(result.audio).toBe('audio suffix');
expect(result.text).toBe('text suffix');
});

it('concatInstructions handles string + Instructions (radd-style)', () => {
const instr = new Instructions({ audio: 'audio', text: 'text' });
const result = concatInstructions('prefix ', instr);
expect(isInstructions(result)).toBe(true);
if (!isInstructions(result)) return;
expect(result.audio).toBe('prefix audio');
expect(result.text).toBe('prefix text');
});

it('preserves text=undefined when concatenating an audio-only instructions', () => {
const audioOnly = new Instructions({ audio: 'audio only' });
const result = audioOnly.concat(' more');
expect(result.toJSON()).toEqual({ type: 'instructions', audio: 'audio only more' });
expect(result.audio).toBe('audio only more');
expect(result.text).toBe('audio only more');
});

it('when only one side has a text variant, the other contributes its audio', () => {
const a = new Instructions({ audio: 'audio A', text: 'text A' });
const b = new Instructions({ audio: 'audio B' });
const result = concatInstructions(a, ' ', b);
expect(isInstructions(result)).toBe(true);
if (!isInstructions(result)) return;
expect(result.audio).toBe('audio A audio B');
expect(result.text).toBe('text A audio B');
});

it('asModality returns a copy with both variants preserved', () => {
const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });

let resolved = instr.asModality('audio');
expect(resolved.value).toBe('audio instructions');
expect(resolved.audio).toBe('audio instructions');
expect(resolved.text).toBe('text instructions');

resolved = instr.asModality('text');
expect(resolved.value).toBe('text instructions');
expect(resolved.audio).toBe('audio instructions');
expect(resolved.text).toBe('text instructions');
});

it('can switch modality after a previous resolution', () => {
const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });
const resolvedText = instr.asModality('text');
const resolvedAudio = resolvedText.asModality('audio');
expect(resolvedAudio.value).toBe('audio instructions');
});

it('asModality on audio-only Instructions returns audio for both modalities', () => {
const audioOnly = new Instructions({ audio: 'audio only' });
expect(audioOnly.asModality('audio').value).toBe('audio only');
expect(audioOnly.asModality('text').value).toBe('audio only');
});

it('applyInstructionsModality rewrites the system message content', () => {
const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });
const ctx = new ChatContext([
ChatMessage.create({
id: INSTRUCTIONS_MESSAGE_ID,
role: 'system',
content: [instr],
}),
]);

applyInstructionsModality(ctx, { modality: 'audio' });
let content = (ctx.items[0]! as ChatMessage).content[0]!;
expect(isInstructions(content) ? content.value : '').toBe('audio instructions');

applyInstructionsModality(ctx, { modality: 'text' });
content = (ctx.items[0]! as ChatMessage).content[0]!;
expect(isInstructions(content) ? content.value : '').toBe('text instructions');
});

it('applyInstructionsModality is a no-op when content has no Instructions', () => {
const ctx = new ChatContext([
ChatMessage.create({
id: INSTRUCTIONS_MESSAGE_ID,
role: 'system',
content: ['plain string instructions'],
}),
]);
const before = (ctx.items[0]! as ChatMessage).content[0];
applyInstructionsModality(ctx, { modality: 'text' });
expect((ctx.items[0]! as ChatMessage).content[0]).toBe(before);
});

it('survives copy and lets a different modality be applied to the copy', () => {
const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });
const baseCtx = new ChatContext([
ChatMessage.create({
id: INSTRUCTIONS_MESSAGE_ID,
role: 'system',
content: [instr],
}),
]);
const turn1 = baseCtx.copy();
applyInstructionsModality(turn1, { modality: 'text' });
const turn2 = turn1.copy();
applyInstructionsModality(turn2, { modality: 'audio' });

const turn2Content = (turn2.items[0]! as ChatMessage).content[0]!;
expect(isInstructions(turn2Content) ? turn2Content.value : '').toBe('audio instructions');

// base context content is untouched (was the original instr)
expect((baseCtx.items[0]! as ChatMessage).content[0]).toBe(instr);
});
});
Loading
Loading