diff --git a/.changeset/modality-aware-instructions.md b/.changeset/modality-aware-instructions.md
new file mode 100644
index 000000000..17de7df2a
--- /dev/null
+++ b/.changeset/modality-aware-instructions.md
@@ -0,0 +1,12 @@
+---
+'@livekit/agents': patch
+---
+
+feat(agents): add modality-aware `Instructions` with audio/text variants
+
+Introduce a new `Instructions` class for system prompts that adapt to the
+user's input modality. The pipeline now applies the matching variant before
+each LLM turn based on `SpeechHandle.inputDetails.modality`, and
+`AgentSession.generateReply()` and `AgentSession.run()` expose an
+`inputModality` option. `Instructions.tpl` supports JS-native prompt
+composition while preserving audio/text variants.
diff --git a/agents/src/llm/chat_context.test.ts b/agents/src/llm/chat_context.test.ts
index 21f93620a..849469e55 100644
--- a/agents/src/llm/chat_context.test.ts
+++ b/agents/src/llm/chat_context.test.ts
@@ -3,6 +3,7 @@
 // SPDX-License-Identifier: Apache-2.0
 import { describe, expect, it } from 'vitest';
 import { initializeLogger } from '../log.js';
+import { INSTRUCTIONS_MESSAGE_ID, applyInstructionsModality } from '../voice/generation.js';
 import { FakeLLM } from '../voice/testing/fake_llm.js';
 import {
   type AudioContent,
@@ -12,7 +13,11 @@ import {
   FunctionCall,
   FunctionCallOutput,
   type ImageContent,
+  Instructions,
   ReadonlyChatContext,
+  concatInstructions,
+  isInstructions,
+  renderInstructions,
 } from './chat_context.js';
 
 initializeLogger({ pretty: false, level: 'error' });
@@ -434,7 +439,7 @@ describe('ChatContext._summarize', () => {
         item.extra?.is_summary !== true,
     );
     expect(rawTailMessages).toHaveLength(4);
-    expect(rawTailMessages.map((item) => item.textContent)).toEqual([
+    expect(rawTailMessages.map((item) => (item as ChatMessage).textContent)).toEqual([
       'Order #123',
       'Found your order. Let me check the warranty.',
       'Thanks.',
@@ -1235,3 +1240,242 @@ describe('ChatContext.isEquivalent', () => {
     });
   });
 });
+
+describe('Instructions', () => {
+  it('constructs from an object with audio and text variants', () => {
+    const instr = new Instructions({ audio: 'audio variant', text: 'text variant' });
+
+    expect(instr.audio).toBe('audio variant');
+    expect(instr.text).toBe('text variant');
+    expect(instr.value).toBe('audio variant'); // value defaults to the audio variant
+  });
+
+  it('identifies Instructions with a type guard', () => {
+    const instr = new Instructions({ audio: 'audio variant', text: 'text variant' });
+
+    expect(isInstructions(instr)).toBe(true);
+    expect(isInstructions('audio variant')).toBe(false);
+    expect(isInstructions({ type: 'instructions', audio: 'audio variant' })).toBe(false);
+  });
+
+  it('tpl propagates Instructions interpolations into audio and text variants', () => {
+    const instr = Instructions.tpl`persona
+${new Instructions({ audio: 'audio rules', text: 'text rules' })}
+extra`;
+
+    expect(instr).toBeInstanceOf(Instructions);
+    expect(instr.audio).toBe('persona\naudio rules\nextra');
+    expect(instr.text).toBe('persona\ntext rules\nextra');
+    expect(instr.value).toBe('persona\naudio rules\nextra');
+    expect(instr.asModality('text').value).toBe('persona\ntext rules\nextra');
+  });
+
+  it('tpl preserves audio-only interpolation as audio-only output', () => {
+    const instr = Instructions.tpl`prefix ${new Instructions({ audio: 'same' })} suffix`;
+
+    expect(instr.toJSON()).toEqual({ type: 'instructions', audio: 'prefix same suffix' });
+    expect(instr.audio).toBe('prefix same suffix');
+    expect(instr.text).toBe('prefix same suffix'); // getter falls back to audio
+  });
+
+  it('tpl interpolates primitive values into both variants', () => {
+    const instr = Instructions.tpl`date=${'2026-05-13'} enabled=${true} count=${3}`;
+
+    expect(instr.toJSON()).toEqual({
+      type: 'instructions',
+      audio: 'date=2026-05-13 enabled=true count=3',
+    });
+    expect(instr.audio).toBe('date=2026-05-13 enabled=true count=3');
+    expect(instr.text).toBe('date=2026-05-13 enabled=true count=3');
+    expect(instr.value).toBe('date=2026-05-13 enabled=true count=3');
+  });
+
+  it('tpl combines multiple modality-aware interpolations', () => {
+    const instr = Instructions.tpl`${new Instructions({ audio: 'audio A', text: 'text A' })} / ${new Instructions({ audio: 'audio B', text: 'text B' })}`;
+
+    expect(instr.audio).toBe('audio A / audio B');
+    expect(instr.text).toBe('text A / text B');
+    expect(instr.value).toBe('audio A / audio B');
+  });
+
+  it('tpl preserves the current rendered value of resolved interpolations', () => {
+    const resolved = new Instructions({ audio: 'audio rules', text: 'text rules' }).asModality(
+      'text',
+    );
+    const instr = Instructions.tpl`prefix ${resolved} suffix`;
+
+    expect(instr.audio).toBe('prefix audio rules suffix');
+    expect(instr.text).toBe('prefix text rules suffix');
+    expect(instr.value).toBe('prefix text rules suffix'); // follows `resolved.value`
+  });
+
+  it('tpl stringifies null and undefined values like template literals', () => {
+    const instr = Instructions.tpl`null=${null} undefined=${undefined}`;
+
+    expect(instr.toJSON()).toEqual({
+      type: 'instructions',
+      audio: 'null=null undefined=undefined',
+    });
+    expect(instr.audio).toBe('null=null undefined=undefined');
+    expect(instr.text).toBe('null=null undefined=undefined');
+  });
+
+  it('serializes to a dict with both variants and round-trips through toJSON', () => {
+    const instr = new Instructions({ audio: 'audio variant', text: 'text variant' });
+
+    const ctx = new ChatContext([ChatMessage.create({ role: 'system', content: [instr] })]);
+    const data = ctx.toJSON();
+    const items = (data.items as Record<string, unknown>[])!;
+    const content = (items[0]!.content as Record<string, unknown>[])![0]!;
+
+    expect(content).toEqual({
+      type: 'instructions',
+      audio: 'audio variant',
+      text: 'text variant',
+    });
+  });
+
+  it('omits the text key in toJSON when only audio variant is provided', () => {
+    const instr = new Instructions({ audio: 'audio only' });
+    expect(instr.toJSON()).toEqual({ type: 'instructions', audio: 'audio only' });
+  });
+
+  it('falls back text -> audio when no text variant is provided', () => {
+    const instr = new Instructions({ audio: 'audio only' });
+    expect(instr.audio).toBe('audio only');
+    expect(instr.text).toBe('audio only');
+    expect(instr.value).toBe('audio only');
+  });
+
+  it('renderInstructions returns strings and resolved Instructions values explicitly', () => {
+    const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });
+
+    expect(renderInstructions('plain instructions')).toBe('plain instructions');
+    expect(renderInstructions(instr)).toBe('audio instructions');
+    expect(renderInstructions(instr, 'audio')).toBe('audio instructions');
+    expect(renderInstructions(instr, 'text')).toBe('text instructions');
+  });
+
+  it('concatenates two Instructions, propagating both variants', () => {
+    const a = new Instructions({ audio: 'audio A', text: 'text A' });
+    const b = new Instructions({ audio: 'audio B', text: 'text B' });
+    const result = a.concat(b);
+    expect(result).toBeInstanceOf(Instructions);
+    expect(result.audio).toBe('audio Aaudio B'); // concat inserts no separator
+    expect(result.text).toBe('text Atext B');
+  });
+
+  it('concatenates Instructions + string, propagating both variants', () => {
+    const instr = new Instructions({ audio: 'audio', text: 'text' });
+    const result = instr.concat(' suffix');
+    expect(result.audio).toBe('audio suffix');
+    expect(result.text).toBe('text suffix');
+  });
+
+  it('concatInstructions handles string + Instructions (radd-style)', () => {
+    const instr = new Instructions({ audio: 'audio', text: 'text' });
+    const result = concatInstructions('prefix ', instr);
+    expect(isInstructions(result)).toBe(true);
+    if (!isInstructions(result)) return; // narrows `string | Instructions` for the checks below
+    expect(result.audio).toBe('prefix audio');
+    expect(result.text).toBe('prefix text');
+  });
+
+  it('preserves text=undefined when concatenating an audio-only instructions', () => {
+    const audioOnly = new Instructions({ audio: 'audio only' });
+    const result = audioOnly.concat(' more');
+    expect(result.toJSON()).toEqual({ type: 'instructions', audio: 'audio only more' });
+    expect(result.audio).toBe('audio only more');
+    expect(result.text).toBe('audio only more');
+  });
+
+  it('when only one side has a text variant, the other contributes its audio', () => {
+    const a = new Instructions({ audio: 'audio A', text: 'text A' });
+    const b = new Instructions({ audio: 'audio B' });
+    const result = concatInstructions(a, ' ', b);
+    expect(isInstructions(result)).toBe(true);
+    if (!isInstructions(result)) return; // narrows `string | Instructions` for the checks below
+    expect(result.audio).toBe('audio A audio B');
+    expect(result.text).toBe('text A audio B'); // `b` has no text variant
+  });
+
+  it('asModality returns a copy with both variants preserved', () => {
+    const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });
+
+    let resolved = instr.asModality('audio');
+    expect(resolved.value).toBe('audio instructions');
+    expect(resolved.audio).toBe('audio instructions');
+    expect(resolved.text).toBe('text instructions');
+
+    resolved = instr.asModality('text');
+    expect(resolved.value).toBe('text instructions');
+    expect(resolved.audio).toBe('audio instructions');
+    expect(resolved.text).toBe('text instructions');
+  });
+
+  it('can switch modality after a previous resolution', () => {
+    const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });
+    const resolvedText = instr.asModality('text');
+    const resolvedAudio = resolvedText.asModality('audio');
+    expect(resolvedAudio.value).toBe('audio instructions');
+  });
+
+  it('asModality on audio-only Instructions returns audio for both modalities', () => {
+    const audioOnly = new Instructions({ audio: 'audio only' });
+    expect(audioOnly.asModality('audio').value).toBe('audio only');
+    expect(audioOnly.asModality('text').value).toBe('audio only');
+  });
+
+  it('applyInstructionsModality rewrites the system message content', () => {
+    const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });
+    const ctx = new ChatContext([
+      ChatMessage.create({
+        id: INSTRUCTIONS_MESSAGE_ID,
+        role: 'system',
+        content: [instr],
+      }),
+    ]);
+
+    applyInstructionsModality(ctx, { modality: 'audio' });
+    let content = (ctx.items[0]! as ChatMessage).content[0]!;
+    expect(isInstructions(content) ? content.value : '').toBe('audio instructions');
+
+    applyInstructionsModality(ctx, { modality: 'text' });
+    content = (ctx.items[0]! as ChatMessage).content[0]!;
+    expect(isInstructions(content) ? content.value : '').toBe('text instructions');
+  });
+
+  it('applyInstructionsModality is a no-op when content has no Instructions', () => {
+    const ctx = new ChatContext([
+      ChatMessage.create({
+        id: INSTRUCTIONS_MESSAGE_ID,
+        role: 'system',
+        content: ['plain string instructions'],
+      }),
+    ]);
+    const before = (ctx.items[0]! as ChatMessage).content[0];
+    applyInstructionsModality(ctx, { modality: 'text' });
+    expect((ctx.items[0]! as ChatMessage).content[0]).toBe(before);
+  });
+
+  it('survives copy and lets a different modality be applied to the copy', () => {
+    const instr = new Instructions({ audio: 'audio instructions', text: 'text instructions' });
+    const baseCtx = new ChatContext([
+      ChatMessage.create({
+        id: INSTRUCTIONS_MESSAGE_ID,
+        role: 'system',
+        content: [instr],
+      }),
+    ]);
+    const turn1 = baseCtx.copy();
+    applyInstructionsModality(turn1, { modality: 'text' });
+    const turn2 = turn1.copy();
+    applyInstructionsModality(turn2, { modality: 'audio' });
+
+    const turn2Content = (turn2.items[0]! as ChatMessage).content[0]!;
+    expect(isInstructions(turn2Content) ? turn2Content.value : '').toBe('audio instructions');
+
+    // the copies absorbed the rewrites: the base context still holds the very same `instr` object
+    expect((baseCtx.items[0]! as ChatMessage).content[0]).toBe(instr);
+  });
+});
diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts
index 996cd12f2..743e7efb8 100644
--- a/agents/src/llm/chat_context.ts
+++ b/agents/src/llm/chat_context.ts
@@ -37,7 +37,197 @@ export interface AudioContent {
   transcript?: string;
 }
 
-export type ChatContent = ImageContent | AudioContent | string;
+type InstructionsOptions = {
+  /** Audio (voice) variant; required, and the fallback for the two optional fields. */
+  audio: string;
+  /** Text variant; when omitted, `Instructions.text` falls back to `audio`. */
+  text?: string;
+  /** String exposed through `value`/`toString()`; the constructor defaults it to `audio`. */
+  represent?: string;
+};
+
+/** Brand key from the global symbol registry (`Symbol.for`), stamped on every instance. */
+const INSTRUCTIONS_SYMBOL = Symbol.for('livekit.agents.Instructions');
+
+/** Type guard: true only for non-null objects whose brand property is exactly `true`. */
+export function isInstructions(value: unknown): value is Instructions {
+  if (typeof value !== 'object' || value === null) return false;
+  // A missing brand reads as `undefined`, which fails the strict comparison below.
+  const brand = (value as Record<symbol, unknown>)[INSTRUCTIONS_SYMBOL];
+  return brand === true;
+}
+
+/**
+ * Instructions that adapt based on the user's input modality (audio vs. text).
+ *
+ * The `value` property is the rendered string providers see. By default it
+ * equals the `audio` variant; after {@link asModality} it equals the chosen
+ * variant. Both the `audio` variant and the raw `text` variant are preserved
+ * so {@link asModality} can be called again for a different modality (e.g.,
+ * when the same `ChatContext` is reused across tool-call turns).
+ */
+export class Instructions {
+  /** Discriminant tag; {@link toJSON} emits the same literal. */
+  readonly type = 'instructions' as const;
+
+  /** Audio (voice) variant; always set. */
+  private readonly _audioVariant: string;
+
+  /** Raw text variant; when `undefined`, {@link text} falls back to {@link audio}. */
+  private readonly _textVariant?: string;
+
+  /** The currently rendered string; assigned once in the constructor. */
+  readonly value: string;
+
+  /** @internal Brand checked by `isInstructions`. */
+  readonly [INSTRUCTIONS_SYMBOL] = true;
+
+  /** Stores both variants; `value` is `represent` when given, else the audio variant. */
+  constructor(options: InstructionsOptions) {
+    const { audio, text, represent } = options;
+    this._audioVariant = audio;
+    this._textVariant = text;
+    this.value = represent ?? audio;
+  }
+
+  /**
+   * Tagged-template builder. Interpolated {@link Instructions} contribute their
+   * audio variant, text variant and current value to the matching output;
+   * every other value is stringified with `String()`, so `null` and `undefined`
+   * render as the words "null" and "undefined". The result has an explicit text
+   * variant only when some interpolation has one.
+   */
+  static tpl(
+    strings: TemplateStringsArray,
+    ...values: Array<Instructions | string | number | boolean | null | undefined>
+  ): Instructions {
+    // `pick` chooses which facet of an interpolated Instructions feeds this pass.
+    const render = (pick: (part: Instructions) => string): string =>
+      values.reduce<string>(
+        (out, v, i) => out + (isInstructions(v) ? pick(v) : String(v)) + strings[i + 1]!,
+        strings[0]!,
+      );
+    const hasTextVariant = values.some((v) => isInstructions(v) && v._textVariant !== undefined);
+
+    return new Instructions({
+      audio: render((part) => part.audio),
+      text: hasTextVariant ? render((part) => part.text) : undefined,
+      represent: render((part) => part.value),
+    });
+  }
+
+  /** Audio (voice) variant. */
+  get audio(): string {
+    return this._audioVariant;
+  }
+
+  /** Text variant, or {@link audio} when no text variant was supplied. */
+  get text(): string {
+    return this._textVariant ?? this.audio;
+  }
+
+  /**
+   * Build a new instance rendered for `modality`: its {@link value} is
+   * {@link audio} for `'audio'` and {@link text} otherwise. The raw variants
+   * are carried over unchanged, so the result can be re-rendered later; this
+   * instance is left untouched.
+   */
+  asModality(modality: 'audio' | 'text'): Instructions {
+    const rendered = modality === 'audio' ? this.audio : this.text;
+    return new Instructions({ audio: this.audio, text: this._textVariant, represent: rendered });
+  }
+
+  /**
+   * Append `other` to both variants and to the rendered value. A plain string
+   * is appended to each stored variant; an {@link Instructions} contributes its
+   * own matching piece. The result has an explicit text variant only when at
+   * least one operand has one.
+   */
+  concat(other: string | Instructions): Instructions {
+    if (!isInstructions(other)) {
+      const text = this._textVariant === undefined ? undefined : this._textVariant + other;
+      return new Instructions({ audio: this.audio + other, text, represent: this.value + other });
+    }
+
+    // The `text` getters supply the audio fallback for an operand without a text variant.
+    const eitherHasText = this._textVariant !== undefined || other._textVariant !== undefined;
+    return new Instructions({
+      audio: this.audio + other.audio,
+      text: eitherHasText ? this.text + other.text : undefined,
+      represent: this.value + other.value,
+    });
+  }
+
+  /** String coercion yields the currently rendered {@link value}. */
+  toString(): string {
+    return this.value;
+  }
+
+  /** Serialize the raw variants only; `text` is omitted unless explicitly set. */
+  toJSON(): { type: 'instructions'; audio: string; text?: string } {
+    // Key order stays type, audio, text; the rendered `value` is not serialized.
+    const base = { type: 'instructions' as const, audio: this.audio };
+    return this._textVariant === undefined ? base : { ...base, text: this._textVariant };
+  }
+}
+
+export function renderInstructions(
+  instructions: string | Instructions,
+  modality?: 'audio' | 'text', // omit to keep whatever `instructions.value` already is
+): string {
+  if (typeof instructions === 'string') return instructions; // plain strings have no variants
+  return (modality === undefined ? instructions : instructions.asModality(modality)).value;
+}
+
+/**
+ * Compare two instruction values by content.
+ *
+ * - Identical references, equal strings, or two `undefined`s are equal.
+ * - Two {@link Instructions} are equal when their `audio` and `text` getters
+ *   match, so an omitted text variant equals an explicit one that repeats the
+ *   audio string; the rendered `value` is not compared.
+ * - Any other pairing (string vs. {@link Instructions}, defined vs.
+ *   `undefined`, differing strings) is unequal.
+ */
+export function instructionsEqual(
+  a: string | Instructions | undefined,
+  b: string | Instructions | undefined,
+): boolean {
+  // Covers same-reference Instructions, equal strings and undefined/undefined.
+  if (a === b) return true;
+  // Strings that reach this point already differ, so only two Instructions
+  // instances can still turn out equal.
+  if (!isInstructions(a) || !isInstructions(b)) return false;
+  return a.audio === b.audio && a.text === b.text;
+}
+
+/**
+ * Join strings and {@link Instructions} left to right.
+ *
+ * @param parts - Pieces to join, in order; may be empty.
+ * @returns `''` for no parts, a plain string when every part is a string, and
+ *   otherwise an {@link Instructions} in which each string part appears in
+ *   every variant and each {@link Instructions} part contributes its matching
+ *   variant (its audio variant standing in for a missing text one).
+ *
+ * @example
+ * concatInstructions('Be brief. ', new Instructions({ audio: 'Speak.', text: 'Write.' }));
+ * // audio: 'Be brief. Speak.'   text: 'Be brief. Write.'
+ */
+export function concatInstructions(...parts: Array<string | Instructions>): string | Instructions {
+  if (!parts.some((part) => isInstructions(part))) return parts.join('');
+
+  // At least one Instructions is present, so `parts` is non-empty and `head` is defined.
+  const [head, ...tail] = parts;
+  return tail.reduce<string | Instructions>((acc, next) => {
+    if (isInstructions(acc)) return acc.concat(next);
+    // string + Instructions: lift the accumulated prefix so it is prepended to every variant.
+    if (isInstructions(next)) return new Instructions({ audio: acc }).concat(next);
+    return acc + next;
+  }, head!);
+}
+
+export type ChatContent = ImageContent | AudioContent | Instructions | string;
 
 export function createImageContent(params: {
   image: string | VideoFrame;
@@ -171,7 +361,9 @@ export class ChatMessage {
    * lines. If no string content is present, returns `null`.
    */
   get textContent(): string | undefined {
-    const parts = this.content.filter((c): c is string => typeof c === 'string');
+    const parts = this.content
+      .filter((c): c is string | Instructions => typeof c === 'string' || isInstructions(c))
+      .map((c) => (typeof c === 'string' ? c : c.value));
     return parts.length > 0 ? parts.join('\n') : undefined;
   }
 
@@ -179,6 +371,8 @@ export class ChatMessage {
     return this.content.map((c) => {
       if (typeof c === 'string') {
         return c as JSONValue;
+      } else if (isInstructions(c)) {
+        return c.toJSON() as JSONValue;
       } else if (c.type === 'image_content') {
         return {
           id: c.id,
@@ -456,7 +650,7 @@ export class AgentConfigUpdate {
 
   readonly type = 'agent_config_update' as const;
 
-  instructions?: string;
+  instructions?: string | Instructions;
 
   toolsAdded?: string[];
 
@@ -467,7 +661,7 @@ export class AgentConfigUpdate {
   constructor(
     params: {
       id?: string;
-      instructions?: string;
+      instructions?: string | Instructions;
       toolsAdded?: string[];
       toolsRemoved?: string[];
       createdAt?: number;
@@ -489,7 +683,7 @@ export class AgentConfigUpdate {
 
   static create(params: {
     id?: string;
-    instructions?: string;
+    instructions?: string | Instructions;
     toolsAdded?: string[];
     toolsRemoved?: string[];
     createdAt?: number;
@@ -504,7 +698,9 @@ export class AgentConfigUpdate {
     };
 
     if (this.instructions !== undefined) {
-      result.instructions = this.instructions;
+      result.instructions = isInstructions(this.instructions)
+        ? (this.instructions.toJSON() as JSONValue)
+        : this.instructions;
     }
     if (this.toolsAdded !== undefined) {
       result.toolsAdded = this.toolsAdded;
@@ -891,6 +1087,21 @@ export class ChatContext {
         return false;
       }
 
+      if (isInstructions(contentA) && isInstructions(contentB)) {
+        if (
+          contentA.audio !== contentB.audio ||
+          contentA.text !== contentB.text ||
+          contentA.value !== contentB.value
+        ) {
+          return false;
+        }
+        continue;
+      }
+
+      if (isInstructions(contentA) || isInstructions(contentB)) {
+        return false;
+      }
+
       if (typeof contentA === 'object' && typeof contentB === 'object') {
         if (contentA.type === 'image_content' && contentB.type === 'image_content') {
           if (
diff --git a/agents/src/llm/index.ts b/agents/src/llm/index.ts
index ae6f75b9c..950296c9c 100644
--- a/agents/src/llm/index.ts
+++ b/agents/src/llm/index.ts
@@ -22,6 +22,8 @@ export {
   AgentConfigUpdate,
   ChatContext,
   ChatMessage,
+  Instructions,
+  concatInstructions,
   createAudioContent,
   createImageContent,
   FunctionCall,
diff --git a/agents/src/llm/provider_format/google.test.ts b/agents/src/llm/provider_format/google.test.ts
index a86d8460b..c59e52045 100644
--- a/agents/src/llm/provider_format/google.test.ts
+++ b/agents/src/llm/provider_format/google.test.ts
@@ -9,6 +9,7 @@ import {
   ChatContext,
   FunctionCall,
   FunctionCallOutput,
+  Instructions,
 } from '../chat_context.js';
 import { serializeImage } from '../utils.js';
 import { toChatCtx } from './google.js';
@@ -62,6 +63,23 @@ describe('Google Provider Format - toChatCtx', () => {
     expect(formatData.systemMessages).toEqual(['You are a helpful assistant']);
   });
 
+  it('should render Instructions as their resolved value', async () => {
+    const ctx = ChatContext.empty();
+    ctx.addMessage({
+      role: 'system',
+      content: [
+        new Instructions({ audio: 'audio instructions', text: 'text instructions' }).asModality(
+          'text',
+        ),
+      ],
+    });
+    ctx.addMessage({ role: 'user', content: 'Hello' });
+
+    const [, formatData] = await toChatCtx(ctx, false);
+
+    expect(formatData.systemMessages).toEqual(['text instructions']);
+  });
+
   it('should handle multiple system messages', async () => {
     const ctx = ChatContext.empty();
     ctx.addMessage({ role: 'system', content: 'You are a helpful assistant' });
diff --git a/agents/src/llm/provider_format/google.ts b/agents/src/llm/provider_format/google.ts
index 45c68295d..fbfdc4d01 100644
--- a/agents/src/llm/provider_format/google.ts
+++ b/agents/src/llm/provider_format/google.ts
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import type { ChatContext, ChatItem, ImageContent } from '../chat_context.js';
+import { isInstructions } from '../chat_context.js';
 import { type SerializedImage, serializeImage } from '../utils.js';
 import { groupToolCalls } from './utils.js';
 
@@ -57,6 +58,8 @@ export async function toChatCtx(
       for (const content of msg.content) {
         if (content && typeof content === 'string') {
           parts.push({ text: content });
+        } else if (isInstructions(content)) {
+          parts.push({ text: content.value });
         } else if (content && typeof content === 'object') {
           if (content.type === 'image_content') {
             parts.push(await toImagePart(content));
diff --git a/agents/src/llm/provider_format/mistralai.test.ts b/agents/src/llm/provider_format/mistralai.test.ts
index 098195357..db7387637 100644
--- a/agents/src/llm/provider_format/mistralai.test.ts
+++ b/agents/src/llm/provider_format/mistralai.test.ts
@@ -8,6 +8,7 @@ import {
   ChatContext,
   FunctionCall,
   FunctionCallOutput,
+  Instructions,
 } from '../chat_context.js';
 import { toChatCtx } from './mistralai.js';
 
@@ -39,6 +40,23 @@ describe('Mistral Provider Format - toChatCtx', () => {
     expect(formatData.instructions).toBe('You are a helpful assistant');
   });
 
+  it('should render Instructions as their resolved value', () => {
+    const ctx = ChatContext.empty();
+    ctx.addMessage({
+      role: 'system',
+      content: [
+        new Instructions({ audio: 'audio instructions', text: 'text instructions' }).asModality(
+          'text',
+        ),
+      ],
+    });
+    ctx.addMessage({ role: 'user', content: 'Hello' });
+
+    const [, formatData] = toChatCtx(ctx);
+
+    expect(formatData.instructions).toBe('text instructions');
+  });
+
   it('should extract developer messages as instructions', () => {
     const ctx = ChatContext.empty();
     ctx.addMessage({ role: 'developer', content: 'Be concise' });
diff --git a/agents/src/llm/provider_format/mistralai.ts b/agents/src/llm/provider_format/mistralai.ts
index 15234a5cd..776be3085 100644
--- a/agents/src/llm/provider_format/mistralai.ts
+++ b/agents/src/llm/provider_format/mistralai.ts
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import type { ChatContext } from '../chat_context.js';
+import { isInstructions } from '../chat_context.js';
 
 export interface MistralFormatData {
   instructions: string;
@@ -27,7 +28,10 @@ export function toChatCtx(
 
   for (const item of chatCtx.items) {
     if (item.type === 'message') {
-      const text = item.content.filter((c): c is string => typeof c === 'string').join('\n');
+      const text = item.content
+        .filter((c) => typeof c === 'string' || isInstructions(c))
+        .map((c) => (typeof c === 'string' ? c : c.value))
+        .join('\n');
 
       if (item.role === 'system' || item.role === 'developer') {
         instructionParts.push(text);
diff --git a/agents/src/llm/provider_format/openai.test.ts b/agents/src/llm/provider_format/openai.test.ts
index 97cb5dcca..6a2a8aed9 100644
--- a/agents/src/llm/provider_format/openai.test.ts
+++ b/agents/src/llm/provider_format/openai.test.ts
@@ -9,6 +9,7 @@ import {
   ChatContext,
   FunctionCall,
   FunctionCallOutput,
+  Instructions,
 } from '../chat_context.js';
 import { serializeImage } from '../utils.js';
 import { toChatCtx, toResponsesChatCtx } from './openai.js';
@@ -54,6 +55,23 @@ describe('toChatCtx', () => {
     expect(result[1]).toEqual({ role: 'user', content: 'Hello' });
   });
 
+  it('should render Instructions as their resolved value', async () => {
+    const ctx = ChatContext.empty();
+    ctx.addMessage({
+      role: 'system',
+      content: [
+        new Instructions({ audio: 'audio instructions', text: 'text instructions' }).asModality(
+          'text',
+        ),
+      ],
+    });
+    ctx.addMessage({ role: 'user', content: 'Hello' });
+
+    const result = await toChatCtx(ctx);
+
+    expect(result[0]).toEqual({ role: 'system', content: 'text instructions' });
+  });
+
   it('should handle multi-line text content', async () => {
     const ctx = ChatContext.empty();
     ctx.addMessage({ role: 'user', content: ['Line 1', 'Line 2', 'Line 3'] });
@@ -707,6 +725,23 @@ describe('toResponsesChatCtx', () => {
     expect(result[1]).toEqual({ role: 'user', content: 'Hello' });
   });
 
+  it('should render Instructions as their resolved value', async () => {
+    const ctx = ChatContext.empty();
+    ctx.addMessage({
+      role: 'system',
+      content: [
+        new Instructions({ audio: 'audio instructions', text: 'text instructions' }).asModality(
+          'text',
+        ),
+      ],
+    });
+    ctx.addMessage({ role: 'user', content: 'Hello' });
+
+    const result = await toResponsesChatCtx(ctx);
+
+    expect(result[0]).toEqual({ role: 'system', content: 'text instructions' });
+  });
+
   it('should handle multi-line text content', async () => {
     const ctx = ChatContext.empty();
     ctx.addMessage({ role: 'user', content: ['Line 1', 'Line 2', 'Line 3'] });
diff --git a/agents/src/llm/provider_format/openai.ts b/agents/src/llm/provider_format/openai.ts
index 9d61e169d..9a67c0f31 100644
--- a/agents/src/llm/provider_format/openai.ts
+++ b/agents/src/llm/provider_format/openai.ts
@@ -2,6 +2,7 @@
 //
 // SPDX-License-Identifier: Apache-2.0
 import type { ChatContext, ChatItem, ImageContent } from '../chat_context.js';
+import { isInstructions } from '../chat_context.js';
 import { type SerializedImage, serializeImage } from '../utils.js';
 import { groupToolCalls } from './utils.js';
 
@@ -69,6 +70,9 @@ async function toChatItem(item: ChatItem) {
       if (typeof content === 'string') {
         if (textContent) textContent += '\n';
         textContent += content;
+      } else if (isInstructions(content)) {
+        if (textContent) textContent += '\n';
+        textContent += content.value;
       } else if (content.type === 'image_content') {
         listContent.push(await toImageContent(content));
       } else {
@@ -229,6 +233,9 @@ async function toResponsesChatItem(item: ChatItem) {
       if (typeof content === 'string') {
         if (textContent) textContent += '\n';
         textContent += content;
+      } else if (isInstructions(content)) {
+        if (textContent) textContent += '\n';
+        textContent += content.value;
       } else if (content.type === 'image_content') {
         listContent.push(await toResponsesImageContent(content));
       } else {
diff --git a/agents/src/llm/utils.ts b/agents/src/llm/utils.ts
index c5dbcd40c..0271deb2a 100644
--- a/agents/src/llm/utils.ts
+++ b/agents/src/llm/utils.ts
@@ -364,6 +364,10 @@ export function validateChatContextStructure(chatCtx: ChatContext): ChatContextV
           return;
         }
 
+        if (term.type === 'instructions') {
+          return;
+        }
+
         if (term.type === 'image_content') {
           if (!term.id || term.image === undefined || term.image === null) {
             pushIssue({
@@ -514,6 +518,10 @@ function formatMessageContentPart(part: ChatContent): string {
     return part;
   }
 
+  if (part.type === 'instructions') {
+    return part.value;
+  }
+
   if (part.type === 'image_content') {
     if (typeof part.image === 'string') {
       return `[image url=${truncateText(part.image, 120)}]`;
diff --git a/agents/src/telemetry/traces.ts b/agents/src/telemetry/traces.ts
index e2e4a35b9..166e4ff24 100644
--- a/agents/src/telemetry/traces.ts
+++ b/agents/src/telemetry/traces.ts
@@ -23,6 +23,7 @@ import { ATTR_SERVICE_NAME } from '@opentelemetry/semantic-conventions';
 import FormData from 'form-data';
 import { AccessToken } from 'livekit-server-sdk';
 import fs from 'node:fs/promises';
+import { isInstructions, renderInstructions } from '../llm/chat_context.js';
 import type { ChatContent, ChatItem, ChatRole } from '../llm/index.js';
 import { enableOtelLogging } from '../log.js';
 import { filterZeroValues } from '../metrics/model_usage.js';
@@ -373,7 +374,9 @@ function chatItemToProto(item: ChatItem): ProtoChatItem {
     const msg: ProtoMessage = {
       id: item.id,
       role: ROLE_MAP[item.role] ?? (item.role.toUpperCase() as ProtoRole),
-      content: item.content.map((c: ChatContent) => ({ text: c })),
+      content: item.content.map((c: ChatContent) => ({
+        text: isInstructions(c) ? c.value : c,
+      })),
       createdAt: toRFC3339(item.createdAt),
     };
 
@@ -456,7 +459,7 @@ function chatItemToProto(item: ChatItem): ProtoChatItem {
       createdAt: toRFC3339(item.createdAt),
     };
     if (item.instructions !== undefined) {
-      configUpdate.instructions = item.instructions;
+      configUpdate.instructions = renderInstructions(item.instructions);
     }
     if (item.toolsAdded !== undefined) {
       configUpdate.toolsAdded = item.toolsAdded;
diff --git a/agents/src/voice/agent.ts b/agents/src/voice/agent.ts
index 876170636..890d7ea7d 100644
--- a/agents/src/voice/agent.ts
+++ b/agents/src/voice/agent.ts
@@ -12,7 +12,7 @@ import {
   type STTModelString,
   type TTSModelString,
 } from '../inference/index.js';
-import { ReadonlyChatContext } from '../llm/chat_context.js';
+import { type Instructions, ReadonlyChatContext } from '../llm/chat_context.js';
 import type { ChatMessage, FunctionCall } from '../llm/index.js';
 import {
   type ChatChunk,
@@ -117,7 +117,7 @@ export type TTSPronunciationMap = Record<string, string>;
 
 export interface AgentOptions<UserData> {
   id?: string;
-  instructions: string;
+  instructions: string | Instructions;
   chatCtx?: ChatContext;
   tools?: ToolContext<UserData>;
   stt?: STT | STTModelString;
@@ -154,7 +154,7 @@ export class Agent<UserData = any> {
   _chatCtx: ChatContext;
 
   /** @internal */
-  _instructions: string;
+  _instructions: string | Instructions;
 
   /** @internal */
   _tools?: ToolContext<UserData>;
@@ -264,7 +264,7 @@ export class Agent<UserData = any> {
     return this._id;
   }
 
-  get instructions(): string {
+  get instructions(): string | Instructions {
     return this._instructions;
   }
 
diff --git a/agents/src/voice/agent_activity.ts b/agents/src/voice/agent_activity.ts
index d77197c19..e53f39b14 100644
--- a/agents/src/voice/agent_activity.ts
+++ b/agents/src/voice/agent_activity.ts
@@ -17,7 +17,11 @@ import {
   AgentConfigUpdate,
   type ChatContext,
   ChatMessage,
+  type Instructions,
   type MetricsReport,
+  concatInstructions,
+  instructionsEqual,
+  renderInstructions,
 } from '../llm/chat_context.js';
 import {
   type ChatItem,
@@ -94,6 +98,7 @@ import type { ToolExecutionOutput, ToolOutput, _TTSGenerationData } from './gene
 import {
   type _AudioOut,
   type _TextOut,
+  applyInstructionsModality,
   performAudioForwarding,
   performLLMInference,
   performTTSInference,
@@ -103,7 +108,7 @@ import {
   updateInstructions,
 } from './generation.js';
 import type { TimedString } from './io.js';
-import { SpeechHandle } from './speech_handle.js';
+import { type InputDetails, SpeechHandle } from './speech_handle.js';
 import { createEndpointing } from './turn_config/endpointing.js';
 import { setParticipantSpanAttributes } from './utils.js';
 
@@ -447,10 +452,12 @@ export class AgentActivity implements RecognitionHooks {
       // this means the content is the same as the previous session
       const capabilities = this.llm.capabilities;
       try {
-        await this.realtimeSession!._updateSession(
+        const realtimeInstructions =
           !rtReused || capabilities.midSessionInstructionsUpdate
-            ? this.agent.instructions
-            : undefined,
+            ? renderInstructions(this.agent.instructions)
+            : undefined;
+        await this.realtimeSession!._updateSession(
+          realtimeInstructions,
           !rtReused || capabilities.midSessionChatCtxUpdate ? this.agent.chatCtx : undefined,
           !rtReused || capabilities.midSessionToolsUpdate ? this.tools : undefined,
         );
@@ -597,7 +604,7 @@ export class AgentActivity implements RecognitionHooks {
         reusable =
           reusable &&
           (capabilities.midSessionInstructionsUpdate ||
-            this.agent.instructions === newActivity.agent.instructions);
+            instructionsEqual(this.agent.instructions, newActivity.agent.instructions));
 
         // tools update is supported or tools are the same
         reusable =
@@ -1406,6 +1413,7 @@ export class AgentActivity implements RecognitionHooks {
       userMessage,
       chatCtx,
       scheduleSpeech: false,
+      inputDetails: { modality: 'audio' },
     });
 
     this._preemptiveGeneration = {
@@ -1630,10 +1638,11 @@ export class AgentActivity implements RecognitionHooks {
   generateReply(options: {
     userMessage?: ChatMessage;
     chatCtx?: ChatContext;
-    instructions?: string;
+    instructions?: string | Instructions;
     toolChoice?: ToolChoice | null;
     allowInterruptions?: boolean;
     scheduleSpeech?: boolean;
+    inputDetails?: InputDetails;
   }): SpeechHandle {
     const {
       userMessage,
@@ -1642,9 +1651,10 @@ export class AgentActivity implements RecognitionHooks {
       toolChoice: defaultToolChoice,
       allowInterruptions: defaultAllowInterruptions,
       scheduleSpeech = true,
+      inputDetails,
     } = options;
 
-    let instructions = defaultInstructions;
+    let instructions: string | Instructions | undefined = defaultInstructions;
     let toolChoice = defaultToolChoice;
     let allowInterruptions = defaultAllowInterruptions;
 
@@ -1672,6 +1682,7 @@ export class AgentActivity implements RecognitionHooks {
 
     const handle = SpeechHandle.create({
       allowInterruptions: allowInterruptions ?? this.allowInterruptions,
+      inputDetails,
     });
 
     this.agentSession.emit(
@@ -1706,7 +1717,7 @@ export class AgentActivity implements RecognitionHooks {
       // this matches the behavior of the Realtime API:
       // https://platform.openai.com/docs/api-reference/realtime-client-events/response/create
       if (instructions) {
-        instructions = `${this.agent.instructions}\n${instructions}`;
+        instructions = concatInstructions(this.agent.instructions, '\n', instructions);
       }
 
       // Filter out tools with IGNORE_ON_ENTER flag when generateReply is called inside onEnter
@@ -1933,7 +1944,11 @@ export class AgentActivity implements RecognitionHooks {
     if (speechHandle === undefined) {
       // Ensure the new message is passed to generateReply
       // This preserves the original message id, making it easier for users to track responses
-      speechHandle = this.generateReply({ userMessage, chatCtx });
+      speechHandle = this.generateReply({
+        userMessage,
+        chatCtx,
+        inputDetails: { modality: 'audio' },
+      });
     }
 
     const eouMetrics: EOUMetrics = {
@@ -2140,7 +2155,7 @@ export class AgentActivity implements RecognitionHooks {
     toolCtx: ToolContext;
     modelSettings: ModelSettings;
     replyAbortController: AbortController;
-    instructions?: string;
+    instructions?: string | Instructions;
     newMessage?: ChatMessage;
     toolsMessages?: ChatItem[];
     span: Span;
@@ -2150,7 +2165,7 @@ export class AgentActivity implements RecognitionHooks {
 
     span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
     if (instructions) {
-      span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
+      span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, renderInstructions(instructions));
     }
     if (newMessage) {
       span.setAttribute(traceTypes.ATTR_USER_INPUT, newMessage.textContent || '');
@@ -2189,6 +2204,9 @@ export class AgentActivity implements RecognitionHooks {
       }
     }
 
+    // apply the correct variant of the instructions for the turn's input modality
+    applyInstructionsModality(chatCtx, { modality: speechHandle.inputDetails.modality });
+
     const tasks: Array<Task<void>> = [];
     const [llmTask, llmGenData] = performLLMInference(
       // preserve  `this` context in llmNode
@@ -2628,7 +2646,7 @@ export class AgentActivity implements RecognitionHooks {
     toolCtx: ToolContext,
     modelSettings: ModelSettings,
     replyAbortController: AbortController,
-    instructions?: string,
+    instructions?: string | Instructions,
     newMessage?: ChatMessage,
     toolsMessages?: ChatItem[],
     _previousUserMetrics?: MetricsReport,
@@ -3234,7 +3252,7 @@ export class AgentActivity implements RecognitionHooks {
     modelSettings: ModelSettings;
     abortController: AbortController;
     userInput?: string;
-    instructions?: string;
+    instructions?: string | Instructions;
   }): Promise<void> {
     speechHandleStorage.enterWith(speechHandle);
 
@@ -3261,7 +3279,11 @@ export class AgentActivity implements RecognitionHooks {
     }
 
     try {
-      const generationEvent = await this.realtimeSession.generateReply(instructions);
+      const generationEvent = await this.realtimeSession.generateReply(
+        instructions !== undefined
+          ? renderInstructions(instructions, speechHandle.inputDetails.modality)
+          : undefined,
+      );
       await this.realtimeGenerationTask(
         speechHandle,
         generationEvent,
diff --git a/agents/src/voice/agent_session.test.ts b/agents/src/voice/agent_session.test.ts
new file mode 100644
index 000000000..acb3c4a2a
--- /dev/null
+++ b/agents/src/voice/agent_session.test.ts
@@ -0,0 +1,24 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { describe, expect, it, vi } from 'vitest';
+import { AgentSession } from './agent_session.js';
+import { SpeechHandle } from './speech_handle.js';
+
+describe('AgentSession.run', () => {
+  it('forwards inputModality to generateReply', async () => {
+    // run() only reaches generateReply() after awaiting the activity lock, so the
+    // call is stubbed (no running activity is needed) and polled for with
+    // vi.waitFor instead of being asserted synchronously.
+    const agentSession = new AgentSession();
+    const replySpy = vi.spyOn(agentSession, 'generateReply');
+    replySpy.mockImplementation(() => SpeechHandle.create());
+    const expectedOptions = { userInput: 'hello', inputModality: 'audio' };
+
+    agentSession.run({ userInput: 'hello', inputModality: 'audio' });
+
+    await vi.waitFor(() => {
+      expect(replySpy).toHaveBeenCalledWith(expectedOptions);
+    });
+  });
+});
diff --git a/agents/src/voice/agent_session.ts b/agents/src/voice/agent_session.ts
index d5382562d..a495e97c7 100644
--- a/agents/src/voice/agent_session.ts
+++ b/agents/src/voice/agent_session.ts
@@ -22,7 +22,12 @@ import type { InterruptionDetectionError } from '../inference/interruption/error
 import type { OverlappingSpeechEvent } from '../inference/interruption/types.js';
 import { getJobContext } from '../job.js';
 import type { FunctionCall, FunctionCallOutput } from '../llm/chat_context.js';
-import { AgentHandoffItem, ChatContext, ChatMessage } from '../llm/chat_context.js';
+import {
+  AgentHandoffItem,
+  ChatContext,
+  ChatMessage,
+  type Instructions,
+} from '../llm/chat_context.js';
 import type { LLM, RealtimeModel, RealtimeModelError, ToolChoice } from '../llm/index.js';
 import type { LLMError } from '../llm/llm.js';
 import { log } from '../log.js';
@@ -752,9 +757,11 @@ export class AgentSession<
   generateReply(options?: {
     userInput?: string | ChatMessage;
     chatCtx?: ChatContext;
-    instructions?: string;
+    instructions?: string | Instructions;
     toolChoice?: ToolChoice;
     allowInterruptions?: boolean;
+    /** The input modality used for generating the reply. Defaults to `"text"`. */
+    inputModality?: 'audio' | 'text';
   }): SpeechHandle {
     if (!this.activity) {
       throw new Error('AgentSession is not running');
@@ -770,18 +777,20 @@ export class AgentSession<
             })
           : undefined;
 
+    const inputDetails = { modality: options?.inputModality ?? 'text' } as const;
+
     const doGenerateReply = (activity: AgentActivity, nextActivity?: AgentActivity) => {
       if (activity.schedulingPaused) {
         if (!nextActivity) {
           throw new Error('AgentSession is closing, cannot use generateReply()');
         }
-        return nextActivity.generateReply({ userMessage, ...options });
+        return nextActivity.generateReply({ userMessage, ...options, inputDetails });
       }
 
       // Handoff can race with scheduling pause between the check above and generateReply().
       // If that happens, retry on the next activity instead of surfacing an avoidable error.
       try {
-        return activity.generateReply({ userMessage, ...options });
+        return activity.generateReply({ userMessage, ...options, inputDetails });
       } catch (error) {
         const canFallback = nextActivity !== undefined && isSchedulingPausedError(error);
         if (!canFallback) {
@@ -791,7 +800,7 @@ export class AgentSession<
           { error },
           'generateReply scheduling raced with handoff drain; retrying on next activity',
         );
-        return nextActivity.generateReply({ userMessage, ...options });
+        return nextActivity.generateReply({ userMessage, ...options, inputDetails });
       }
     };
 
@@ -831,9 +840,11 @@ export class AgentSession<
    */
   run<T = unknown>({
     userInput,
+    inputModality,
     outputType,
   }: {
     userInput: string;
+    inputModality?: 'audio' | 'text';
     outputType?: z.ZodType<T>;
   }): RunResult<T> {
     if (this._globalRunState && !this._globalRunState.done()) {
@@ -857,7 +868,7 @@ export class AgentSession<
       try {
         const unlock = await this.activityLock.lock();
         unlock();
-        this.generateReply({ userInput });
+        this.generateReply({ userInput, inputModality });
       } catch (e) {
         runState._reject(asError(e));
       }
diff --git a/agents/src/voice/generation.ts b/agents/src/voice/generation.ts
index bc06f368e..3d938abdc 100644
--- a/agents/src/voice/generation.ts
+++ b/agents/src/voice/generation.ts
@@ -7,11 +7,13 @@ import { ThrowsPromise } from '@livekit/throws-transformer/throws';
 import type { Span } from '@opentelemetry/api';
 import { context as otelContext } from '@opentelemetry/api';
 import type { ReadableStream, ReadableStreamDefaultReader } from 'stream/web';
+import type { Instructions } from '../llm/chat_context.js';
 import {
   type ChatContext,
   ChatMessage,
   FunctionCall,
   FunctionCallOutput,
+  isInstructions,
 } from '../llm/chat_context.js';
 import type { ChatChunk } from '../llm/llm.js';
 import {
@@ -380,7 +382,7 @@ export function createToolOutput(params: {
   });
 }
 
-const INSTRUCTIONS_MESSAGE_ID = 'lk.agent_task.instructions';
+export const INSTRUCTIONS_MESSAGE_ID = 'lk.agent_task.instructions';
 
 /**
  * Update the instruction message in the chat context or insert a new one if missing.
@@ -395,7 +397,7 @@ const INSTRUCTIONS_MESSAGE_ID = 'lk.agent_task.instructions';
  */
 export function updateInstructions(options: {
   chatCtx: ChatContext;
-  instructions: string;
+  instructions: string | Instructions;
   addIfMissing: boolean;
 }) {
   const { chatCtx, instructions, addIfMissing } = options;
@@ -425,6 +427,43 @@ export function updateInstructions(options: {
   }
 }
 
+/**
+ * Swap modality-aware instructions for the variant matching the current turn.
+ *
+ * Looks up the message stored under {@link INSTRUCTIONS_MESSAGE_ID}. When that
+ * message holds at least one {@link Instructions} part, the message is rebuilt
+ * with every such part replaced by `asModality(modality)`; all other parts and
+ * message fields are carried over. Otherwise the context is left untouched.
+ */
+export function applyInstructionsModality(
+  chatCtx: ChatContext,
+  options: { modality: 'audio' | 'text' },
+) {
+  const targetModality = options.modality;
+  const position = chatCtx.indexById(INSTRUCTIONS_MESSAGE_ID);
+  if (position === undefined) return;
+
+  const current = chatCtx.items[position]!;
+  if (current.type !== 'message') return;
+  if (!current.content.some((part) => isInstructions(part))) return;
+
+  // Install a brand-new message instead of editing the existing instance, so the
+  // base context keeps its Instructions when they are reused across turns.
+  const { id, role, interrupted, createdAt, transcriptConfidence, metrics, extra } = current;
+  chatCtx.items[position] = ChatMessage.create({
+    id,
+    role,
+    content: current.content.map((part) =>
+      isInstructions(part) ? part.asModality(targetModality) : part,
+    ),
+    interrupted,
+    createdAt,
+    transcriptConfidence,
+    metrics,
+    extra,
+  });
+}
+
 export function performLLMInference(
   node: LLMNode,
   chatCtx: ChatContext,
diff --git a/agents/src/voice/remote_session.ts b/agents/src/voice/remote_session.ts
index b8cb359a8..f970b064e 100644
--- a/agents/src/voice/remote_session.ts
+++ b/agents/src/voice/remote_session.ts
@@ -14,6 +14,7 @@ import type {
   FunctionCall as FCItem,
   FunctionCallOutput as FCOItem,
 } from '../llm/chat_context.js';
+import { isInstructions, renderInstructions } from '../llm/chat_context.js';
 import type { ToolContext } from '../llm/tool_context.js';
 import { log } from '../log.js';
 import type {
@@ -295,6 +296,10 @@ function chatItemToProto(item: RemoteChatItem): pb.ChatContext_ChatItem {
       for (const c of msg.content) {
         if (typeof c === 'string') {
           content.push(new pb.ChatMessage_ChatContent({ payload: { case: 'text', value: c } }));
+        } else if (isInstructions(c)) {
+          content.push(
+            new pb.ChatMessage_ChatContent({ payload: { case: 'text', value: c.value } }),
+          );
         }
       }
 
@@ -800,7 +805,7 @@ export class SessionHost {
       case: 'getAgentInfo',
       value: new pb.SessionResponse_GetAgentInfoResponse({
         id: agent.id,
-        instructions: agent.instructions,
+        instructions: renderInstructions(agent.instructions),
         tools: toolNames(agent.toolCtx),
         chatCtx: chatItemsToProto(agent.chatCtx.items),
       }),
diff --git a/agents/src/voice/speech_handle.ts b/agents/src/voice/speech_handle.ts
index 3e3ccf540..645729fab 100644
--- a/agents/src/voice/speech_handle.ts
+++ b/agents/src/voice/speech_handle.ts
@@ -55,6 +55,17 @@ export class SpeechHandleCircularWaitError extends Error {
   }
 }
 
+/**
+ * Describes how the user provided input that triggered the current turn.
+ * Used by modality-aware Instructions to pick the correct variant.
+ */
+export interface InputDetails {
+  modality: 'audio' | 'text';
+}
+
+/** Default {@link InputDetails}; frozen because handles created without one all share it. */
+export const DEFAULT_INPUT_DETAILS: InputDetails = Object.freeze({ modality: 'audio' });
+
 export class SpeechHandle {
   /** Priority for messages that should be played after all other messages in the queue */
   static SPEECH_PRIORITY_LOW = 0;
@@ -93,6 +104,7 @@ export class SpeechHandle {
     private _allowInterruptions: boolean,
     /** @internal */
     public _stepIndex: number,
+    private _inputDetails: InputDetails = DEFAULT_INPUT_DETAILS,
     readonly parent?: SpeechHandle,
   ) {
     this.doneFut.await.finally(() => {
@@ -105,11 +117,27 @@ export class SpeechHandle {
   static create(options?: {
     allowInterruptions?: boolean;
     stepIndex?: number;
+    inputDetails?: InputDetails;
     parent?: SpeechHandle;
   }) {
-    const { allowInterruptions = true, stepIndex = 0, parent } = options ?? {};
+    const {
+      allowInterruptions = true,
+      stepIndex = 0,
+      inputDetails = DEFAULT_INPUT_DETAILS,
+      parent,
+    } = options ?? {};
+
+    return new SpeechHandle(
+      shortuuid('speech_'),
+      allowInterruptions,
+      stepIndex,
+      inputDetails,
+      parent,
+    );
+  }
 
-    return new SpeechHandle(shortuuid('speech_'), allowInterruptions, stepIndex, parent);
+  get inputDetails(): InputDetails {
+    return this._inputDetails;
   }
 
   get interrupted(): boolean {
diff --git a/examples/src/instructions_per_modality.ts b/examples/src/instructions_per_modality.ts
new file mode 100644
index 000000000..71f2f3f04
--- /dev/null
+++ b/examples/src/instructions_per_modality.ts
@@ -0,0 +1,100 @@
+// SPDX-FileCopyrightText: 2026 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import {
+  type JobContext,
+  type JobProcess,
+  ServerOptions,
+  cli,
+  defineAgent,
+  inference,
+  llm,
+  log,
+  voice,
+} from '@livekit/agents';
+import * as silero from '@livekit/agents-plugin-silero';
+import { fileURLToPath } from 'node:url';
+import { z } from 'zod';
+
+const BASE_INSTRUCTIONS = (modalitySpecific: string, currentDate: string) =>
+  `You are a scheduling assistant named Alex that helps users book appointments.
+${modalitySpecific}
+Call \`book_appointment\` to finalise the booking.
+Never invent or assume details the user did not provide — ask for them instead.
+The current date is ${currentDate}.
+`;
+
+// Voice users speak in approximate, self-correcting natural language.
+// The LLM needs guidance on how to parse what was said, not how to say things back.
+const AUDIO_SPECIFIC = `
+The user is speaking — their input arrives as voice transcription and may be imperfect.
+When interpreting what the user said:
+- Resolve relative spoken expressions to a concrete date/time: 'next Tuesday', 'tomorrow afternoon', 'the week after next around 3'.
+- Spoken numbers may be ambiguous: 'three thirty' could mean 3:30 PM or the 30th of March — ask for clarification when context does not make it obvious.
+- Honor verbal self-corrections: if the user says 'wait, I meant Thursday not Tuesday', update your understanding to Thursday and discard Tuesday.
+- Ignore filler words and hesitations ('um', 'uh', 'like', 'I guess').
+- Always confirm the resolved date and time out loud before booking, since spoken input is inherently ambiguous.
+`;
+
+// Text users type precise values — no need to normalise spoken patterns.
+const TEXT_SPECIFIC = `
+The user is typing — take their input literally.
+When interpreting what the user wrote:
+- Accept exact dates and times in any common format (ISO, natural language, 12-hour or 24-hour clock).
+- If the user provides a complete and unambiguous date and time, you may book immediately without asking for confirmation.
+- Only ask follow-up questions for genuinely missing information.
+`;
+
+/** Scheduling assistant whose system prompt has separate audio and text variants. */
+class SchedulingAgent extends voice.Agent {
+  constructor() {
+    // toISOString() is UTC, so pin the weekday to UTC (and English) to keep both in agreement.
+    const now = new Date();
+    const weekday = now.toLocaleDateString('en-US', { weekday: 'long', timeZone: 'UTC' });
+    const currentDate = `${now.toISOString().slice(0, 10)} ${weekday}`;
+    super({
+      instructions: new llm.Instructions({
+        audio: BASE_INSTRUCTIONS(AUDIO_SPECIFIC, currentDate),
+        text: BASE_INSTRUCTIONS(TEXT_SPECIFIC, currentDate),
+      }),
+      tools: {
+        bookAppointment: llm.tool({
+          description: 'Book an appointment.',
+          parameters: z.object({
+            date: z.string().describe('The date of the appointment in the format YYYY-MM-DD'),
+            time: z.string().describe('The time of the appointment in the format HH:MM'),
+          }),
+          execute: async ({ date, time }) => {
+            log().info(`booking appointment for ${date} at ${time}`);
+            return `Appointment booked for ${date} at ${time}`;
+          },
+        }),
+      },
+    });
+  }
+
+  async onEnter(): Promise<void> {
+    this.session.generateReply();
+  }
+}
+
+export default defineAgent({
+  prewarm: async (proc: JobProcess) => {
+    proc.userData.vad = await silero.VAD.load();
+  },
+  entry: async (ctx: JobContext) => {
+    const session = new voice.AgentSession({
+      vad: ctx.proc.userData.vad! as silero.VAD,
+      stt: new inference.STT({ model: 'deepgram/nova-3' }),
+      llm: new inference.LLM({ model: 'openai/gpt-4.1-mini' }),
+      tts: new inference.TTS({
+        model: 'cartesia/sonic-3',
+        voice: '9626c31c-bec5-4cca-baa8-f8ba9e84c8bc',
+      }),
+    });
+
+    await session.start({ agent: new SchedulingAgent(), room: ctx.room });
+  },
+});
+
+cli.runApp(new ServerOptions({ agent: fileURLToPath(import.meta.url) }));