From 8540cdba92d8581c2eabd6507021663360726ca3 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Thu, 25 Jun 2026 20:58:07 -0700
Subject: [PATCH 01/19] feat(web): add language model inputModalities
 capability plumbing

Add an optional `inputModalities` declaration to language model config and
expose a resolved capability set to the client.

- Schema: add an optional `inputModalities` array (items: `text` | `image` |
  `pdf`) to every provider definition in `schemas/v3/languageModel.json` and
  regenerate the schema types/snippets.
- Add a fail-closed `resolveModelInputModalities` resolver that defaults to
  text-only when a model does not declare its input modalities.
- Expose the resolved `inputModalities` on the client-safe `LanguageModelInfo`
  (populated via `getConfiguredLanguageModelsInfo` and the MCP ask path).

This is groundwork for chat file attachments. It adds no attachment UI and no
live provider capability probing yet.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 docs/snippets/schemas/v3/index.schema.mdx     | 264 ++++++++++++++++++
 .../schemas/v3/languageModel.schema.mdx       | 264 ++++++++++++++++++
 packages/schemas/src/v3/index.schema.ts       | 264 ++++++++++++++++++
 packages/schemas/src/v3/index.type.ts         |  48 ++++
 .../schemas/src/v3/languageModel.schema.ts    | 264 ++++++++++++++++++
 packages/schemas/src/v3/languageModel.type.ts |  48 ++++
 .../web/src/ee/features/mcp/askCodebase.ts    |   2 +
 .../src/features/chat/modelCapabilities.ts    |  13 +
 packages/web/src/features/chat/types.ts       |   4 +
 .../web/src/features/chat/utils.server.ts     |   2 +
 schemas/v3/languageModel.json                 | 134 ++++++++-
 11 files changed, 1306 insertions(+), 1 deletion(-)
 create mode 100644 packages/web/src/features/chat/modelCapabilities.ts

diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx
index 864359251..e0b00c540 100644
--- a/docs/snippets/schemas/v3/index.schema.mdx
+++ b/docs/snippets/schemas/v3/index.schema.mdx
@@ -1860,6 +1860,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -1998,6 +2009,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2133,6 +2155,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2240,6 +2273,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2361,6 +2405,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2484,6 +2539,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2623,6 +2689,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2730,6 +2807,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2863,6 +2951,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3027,6 +3126,17 @@
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3135,6 +3245,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3246,6 +3367,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3426,6 +3558,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3564,6 +3707,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3699,6 +3853,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3806,6 +3971,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3927,6 +4103,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4050,6 +4237,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4189,6 +4387,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4296,6 +4505,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4429,6 +4649,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4593,6 +4824,17 @@
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4701,6 +4943,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4812,6 +5065,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
diff --git a/docs/snippets/schemas/v3/languageModel.schema.mdx b/docs/snippets/schemas/v3/languageModel.schema.mdx
index 90aee08af..7c7874207 100644
--- a/docs/snippets/schemas/v3/languageModel.schema.mdx
+++ b/docs/snippets/schemas/v3/languageModel.schema.mdx
@@ -174,6 +174,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -312,6 +323,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -447,6 +469,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -554,6 +587,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -675,6 +719,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -798,6 +853,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -937,6 +1003,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1044,6 +1121,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1177,6 +1265,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1341,6 +1440,17 @@
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1449,6 +1559,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1560,6 +1681,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1740,6 +1872,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1878,6 +2021,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2013,6 +2167,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2120,6 +2285,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2241,6 +2417,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2364,6 +2551,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2503,6 +2701,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2610,6 +2819,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2743,6 +2963,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2907,6 +3138,17 @@
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -3015,6 +3257,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -3126,6 +3379,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts
index 8c1d64b52..257c8ae7d 100644
--- a/packages/schemas/src/v3/index.schema.ts
+++ b/packages/schemas/src/v3/index.schema.ts
@@ -1859,6 +1859,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -1997,6 +2008,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2132,6 +2154,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2239,6 +2272,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2360,6 +2404,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2483,6 +2538,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2622,6 +2688,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2729,6 +2806,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2862,6 +2950,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3026,6 +3125,17 @@ const schema = {
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3134,6 +3244,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3245,6 +3366,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3425,6 +3557,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3563,6 +3706,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3698,6 +3852,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3805,6 +3970,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3926,6 +4102,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4049,6 +4236,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4188,6 +4386,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4295,6 +4504,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4428,6 +4648,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4592,6 +4823,17 @@ const schema = {
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4700,6 +4942,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4811,6 +5064,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts
index 7fa7f5a17..85dbaac43 100644
--- a/packages/schemas/src/v3/index.type.ts
+++ b/packages/schemas/src/v3/index.type.ts
@@ -762,6 +762,10 @@ export interface AmazonBedrockLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 /**
  * Optional headers to use with the model.
@@ -842,6 +846,10 @@ export interface AnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface AzureLanguageModel {
   /**
@@ -897,6 +905,10 @@ export interface AzureLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -936,6 +948,10 @@ export interface DeepSeekLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -983,6 +999,10 @@ export interface GoogleGenerativeAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -1030,6 +1050,10 @@ export interface GoogleVertexAnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -1085,6 +1109,10 @@ export interface GoogleVertexLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface MistralLanguageModel {
   /**
@@ -1124,6 +1152,10 @@ export interface MistralLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -1171,6 +1203,10 @@ export interface OpenAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -1215,6 +1251,10 @@ export interface OpenAICompatibleLanguageModel {
    * Optional temperature setting to use with the model.
    */
   temperature?: number;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -1279,6 +1319,10 @@ export interface OpenRouterLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface XaiLanguageModel {
   /**
@@ -1318,6 +1362,10 @@ export interface XaiLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GitHubAppConfig {
   /**
diff --git a/packages/schemas/src/v3/languageModel.schema.ts b/packages/schemas/src/v3/languageModel.schema.ts
index ab418ce79..85c2bf8a8 100644
--- a/packages/schemas/src/v3/languageModel.schema.ts
+++ b/packages/schemas/src/v3/languageModel.schema.ts
@@ -173,6 +173,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -311,6 +322,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -446,6 +468,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -553,6 +586,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -674,6 +718,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -797,6 +852,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -936,6 +1002,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1043,6 +1120,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1176,6 +1264,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1340,6 +1439,17 @@ const schema = {
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1448,6 +1558,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1559,6 +1680,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1739,6 +1871,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1877,6 +2020,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2012,6 +2166,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2119,6 +2284,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2240,6 +2416,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2363,6 +2550,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2502,6 +2700,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2609,6 +2818,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2742,6 +2962,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2906,6 +3137,17 @@ const schema = {
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -3014,6 +3256,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -3125,6 +3378,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/languageModel.type.ts b/packages/schemas/src/v3/languageModel.type.ts
index 5c3b25668..df4569ee8 100644
--- a/packages/schemas/src/v3/languageModel.type.ts
+++ b/packages/schemas/src/v3/languageModel.type.ts
@@ -88,6 +88,10 @@ export interface AmazonBedrockLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 /**
  * Optional headers to use with the model.
@@ -168,6 +172,10 @@ export interface AnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface AzureLanguageModel {
   /**
@@ -223,6 +231,10 @@ export interface AzureLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -262,6 +274,10 @@ export interface DeepSeekLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -309,6 +325,10 @@ export interface GoogleGenerativeAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -356,6 +376,10 @@ export interface GoogleVertexAnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -411,6 +435,10 @@ export interface GoogleVertexLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface MistralLanguageModel {
   /**
@@ -450,6 +478,10 @@ export interface MistralLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -497,6 +529,10 @@ export interface OpenAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -541,6 +577,10 @@ export interface OpenAICompatibleLanguageModel {
    * Optional temperature setting to use with the model.
    */
   temperature?: number;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -605,6 +645,10 @@ export interface OpenRouterLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface XaiLanguageModel {
   /**
@@ -644,4 +688,8 @@ export interface XaiLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
diff --git a/packages/web/src/ee/features/mcp/askCodebase.ts b/packages/web/src/ee/features/mcp/askCodebase.ts
index 4b7cfb7b0..8b2432fb5 100644
--- a/packages/web/src/ee/features/mcp/askCodebase.ts
+++ b/packages/web/src/ee/features/mcp/askCodebase.ts
@@ -4,6 +4,7 @@ import { generateChatNameFromMessage } from "@/ee/features/chat/llm.server";
 import { getAISDKLanguageModelAndOptions } from "@/features/chat/llm.server";
 import { LanguageModelInfo, SBChatMessage, SearchScope } from "@/features/chat/types";
 import { convertLLMOutputToPortableMarkdown, getAnswerPartFromAssistantMessage, getLanguageModelKey } from "@/features/chat/utils";
+import { resolveModelInputModalities } from "@/features/chat/modelCapabilities";
 import { ErrorCode } from "@/lib/errorCodes";
 import { ServiceError, ServiceErrorException } from "@/lib/serviceError";
 import { withOptionalAuth } from "@/middleware/withAuth";
@@ -243,6 +244,7 @@ export const askCodebase = (params: AskCodebaseParams): Promise<AskCodebaseResul
                     provider: languageModelConfig.provider,
                     model: languageModelConfig.model,
                     displayName: languageModelConfig.displayName,
+                    inputModalities: resolveModelInputModalities(languageModelConfig),
                 },
             } satisfies AskCodebaseResult;
         })
diff --git a/packages/web/src/features/chat/modelCapabilities.ts b/packages/web/src/features/chat/modelCapabilities.ts
new file mode 100644
index 000000000..4dbe9bcd6
--- /dev/null
+++ b/packages/web/src/features/chat/modelCapabilities.ts
@@ -0,0 +1,13 @@
+import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
+import { InputModality } from './types';
+
+// Fail-closed: when a model does not declare input modalities, assume text-only.
+// NOTE: future work may add live provider capability probing (see
+// tryResolveAnthropicThinkingConfig in llm.server.ts for the precedent).
+export const resolveModelInputModalities = (config: LanguageModel): InputModality[] => {
+    const declared = config.inputModalities;
+    if (declared && declared.length > 0) {
+        return declared;
+    }
+    return ['text'];
+}
diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index 38a737a09..615fe2b1c 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -208,10 +208,13 @@ type _AssertAllProviders = LanguageModelProvider extends typeof languageModelPro
 const _assertAllProviders: _AssertAllProviders = true;
 void _assertAllProviders;
 
+export type InputModality = 'text' | 'image' | 'pdf';
+
 export const languageModelInfoSchema = z.object({
     provider: z.enum(languageModelProviders).describe("The model provider (e.g., 'anthropic', 'openai')"),
     model: z.string().describe("The model ID"),
     displayName: z.string().optional().describe("Optional display name for the model"),
+    inputModalities: z.array(z.enum(['text', 'image', 'pdf'])).default(['text']).describe("The input modalities the model can accept. Defaults to text-only."),
 });
 
 /**
@@ -221,6 +224,7 @@ export type LanguageModelInfo = {
     provider: LanguageModelProvider,
     model: LanguageModel['model'],
     displayName?: LanguageModel['displayName'],
+    inputModalities: InputModality[],
 }
 
 // Additional request body data that we send along to the chat API.
diff --git a/packages/web/src/features/chat/utils.server.ts b/packages/web/src/features/chat/utils.server.ts
index ffc3483a4..7ec47b677 100644
--- a/packages/web/src/features/chat/utils.server.ts
+++ b/packages/web/src/features/chat/utils.server.ts
@@ -7,6 +7,7 @@ import { env, loadConfig } from '@sourcebot/shared';
 import fs from 'fs';
 import path from 'path';
 import { LanguageModelInfo, SBChatMessage } from './types';
+import { resolveModelInputModalities } from './modelCapabilities';
 import { hasEntitlement } from '@/lib/entitlements';
 import { ServiceError } from '@/lib/serviceError';
 import { ErrorCode } from '@/lib/errorCodes';
@@ -131,5 +132,6 @@ export const getConfiguredLanguageModelsInfo = async () => {
         provider: model.provider,
         model: model.model,
         displayName: model.displayName,
+        inputModalities: resolveModelInputModalities(model),
     }));
 };
diff --git a/schemas/v3/languageModel.json b/schemas/v3/languageModel.json
index 3f1d13d52..0fb96217a 100644
--- a/schemas/v3/languageModel.json
+++ b/schemas/v3/languageModel.json
@@ -50,6 +50,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -93,6 +104,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -160,6 +182,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -199,6 +232,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -252,6 +296,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -307,6 +362,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -378,6 +444,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -417,6 +494,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -482,6 +570,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -537,6 +636,17 @@
                 "temperature": {
                     "type": "number",
                     "description": "Optional temperature setting to use with the model."
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -577,6 +687,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -620,6 +741,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -667,4 +799,4 @@
             "$ref": "#/definitions/XaiLanguageModel"
         }
     ]
-}
\ No newline at end of file
+}

From a473b49cd8de430e00183305a563f34dba39c113 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Thu, 25 Jun 2026 20:58:48 -0700
Subject: [PATCH 02/19] docs: add CHANGELOG entry for language model
 inputModalities

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 689718d36..5163f833a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - Added per-step token cost tracking and estimated tool call token usage to Ask Sourcebot chat history. [#1353](https://github.com/sourcebot-dev/sourcebot/pull/1353)
+- Added optional `inputModalities` configuration for language models, exposing model input-modality capabilities (defaults to text-only). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
 
 ### Fixed
 - Send anonymous server-side PostHog events as personless so unauthenticated requests don't inflate person counts. [#1367](https://github.com/sourcebot-dev/sourcebot/pull/1367)

From 4b57d279bea951a86e806b46e555a30eebe615dc Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 10:08:53 -0700
Subject: [PATCH 03/19] refactor(schemas): split document types out of
 inputModalities

`inputModalities` now enumerates only true perceptual channels
(text | image | audio | video). Document/container formats such as PDF
move to a separate fail-closed `supportedDocumentTypes` field, since
PDF is not a model modality but a container format that providers
decompose into text/image internally.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 CHANGELOG.md                                  |   2 +-
 docs/snippets/schemas/v3/index.schema.mdx     | 288 ++++++++++++++++--
 .../schemas/v3/languageModel.schema.mdx       | 288 ++++++++++++++++--
 packages/schemas/src/v3/index.schema.ts       | 288 ++++++++++++++++--
 packages/schemas/src/v3/index.type.ts         |  96 ++++--
 .../schemas/src/v3/languageModel.schema.ts    | 288 ++++++++++++++++--
 packages/schemas/src/v3/languageModel.type.ts |  96 ++++--
 .../web/src/ee/features/mcp/askCodebase.ts    |   3 +-
 .../src/features/chat/modelCapabilities.ts    |  13 +-
 packages/web/src/features/chat/types.ts       |   7 +-
 .../web/src/features/chat/utils.server.ts     |   3 +-
 schemas/v3/languageModel.json                 | 144 ++++++++-
 12 files changed, 1354 insertions(+), 162 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5163f833a..caa90e9b1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - Added per-step token cost tracking and estimated tool call token usage to Ask Sourcebot chat history. [#1353](https://github.com/sourcebot-dev/sourcebot/pull/1353)
-- Added optional `inputModalities` configuration for language models, exposing model input-modality capabilities (defaults to text-only). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
+- Added optional `inputModalities` and `supportedDocumentTypes` configuration for language models, exposing model input-modality and document capabilities (defaults to text-only, no documents). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
 
 ### Fixed
 - Send anonymous server-side PostHog events as personless so unauthenticated requests don't inflate person counts. [#1367](https://github.com/sourcebot-dev/sourcebot/pull/1367)
diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx
index e0b00c540..5b099d724 100644
--- a/docs/snippets/schemas/v3/index.schema.mdx
+++ b/docs/snippets/schemas/v3/index.schema.mdx
@@ -1867,10 +1867,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2016,10 +2026,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2162,10 +2182,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2280,10 +2310,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2412,10 +2452,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2546,10 +2596,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2696,10 +2756,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2814,10 +2884,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2958,10 +3038,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3133,10 +3223,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3252,10 +3352,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3374,10 +3484,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3565,10 +3685,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3714,10 +3844,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3860,10 +4000,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3978,10 +4128,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4110,10 +4270,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4244,10 +4414,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4394,10 +4574,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4512,10 +4702,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4656,10 +4856,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4831,10 +5041,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4950,10 +5170,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -5072,10 +5302,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
diff --git a/docs/snippets/schemas/v3/languageModel.schema.mdx b/docs/snippets/schemas/v3/languageModel.schema.mdx
index 7c7874207..7b1e774cf 100644
--- a/docs/snippets/schemas/v3/languageModel.schema.mdx
+++ b/docs/snippets/schemas/v3/languageModel.schema.mdx
@@ -181,10 +181,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -330,10 +340,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -476,10 +496,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -594,10 +624,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -726,10 +766,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -860,10 +910,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1010,10 +1070,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1128,10 +1198,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1272,10 +1352,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1447,10 +1537,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1566,10 +1666,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1688,10 +1798,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1879,10 +1999,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2028,10 +2158,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2174,10 +2314,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2292,10 +2442,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2424,10 +2584,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2558,10 +2728,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2708,10 +2888,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2826,10 +3016,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2970,10 +3170,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3145,10 +3355,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3264,10 +3484,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3386,10 +3616,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts
index 257c8ae7d..7d051544c 100644
--- a/packages/schemas/src/v3/index.schema.ts
+++ b/packages/schemas/src/v3/index.schema.ts
@@ -1866,10 +1866,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2015,10 +2025,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2161,10 +2181,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2279,10 +2309,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2411,10 +2451,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2545,10 +2595,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2695,10 +2755,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2813,10 +2883,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2957,10 +3037,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3132,10 +3222,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3251,10 +3351,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3373,10 +3483,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3564,10 +3684,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3713,10 +3843,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3859,10 +3999,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3977,10 +4127,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4109,10 +4269,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4243,10 +4413,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4393,10 +4573,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4511,10 +4701,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4655,10 +4855,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4830,10 +5040,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4949,10 +5169,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -5071,10 +5301,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts
index 85dbaac43..14c8c14e2 100644
--- a/packages/schemas/src/v3/index.type.ts
+++ b/packages/schemas/src/v3/index.type.ts
@@ -763,9 +763,13 @@ export interface AmazonBedrockLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional headers to use with the model.
@@ -847,9 +851,13 @@ export interface AnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface AzureLanguageModel {
   /**
@@ -906,9 +914,13 @@ export interface AzureLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -949,9 +961,13 @@ export interface DeepSeekLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -1000,9 +1016,13 @@ export interface GoogleGenerativeAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -1051,9 +1071,13 @@ export interface GoogleVertexAnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -1110,9 +1134,13 @@ export interface GoogleVertexLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface MistralLanguageModel {
   /**
@@ -1153,9 +1181,13 @@ export interface MistralLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -1204,9 +1236,13 @@ export interface OpenAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -1252,9 +1288,13 @@ export interface OpenAICompatibleLanguageModel {
    */
   temperature?: number;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -1320,9 +1360,13 @@ export interface OpenRouterLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface XaiLanguageModel {
   /**
@@ -1363,9 +1407,13 @@ export interface XaiLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GitHubAppConfig {
   /**
diff --git a/packages/schemas/src/v3/languageModel.schema.ts b/packages/schemas/src/v3/languageModel.schema.ts
index 85c2bf8a8..9c9ae7b2d 100644
--- a/packages/schemas/src/v3/languageModel.schema.ts
+++ b/packages/schemas/src/v3/languageModel.schema.ts
@@ -180,10 +180,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -329,10 +339,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -475,10 +495,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -593,10 +623,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -725,10 +765,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -859,10 +909,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1009,10 +1069,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1127,10 +1197,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1271,10 +1351,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1446,10 +1536,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1565,10 +1665,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1687,10 +1797,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1878,10 +1998,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2027,10 +2157,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2173,10 +2313,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2291,10 +2441,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2423,10 +2583,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2557,10 +2727,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2707,10 +2887,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2825,10 +3015,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2969,10 +3169,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3144,10 +3354,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3263,10 +3483,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3385,10 +3615,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/languageModel.type.ts b/packages/schemas/src/v3/languageModel.type.ts
index df4569ee8..3297689b7 100644
--- a/packages/schemas/src/v3/languageModel.type.ts
+++ b/packages/schemas/src/v3/languageModel.type.ts
@@ -89,9 +89,13 @@ export interface AmazonBedrockLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional headers to use with the model.
@@ -173,9 +177,13 @@ export interface AnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface AzureLanguageModel {
   /**
@@ -232,9 +240,13 @@ export interface AzureLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -275,9 +287,13 @@ export interface DeepSeekLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -326,9 +342,13 @@ export interface GoogleGenerativeAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -377,9 +397,13 @@ export interface GoogleVertexAnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -436,9 +460,13 @@ export interface GoogleVertexLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface MistralLanguageModel {
   /**
@@ -479,9 +507,13 @@ export interface MistralLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -530,9 +562,13 @@ export interface OpenAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -578,9 +614,13 @@ export interface OpenAICompatibleLanguageModel {
    */
   temperature?: number;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -646,9 +686,13 @@ export interface OpenRouterLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface XaiLanguageModel {
   /**
@@ -689,7 +733,11 @@ export interface XaiLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
diff --git a/packages/web/src/ee/features/mcp/askCodebase.ts b/packages/web/src/ee/features/mcp/askCodebase.ts
index 8b2432fb5..7f779ffc8 100644
--- a/packages/web/src/ee/features/mcp/askCodebase.ts
+++ b/packages/web/src/ee/features/mcp/askCodebase.ts
@@ -4,7 +4,7 @@ import { generateChatNameFromMessage } from "@/ee/features/chat/llm.server";
 import { getAISDKLanguageModelAndOptions } from "@/features/chat/llm.server";
 import { LanguageModelInfo, SBChatMessage, SearchScope } from "@/features/chat/types";
 import { convertLLMOutputToPortableMarkdown, getAnswerPartFromAssistantMessage, getLanguageModelKey } from "@/features/chat/utils";
-import { resolveModelInputModalities } from "@/features/chat/modelCapabilities";
+import { resolveModelInputModalities, resolveModelSupportedDocumentTypes } from "@/features/chat/modelCapabilities";
 import { ErrorCode } from "@/lib/errorCodes";
 import { ServiceError, ServiceErrorException } from "@/lib/serviceError";
 import { withOptionalAuth } from "@/middleware/withAuth";
@@ -245,6 +245,7 @@ export const askCodebase = (params: AskCodebaseParams): Promise<AskCodebaseResul
                     model: languageModelConfig.model,
                     displayName: languageModelConfig.displayName,
                     inputModalities: resolveModelInputModalities(languageModelConfig),
+                    supportedDocumentTypes: resolveModelSupportedDocumentTypes(languageModelConfig),
                 },
             } satisfies AskCodebaseResult;
         })
diff --git a/packages/web/src/features/chat/modelCapabilities.ts b/packages/web/src/features/chat/modelCapabilities.ts
index 4dbe9bcd6..8b976af59 100644
--- a/packages/web/src/features/chat/modelCapabilities.ts
+++ b/packages/web/src/features/chat/modelCapabilities.ts
@@ -1,5 +1,5 @@
 import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
-import { InputModality } from './types';
+import { DocumentType, InputModality } from './types';
 
 // Fail-closed: when a model does not declare input modalities, assume text-only.
 // NOTE: future work may add live provider capability probing (see
@@ -11,3 +11,14 @@ export const resolveModelInputModalities = (config: LanguageModel): InputModalit
     }
     return ['text'];
 }
+
+// Fail-closed: when a model does not declare supported document types, assume none.
+// Document types (e.g. PDF) are container formats distinct from raw input
+// modalities, since providers decompose them into text/image internally.
+export const resolveModelSupportedDocumentTypes = (config: LanguageModel): DocumentType[] => {
+    const declared = config.supportedDocumentTypes;
+    if (declared && declared.length > 0) {
+        return declared;
+    }
+    return [];
+}
diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index 615fe2b1c..e1daf0bdb 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -208,13 +208,15 @@ type _AssertAllProviders = LanguageModelProvider extends typeof languageModelPro
 const _assertAllProviders: _AssertAllProviders = true;
 void _assertAllProviders;
 
-export type InputModality = 'text' | 'image' | 'pdf';
+export type InputModality = 'text' | 'image' | 'audio' | 'video';
+export type DocumentType = 'pdf';
 
 export const languageModelInfoSchema = z.object({
     provider: z.enum(languageModelProviders).describe("The model provider (e.g., 'anthropic', 'openai')"),
     model: z.string().describe("The model ID"),
     displayName: z.string().optional().describe("Optional display name for the model"),
-    inputModalities: z.array(z.enum(['text', 'image', 'pdf'])).default(['text']).describe("The input modalities the model can accept. Defaults to text-only."),
+    inputModalities: z.array(z.enum(['text', 'image', 'audio', 'video'])).default(['text']).describe("The input modalities the model can accept. Defaults to text-only."),
+    supportedDocumentTypes: z.array(z.enum(['pdf'])).default([]).describe("The document/file container formats the model can ingest natively. Defaults to none."),
 });
 
 /**
@@ -225,6 +227,7 @@ export type LanguageModelInfo = {
     model: LanguageModel['model'],
     displayName?: LanguageModel['displayName'],
     inputModalities: InputModality[],
+    supportedDocumentTypes: DocumentType[],
 }
 
 // Additional request body data that we send along to the chat API.
diff --git a/packages/web/src/features/chat/utils.server.ts b/packages/web/src/features/chat/utils.server.ts
index 7ec47b677..0b04226d8 100644
--- a/packages/web/src/features/chat/utils.server.ts
+++ b/packages/web/src/features/chat/utils.server.ts
@@ -7,7 +7,7 @@ import { env, loadConfig } from '@sourcebot/shared';
 import fs from 'fs';
 import path from 'path';
 import { LanguageModelInfo, SBChatMessage } from './types';
-import { resolveModelInputModalities } from './modelCapabilities';
+import { resolveModelInputModalities, resolveModelSupportedDocumentTypes } from './modelCapabilities';
 import { hasEntitlement } from '@/lib/entitlements';
 import { ServiceError } from '@/lib/serviceError';
 import { ErrorCode } from '@/lib/errorCodes';
@@ -133,5 +133,6 @@ export const getConfiguredLanguageModelsInfo = async () => {
         model: model.model,
         displayName: model.displayName,
         inputModalities: resolveModelInputModalities(model),
+        supportedDocumentTypes: resolveModelSupportedDocumentTypes(model),
     }));
 };
diff --git a/schemas/v3/languageModel.json b/schemas/v3/languageModel.json
index 0fb96217a..e49707484 100644
--- a/schemas/v3/languageModel.json
+++ b/schemas/v3/languageModel.json
@@ -57,10 +57,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -111,10 +121,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -189,10 +209,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -239,10 +269,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -303,10 +343,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -369,10 +419,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -451,10 +511,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -501,10 +571,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -577,10 +657,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -643,10 +733,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -694,10 +794,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -748,10 +858,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [

From 0baabcba43e86432a1a69846d90a47c83b62499f Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 10:17:14 -0700
Subject: [PATCH 04/19] docs(schemas): clarify what counts as a document type

Tighten the inputModalities / supportedDocumentTypes descriptions to
remove the implication that omitting supportedDocumentTypes blocks all
non-text attachments. Clarify the taxonomy: single-medium files
(images, audio, video) and plain-text files (.txt, .md) are governed by
inputModalities; supportedDocumentTypes only gates rich compound
container formats like PDF.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 docs/snippets/schemas/v3/index.schema.mdx     | 96 +++++++++----------
 .../schemas/v3/languageModel.schema.mdx       | 96 +++++++++----------
 packages/schemas/src/v3/index.schema.ts       | 96 +++++++++----------
 packages/schemas/src/v3/index.type.ts         | 48 +++++-----
 .../schemas/src/v3/languageModel.schema.ts    | 96 +++++++++----------
 packages/schemas/src/v3/languageModel.type.ts | 48 +++++-----
 packages/web/src/features/chat/types.ts       |  4 +-
 schemas/v3/languageModel.json                 | 48 +++++-----
 8 files changed, 266 insertions(+), 266 deletions(-)

diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx
index 5b099d724..356da2009 100644
--- a/docs/snippets/schemas/v3/index.schema.mdx
+++ b/docs/snippets/schemas/v3/index.schema.mdx
@@ -1871,7 +1871,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -1880,7 +1880,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2030,7 +2030,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2039,7 +2039,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2186,7 +2186,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2195,7 +2195,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2314,7 +2314,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2323,7 +2323,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2456,7 +2456,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2465,7 +2465,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2600,7 +2600,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2609,7 +2609,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2760,7 +2760,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2769,7 +2769,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2888,7 +2888,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2897,7 +2897,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3042,7 +3042,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3051,7 +3051,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3227,7 +3227,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3236,7 +3236,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3356,7 +3356,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3365,7 +3365,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3488,7 +3488,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3497,7 +3497,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3689,7 +3689,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3698,7 +3698,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3848,7 +3848,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3857,7 +3857,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4004,7 +4004,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4013,7 +4013,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4132,7 +4132,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4141,7 +4141,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4274,7 +4274,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4283,7 +4283,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4418,7 +4418,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4427,7 +4427,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4578,7 +4578,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4587,7 +4587,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4706,7 +4706,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4715,7 +4715,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4860,7 +4860,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4869,7 +4869,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5045,7 +5045,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5054,7 +5054,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5174,7 +5174,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5183,7 +5183,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5306,7 +5306,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5315,7 +5315,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
diff --git a/docs/snippets/schemas/v3/languageModel.schema.mdx b/docs/snippets/schemas/v3/languageModel.schema.mdx
index 7b1e774cf..5af4b3d96 100644
--- a/docs/snippets/schemas/v3/languageModel.schema.mdx
+++ b/docs/snippets/schemas/v3/languageModel.schema.mdx
@@ -185,7 +185,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -194,7 +194,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -344,7 +344,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -353,7 +353,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -500,7 +500,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -509,7 +509,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -628,7 +628,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -637,7 +637,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -770,7 +770,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -779,7 +779,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -914,7 +914,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -923,7 +923,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1074,7 +1074,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1083,7 +1083,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1202,7 +1202,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1211,7 +1211,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1356,7 +1356,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1365,7 +1365,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1541,7 +1541,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1550,7 +1550,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1670,7 +1670,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1679,7 +1679,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1802,7 +1802,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1811,7 +1811,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2003,7 +2003,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2012,7 +2012,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2162,7 +2162,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2171,7 +2171,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2318,7 +2318,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2327,7 +2327,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2446,7 +2446,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2455,7 +2455,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2588,7 +2588,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2597,7 +2597,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2732,7 +2732,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2741,7 +2741,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2892,7 +2892,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2901,7 +2901,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3020,7 +3020,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3029,7 +3029,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3174,7 +3174,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3183,7 +3183,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3359,7 +3359,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3368,7 +3368,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3488,7 +3488,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3497,7 +3497,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3620,7 +3620,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3629,7 +3629,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts
index 7d051544c..123fd4a8b 100644
--- a/packages/schemas/src/v3/index.schema.ts
+++ b/packages/schemas/src/v3/index.schema.ts
@@ -1870,7 +1870,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -1879,7 +1879,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2029,7 +2029,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2038,7 +2038,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2185,7 +2185,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2194,7 +2194,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2313,7 +2313,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2322,7 +2322,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2455,7 +2455,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2464,7 +2464,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2599,7 +2599,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2608,7 +2608,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2759,7 +2759,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2768,7 +2768,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2887,7 +2887,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2896,7 +2896,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3041,7 +3041,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3050,7 +3050,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3226,7 +3226,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3235,7 +3235,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3355,7 +3355,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3364,7 +3364,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3487,7 +3487,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3496,7 +3496,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3688,7 +3688,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3697,7 +3697,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3847,7 +3847,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3856,7 +3856,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4003,7 +4003,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4012,7 +4012,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4131,7 +4131,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4140,7 +4140,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4273,7 +4273,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4282,7 +4282,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4417,7 +4417,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4426,7 +4426,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4577,7 +4577,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4586,7 +4586,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4705,7 +4705,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4714,7 +4714,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4859,7 +4859,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4868,7 +4868,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5044,7 +5044,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5053,7 +5053,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5173,7 +5173,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5182,7 +5182,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5305,7 +5305,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5314,7 +5314,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts
index 14c8c14e2..d6f555e8d 100644
--- a/packages/schemas/src/v3/index.type.ts
+++ b/packages/schemas/src/v3/index.type.ts
@@ -763,11 +763,11 @@ export interface AmazonBedrockLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -851,11 +851,11 @@ export interface AnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -914,11 +914,11 @@ export interface AzureLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -961,11 +961,11 @@ export interface DeepSeekLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1016,11 +1016,11 @@ export interface GoogleGenerativeAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1071,11 +1071,11 @@ export interface GoogleVertexAnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1134,11 +1134,11 @@ export interface GoogleVertexLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1181,11 +1181,11 @@ export interface MistralLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1236,11 +1236,11 @@ export interface OpenAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1288,11 +1288,11 @@ export interface OpenAICompatibleLanguageModel {
    */
   temperature?: number;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1360,11 +1360,11 @@ export interface OpenRouterLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1407,11 +1407,11 @@ export interface XaiLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
diff --git a/packages/schemas/src/v3/languageModel.schema.ts b/packages/schemas/src/v3/languageModel.schema.ts
index 9c9ae7b2d..61cc0adf3 100644
--- a/packages/schemas/src/v3/languageModel.schema.ts
+++ b/packages/schemas/src/v3/languageModel.schema.ts
@@ -184,7 +184,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -193,7 +193,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -343,7 +343,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -352,7 +352,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -499,7 +499,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -508,7 +508,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -627,7 +627,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -636,7 +636,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -769,7 +769,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -778,7 +778,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -913,7 +913,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -922,7 +922,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1073,7 +1073,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1082,7 +1082,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1201,7 +1201,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1210,7 +1210,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1355,7 +1355,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1364,7 +1364,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1540,7 +1540,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1549,7 +1549,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1669,7 +1669,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1678,7 +1678,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1801,7 +1801,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1810,7 +1810,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2002,7 +2002,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2011,7 +2011,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2161,7 +2161,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2170,7 +2170,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2317,7 +2317,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2326,7 +2326,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2445,7 +2445,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2454,7 +2454,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2587,7 +2587,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2596,7 +2596,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2731,7 +2731,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2740,7 +2740,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2891,7 +2891,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2900,7 +2900,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3019,7 +3019,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3028,7 +3028,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3173,7 +3173,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3182,7 +3182,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3358,7 +3358,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3367,7 +3367,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3487,7 +3487,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3496,7 +3496,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3619,7 +3619,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3628,7 +3628,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/languageModel.type.ts b/packages/schemas/src/v3/languageModel.type.ts
index 3297689b7..90a53b423 100644
--- a/packages/schemas/src/v3/languageModel.type.ts
+++ b/packages/schemas/src/v3/languageModel.type.ts
@@ -89,11 +89,11 @@ export interface AmazonBedrockLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -177,11 +177,11 @@ export interface AnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -240,11 +240,11 @@ export interface AzureLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -287,11 +287,11 @@ export interface DeepSeekLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -342,11 +342,11 @@ export interface GoogleGenerativeAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -397,11 +397,11 @@ export interface GoogleVertexAnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -460,11 +460,11 @@ export interface GoogleVertexLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -507,11 +507,11 @@ export interface MistralLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -562,11 +562,11 @@ export interface OpenAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -614,11 +614,11 @@ export interface OpenAICompatibleLanguageModel {
    */
   temperature?: number;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -686,11 +686,11 @@ export interface OpenRouterLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -733,11 +733,11 @@ export interface XaiLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index e1daf0bdb..3547c5d0a 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -215,8 +215,8 @@ export const languageModelInfoSchema = z.object({
     provider: z.enum(languageModelProviders).describe("The model provider (e.g., 'anthropic', 'openai')"),
     model: z.string().describe("The model ID"),
     displayName: z.string().optional().describe("Optional display name for the model"),
-    inputModalities: z.array(z.enum(['text', 'image', 'audio', 'video'])).default(['text']).describe("The input modalities the model can accept. Defaults to text-only."),
-    supportedDocumentTypes: z.array(z.enum(['pdf'])).default([]).describe("The document/file container formats the model can ingest natively. Defaults to none."),
+    inputModalities: z.array(z.enum(['text', 'image', 'audio', 'video'])).default(['text']).describe("The input modalities the model can accept (images, audio, video, text). Single-medium attachments are gated by these. Defaults to text-only."),
+    supportedDocumentTypes: z.array(z.enum(['pdf'])).default([]).describe("Rich compound document formats (e.g. PDF) the model can ingest natively, distinct from single-medium attachments gated by inputModalities. Defaults to none."),
 });
 
 /**
diff --git a/schemas/v3/languageModel.json b/schemas/v3/languageModel.json
index e49707484..a952554b9 100644
--- a/schemas/v3/languageModel.json
+++ b/schemas/v3/languageModel.json
@@ -61,7 +61,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -70,7 +70,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -125,7 +125,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -134,7 +134,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -213,7 +213,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -222,7 +222,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -273,7 +273,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -282,7 +282,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -347,7 +347,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -356,7 +356,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -423,7 +423,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -432,7 +432,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -515,7 +515,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -524,7 +524,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -575,7 +575,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -584,7 +584,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -661,7 +661,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -670,7 +670,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -737,7 +737,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -746,7 +746,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -798,7 +798,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -807,7 +807,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -862,7 +862,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -871,7 +871,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [

From 5e4045b0ef25d95ac740961d0368b5250e820daf Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 10:25:57 -0700
Subject: [PATCH 05/19] fix(web): widen getLanguageModelKey param to keyable
 subset

LanguageModelInfo now has required inputModalities/supportedDocumentTypes,
so a raw LanguageModel config (where those are optional) is no longer
assignable to it. getLanguageModelKey only reads provider/model/displayName,
so type its parameter as that Pick subset, letting both LanguageModel and
LanguageModelInfo be keyed. Fixes the docker build type check.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 packages/web/src/features/chat/utils.ts | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/web/src/features/chat/utils.ts b/packages/web/src/features/chat/utils.ts
index c7f409ac7..b103ada7c 100644
--- a/packages/web/src/features/chat/utils.ts
+++ b/packages/web/src/features/chat/utils.ts
@@ -422,9 +422,11 @@ export const getAnswerPartFromAssistantMessage = (message: SBChatMessage, isTurn
 }
 
 /**
- * Generates a unique key given a LanguageModelInfo object.
+ * Generates a unique key for a language model. Accepts any object carrying the
+ * identifying fields, so both the full `LanguageModel` config and the
+ * client-safe `LanguageModelInfo` can be keyed with it.
  */
-export const getLanguageModelKey = (model: LanguageModelInfo) => {
+export const getLanguageModelKey = (model: Pick<LanguageModelInfo, 'provider' | 'model' | 'displayName'>) => {
     return `${model.provider}-${model.model}-${model.displayName}`;
 }
 

From 507d7586cb2f10f8ae166629cb36af19872d4e6d Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 10:45:05 -0700
Subject: [PATCH 06/19] chore(schemas,web): keep schema dist fresh and resolve
 types from source

Two dev-experience fixes for the stale-build-output footgun:

- schemas watch now runs `yarn build` (generate + tsc) instead of
  generate-only, so editing a schema JSON during `yarn dev` refreshes
  dist (both the .d.ts types and the runtime index.schema.js used by
  ajv), not just the generated source.
- web tsconfig maps @sourcebot/schemas/v3|v2/* to the package source,
  so type-checking and the IDE read committed source directly instead
  of stale built .d.ts. Web only imports .type files (erased at
  compile), so there is no bundling/runtime impact.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 packages/schemas/package.json | 2 +-
 packages/web/tsconfig.json    | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/packages/schemas/package.json b/packages/schemas/package.json
index 13fe2cb7a..3719a6da5 100644
--- a/packages/schemas/package.json
+++ b/packages/schemas/package.json
@@ -5,7 +5,7 @@
     "scripts": {
         "build": "yarn generate && tsc",
         "generate": "tsx tools/generate.ts",
-        "watch": "nodemon --watch ../../schemas -e json -x 'yarn generate'",
+        "watch": "nodemon --watch ../../schemas -e json -x 'yarn build'",
         "postinstall": "yarn build"
     },
     "devDependencies": {
diff --git a/packages/web/tsconfig.json b/packages/web/tsconfig.json
index f18162100..3f0e7534b 100644
--- a/packages/web/tsconfig.json
+++ b/packages/web/tsconfig.json
@@ -27,6 +27,12 @@
       ],
       "@/public/*": [
         "./public/*"
+      ],
+      "@sourcebot/schemas/v3/*": [
+        "../schemas/src/v3/*"
+      ],
+      "@sourcebot/schemas/v2/*": [
+        "../schemas/src/v2/*"
       ]
     },
     "target": "ES2017"

From a1aeb372d527ae42ffa1089f297244147044fd17 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 16:18:10 -0700
Subject: [PATCH 07/19] First pass file attachments, picker and drag and drop,
 with preview cards, wired into user message via xml-like tags similar to
 system context

---
 packages/web/package.json                     |   1 +
 .../[owner]/[repo]/components/landingPage.tsx |  19 +-
 .../src/app/(app)/chat/chatLandingPage.tsx    |   5 +-
 .../chat/components/chatLandingDropzone.tsx   |  44 +++++
 .../chat/components/landingPageChatBox.tsx    |   9 +-
 packages/web/src/ee/features/chat/agent.ts    |  13 +-
 .../chat/components/chatThread/chatThread.tsx |  17 +-
 .../chatThread/chatThreadListItem.tsx         |  44 +++--
 .../components/chatThread/detailsCard.tsx     |   1 +
 .../chatThread/messageAttachments.tsx         |  47 +++++
 .../web/src/features/chat/attachmentUtils.ts  | 164 ++++++++++++++++++
 .../components/chatBox/attachmentButton.tsx   |  54 ++++++
 .../components/chatBox/attachmentTray.tsx     |  61 +++++++
 .../chatBox/attachmentViewerDialog.tsx        |  29 ++++
 .../chat/components/chatBox/chatBox.tsx       |  83 +++++++--
 .../components/chatBox/chatPaneDropzone.tsx   |  96 ++++++++++
 .../features/chat/components/chatBox/index.ts |   3 +-
 packages/web/src/features/chat/constants.ts   |  29 ++++
 packages/web/src/features/chat/types.ts       |  20 +++
 .../features/chat/useCreateNewChatThread.ts   |   6 +-
 packages/web/src/features/chat/utils.ts       |  43 ++++-
 yarn.lock                                     |  32 +++-
 22 files changed, 767 insertions(+), 53 deletions(-)
 create mode 100644 packages/web/src/app/(app)/chat/components/chatLandingDropzone.tsx
 create mode 100644 packages/web/src/ee/features/chat/components/chatThread/messageAttachments.tsx
 create mode 100644 packages/web/src/features/chat/attachmentUtils.ts
 create mode 100644 packages/web/src/features/chat/components/chatBox/attachmentButton.tsx
 create mode 100644 packages/web/src/features/chat/components/chatBox/attachmentTray.tsx
 create mode 100644 packages/web/src/features/chat/components/chatBox/attachmentViewerDialog.tsx
 create mode 100644 packages/web/src/features/chat/components/chatBox/chatPaneDropzone.tsx

diff --git a/packages/web/package.json b/packages/web/package.json
index 82543adbd..928802411 100644
--- a/packages/web/package.json
+++ b/packages/web/package.json
@@ -175,6 +175,7 @@
     "react-day-picker": "^9.14.0",
     "react-device-detect": "^2.2.3",
     "react-dom": "19.2.4",
+    "react-dropzone": "^15.0.0",
     "react-hook-form": "^7.53.0",
     "react-hotkeys-hook": "^4.5.1",
     "react-icons": "^5.6.0",
diff --git a/packages/web/src/app/(app)/askgh/[owner]/[repo]/components/landingPage.tsx b/packages/web/src/app/(app)/askgh/[owner]/[repo]/components/landingPage.tsx
index f18ea5c74..afc7af4c3 100644
--- a/packages/web/src/app/(app)/askgh/[owner]/[repo]/components/landingPage.tsx
+++ b/packages/web/src/app/(app)/askgh/[owner]/[repo]/components/landingPage.tsx
@@ -3,14 +3,15 @@
 import Image from 'next/image';
 import { SearchModeSelector } from "@/app/(app)/components/searchModeSelector";
 import { Separator } from "@/components/ui/separator";
-import { ChatBox } from "@/features/chat/components/chatBox";
+import { ChatBox, ChatBoxHandle } from "@/features/chat/components/chatBox";
 import { ChatBoxToolbar } from "@/features/chat/components/chatBox/chatBoxToolbar";
+import { ChatPaneDropzone } from "@/features/chat/components/chatBox/chatPaneDropzone";
 import { NotConfiguredErrorBanner } from "@/features/chat/components/notConfiguredErrorBanner";
 import { LanguageModelInfo, RepoSearchScope } from "@/features/chat/types";
 import { useCreateNewChatThread } from "@/features/chat/useCreateNewChatThread";
 import { DISABLED_MCP_SERVER_IDS_LOCAL_STORAGE_KEY } from "@/features/chat/constants";
 import { getRepoImageSrc } from '@/lib/utils';
-import { useMemo, useState } from "react";
+import { useMemo, useRef, useState } from "react";
 import { useLocalStorage } from "usehooks-ts";
 
 interface LandingPageProps {
@@ -33,6 +34,7 @@ export const LandingPage = ({
     const { createNewChatThread, isLoading } = useCreateNewChatThread();
     const [isContextSelectorOpen, setIsContextSelectorOpen] = useState(false);
     const [disabledMcpServerIds, setDisabledMcpServerIds] = useLocalStorage<string[]>(DISABLED_MCP_SERVER_IDS_LOCAL_STORAGE_KEY, [], { initializeWithValue: false });
+    const chatBoxRef = useRef<ChatBoxHandle>(null);
     const isChatBoxDisabled = languageModels.length === 0;
 
     const selectedSearchScopes = useMemo(() => [
@@ -67,11 +69,16 @@ export const LandingPage = ({
                 </div>
 
                 {/* ChatBox */}
-                <div className="w-full">
+                <ChatPaneDropzone
+                    className="w-full"
+                    onFilesDropped={(files) => chatBoxRef.current?.addFiles(files)}
+                    disabled={isChatBoxDisabled}
+                >
                     <div className="border rounded-md w-full shadow-sm">
                         <ChatBox
-                            onSubmit={(children) => {
-                                createNewChatThread(children, selectedSearchScopes, disabledMcpServerIds);
+                            ref={chatBoxRef}
+                            onSubmit={(children, _editor, attachments) => {
+                                createNewChatThread(children, selectedSearchScopes, disabledMcpServerIds, attachments);
                             }}
                             className="min-h-[50px]"
                             isRedirecting={isLoading}
@@ -107,7 +114,7 @@ export const LandingPage = ({
                     {isChatBoxDisabled && (
                         <NotConfiguredErrorBanner className="mt-4" />
                     )}
-                </div>
+                </ChatPaneDropzone>
             </div>
         </div>
     )
diff --git a/packages/web/src/app/(app)/chat/chatLandingPage.tsx b/packages/web/src/app/(app)/chat/chatLandingPage.tsx
index 5bd84a3d0..f81e27247 100644
--- a/packages/web/src/app/(app)/chat/chatLandingPage.tsx
+++ b/packages/web/src/app/(app)/chat/chatLandingPage.tsx
@@ -5,6 +5,7 @@ import { CustomSlateEditor } from "@/features/chat/customSlateEditor";
 import { ServiceErrorException } from "@/lib/serviceError";
 import { isServiceError, measure } from "@/lib/utils";
 import { LandingPageChatBox } from "./components/landingPageChatBox";
+import { ChatLandingDropzone } from "./components/chatLandingDropzone";
 import { RepositoryCarousel } from "../components/repositoryCarousel";
 import { Separator } from "@/components/ui/separator";
 import { DemoCards } from "./components/demoCards";
@@ -56,7 +57,7 @@ export async function ChatLandingPage() {
     })() : undefined;
 
     return (
-        <div className="flex flex-col items-center h-full overflow-hidden">
+        <ChatLandingDropzone disabled={languageModels.length === 0}>
                 <div className="flex flex-col items-center h-full overflow-y-auto pt-8 pb-8 md:pt-16 w-full px-5">
                     <div className="max-h-44 w-auto">
                         <SourcebotLogo
@@ -92,6 +93,6 @@ export async function ChatLandingPage() {
                         </>
                     )}
                 </div>
-        </div>
+        </ChatLandingDropzone>
     )
 }
diff --git a/packages/web/src/app/(app)/chat/components/chatLandingDropzone.tsx b/packages/web/src/app/(app)/chat/components/chatLandingDropzone.tsx
new file mode 100644
index 000000000..821b3a5ba
--- /dev/null
+++ b/packages/web/src/app/(app)/chat/components/chatLandingDropzone.tsx
@@ -0,0 +1,44 @@
+'use client';
+
+import { ChatBoxHandle } from "@/features/chat/components/chatBox";
+import { ChatPaneDropzone } from "@/features/chat/components/chatBox/chatPaneDropzone";
+import { createContext, ReactNode, useCallback, useContext, useRef } from "react";
+
+type RegisterChatBoxHandle = (handle: ChatBoxHandle | null) => void;
+
+const LandingChatBoxContext = createContext<RegisterChatBoxHandle | null>(null);
+
+// Lets the (nested) landing chat box register its imperative handle so the
+// pane-level drop zone can forward dropped files into it. Outside the provider
+// it returns one shared no-op, so the ref callback identity is stable across renders.
+const noopRegisterChatBox: RegisterChatBoxHandle = () => { };
+export const useRegisterLandingChatBox = (): RegisterChatBoxHandle =>
+    useContext(LandingChatBoxContext) ?? noopRegisterChatBox;
+
+interface ChatLandingDropzoneProps {
+    disabled?: boolean; // passed straight through to ChatPaneDropzone
+    children: ReactNode;
+}
+
+// Wraps the unstarted-chat landing pane in a drag-and-drop target. The chat box
+// sits deeper in the tree (behind a server/client boundary), so it registers its
+// handle via context; drops are ignored while no chat box is registered.
+export const ChatLandingDropzone = ({ disabled, children }: ChatLandingDropzoneProps) => {
+    const handleRef = useRef<ChatBoxHandle | null>(null); // latest registered handle, or null
+
+    const register = useCallback<RegisterChatBoxHandle>((handle) => {
+        handleRef.current = handle;
+    }, []); // empty deps keep `register` stable across renders
+
+    return (
+        <LandingChatBoxContext.Provider value={register}>
+            <ChatPaneDropzone
+                className="flex flex-col items-center h-full overflow-hidden"
+                onFilesDropped={(files) => handleRef.current?.addFiles(files)}
+                disabled={disabled}
+            >
+                {children}
+            </ChatPaneDropzone>
+        </LandingChatBoxContext.Provider>
+    )
+}
diff --git a/packages/web/src/app/(app)/chat/components/landingPageChatBox.tsx b/packages/web/src/app/(app)/chat/components/landingPageChatBox.tsx
index ed749450f..61f78bf4d 100644
--- a/packages/web/src/app/(app)/chat/components/landingPageChatBox.tsx
+++ b/packages/web/src/app/(app)/chat/components/landingPageChatBox.tsx
@@ -7,6 +7,7 @@ import { LanguageModelInfo, SearchScope } from "@/features/chat/types";
 import { useCreateNewChatThread } from "@/features/chat/useCreateNewChatThread";
 import { RepositoryQuery, SearchContextQuery } from "@/lib/types";
 import { useState } from "react";
+import { useRegisterLandingChatBox } from "./chatLandingDropzone";
 import { useLocalStorage } from "usehooks-ts";
 import { DISABLED_MCP_SERVER_IDS_LOCAL_STORAGE_KEY, SELECTED_SEARCH_SCOPES_LOCAL_STORAGE_KEY } from "@/features/chat/constants";
 import { SearchModeSelector } from "../../components/searchModeSelector";
@@ -31,14 +32,16 @@ export const LandingPageChatBox = ({
     const [selectedSearchScopes, setSelectedSearchScopes] = useLocalStorage<SearchScope[]>(SELECTED_SEARCH_SCOPES_LOCAL_STORAGE_KEY, [], { initializeWithValue: false });
     const [disabledMcpServerIds, setDisabledMcpServerIds] = useLocalStorage<string[]>(DISABLED_MCP_SERVER_IDS_LOCAL_STORAGE_KEY, [], { initializeWithValue: false });
     const [isContextSelectorOpen, setIsContextSelectorOpen] = useState(false);
+    const registerChatBox = useRegisterLandingChatBox();
     const isChatBoxDisabled = languageModels.length === 0;
 
     return (
         <div className="w-full max-w-[800px] mt-4">
             <div className="border rounded-md w-full shadow-sm">
                 <ChatBox
-                    onSubmit={(children) => {
-                        createNewChatThread(children, selectedSearchScopes, disabledMcpServerIds);
+                    ref={registerChatBox}
+                    onSubmit={(children, _editor, attachments) => {
+                        createNewChatThread(children, selectedSearchScopes, disabledMcpServerIds, attachments);
                     }}
                     className="min-h-[50px]"
                     isRedirecting={isLoading}
@@ -74,6 +77,6 @@ export const LandingPageChatBox = ({
             {isChatBoxDisabled && (
                 <NotConfiguredErrorBanner className="mt-4" />
             )}
-        </div >
+        </div>
     )
 }
diff --git a/packages/web/src/ee/features/chat/agent.ts b/packages/web/src/ee/features/chat/agent.ts
index 3a300a08b..abbafc9a9 100644
--- a/packages/web/src/ee/features/chat/agent.ts
+++ b/packages/web/src/ee/features/chat/agent.ts
@@ -22,7 +22,8 @@ import { randomUUID } from "crypto";
 import _dedent from "dedent";
 import { ANSWER_TAG, FILE_REFERENCE_PREFIX } from "@/features/chat/constants";
 import { Source } from "@/features/chat/types";
-import { addLineNumbers, fileReferenceToString, getAnswerPartFromAssistantMessage, getTurnProgressState, getUserMessageText } from "@/features/chat/utils";
+import { addLineNumbers, fileReferenceToString, formatAttachmentsForPrompt, getAnswerPartFromAssistantMessage, getTurnProgressState, getUserMessageAttachments, getUserMessageText } from "@/features/chat/utils";
+import { ATTACHMENT_MAX_TEXT_BYTES } from "@/features/chat/constants";
 import { createTools } from "./tools";
 import { getConnectedMcpClients } from "@/ee/features/chat/mcp/mcpClientFactory";
 import { getMcpTools, McpToolsResult } from "@/ee/features/chat/mcp/mcpToolSets";
@@ -105,9 +106,17 @@ export const createMessageStream = async ({
     let messageHistory: ModelMessage[] =
         messages.map((message, index): ModelMessage | undefined => {
             if (message.role === 'user') {
+                // Fold any inline-text attachments into this turn's content (not
+                // the system prompt) so they stay bound to the turn they were
+                // attached to and are re-emitted per turn from the persisted parts.
+                const text = getUserMessageText(message);
+                const attachmentsBlock = formatAttachmentsForPrompt(
+                    getUserMessageAttachments(message),
+                    ATTACHMENT_MAX_TEXT_BYTES,
+                );
                 return {
                     role: 'user',
-                    content: getUserMessageText(message),
+                    content: attachmentsBlock ? `${text}\n\n${attachmentsBlock}` : text,
                 };
             }
 
diff --git a/packages/web/src/ee/features/chat/components/chatThread/chatThread.tsx b/packages/web/src/ee/features/chat/components/chatThread/chatThread.tsx
index 87faf79f8..a7dfa82af 100644
--- a/packages/web/src/ee/features/chat/components/chatThread/chatThread.tsx
+++ b/packages/web/src/ee/features/chat/components/chatThread/chatThread.tsx
@@ -4,7 +4,7 @@ import { useToast } from '@/components/hooks/use-toast';
 import { Button } from '@/components/ui/button';
 import { Separator } from '@/components/ui/separator';
 import { CustomSlateEditor } from '@/features/chat/customSlateEditor';
-import { AdditionalChatRequestParams, CustomEditor, LanguageModelInfo, SBChatMessage, SearchScope, Source } from '@/features/chat/types';
+import { AdditionalChatRequestParams, AttachmentData, CustomEditor, LanguageModelInfo, SBChatMessage, SearchScope, Source } from '@/features/chat/types';
 import { createUIMessage, getAllMentionElements, getTurnProgressState, getUserMessageText, resetEditor, slateContentToString } from '@/features/chat/utils';
 import { useChat } from '@ai-sdk/react';
 import { CreateUIMessage, DefaultChatTransport, lastAssistantMessageIsCompleteWithApprovalResponses } from 'ai';
@@ -15,8 +15,9 @@ import { useStickToBottom } from 'use-stick-to-bottom';
 import { Descendant } from 'slate';
 import { useMessagePairs } from '../../useMessagePairs';
 import { useSelectedLanguageModel } from '@/features/chat/useSelectedLanguageModel';
-import { ChatBox } from '@/features/chat/components/chatBox';
+import { ChatBox, ChatBoxHandle } from '@/features/chat/components/chatBox';
 import { ChatBoxToolbar } from '@/features/chat/components/chatBox/chatBoxToolbar';
+import { ChatPaneDropzone } from '@/features/chat/components/chatBox/chatPaneDropzone';
 import { ChatThreadListItem } from './chatThreadListItem';
 import { ErrorBanner } from './errorBanner';
 import { McpFailedServersBanner } from './mcpFailedServersBanner';
@@ -72,6 +73,7 @@ export const ChatThread = ({
 }: ChatThreadProps) => {
     const [isErrorBannerVisible, setIsErrorBannerVisible] = useState(false);
     const hasSubmittedInputMessage = useRef(false);
+    const chatBoxRef = useRef<ChatBoxHandle>(null);
     const { scrollRef, contentRef, scrollToBottom, isAtBottom } = useStickToBottom({ initial: false });
     const { toast } = useToast();
     const router = useRouter();
@@ -347,11 +349,11 @@ export const ChatThread = ({
         }
     }, [error]);
 
-    const onSubmit = useCallback(async (children: Descendant[], editor: CustomEditor) => {
+    const onSubmit = useCallback(async (children: Descendant[], editor: CustomEditor, attachments: AttachmentData[]) => {
         const text = slateContentToString(children);
         const mentions = getAllMentionElements(children);
 
-        const message = createUIMessage(text, mentions.map(({ data }) => data), selectedSearchScopes, disabledMcpServerIds);
+        const message = createUIMessage(text, mentions.map(({ data }) => data), selectedSearchScopes, disabledMcpServerIds, attachments);
         sendMessage(message);
 
         scrollToBottom();
@@ -381,6 +383,11 @@ export const ChatThread = ({
     return (
         <ToolApprovalProvider value={addToolApprovalResponse}>
         <McpServerIconContext.Provider value={mcpServerIconMap}>
+        <ChatPaneDropzone
+            className="flex flex-col flex-1 min-h-0 w-full"
+            onFilesDropped={(files) => chatBoxRef.current?.addFiles(files)}
+            disabled={!isOwner || languageModels.length === 0}
+        >
             {error && (
                 <ErrorBanner
                     error={error}
@@ -470,6 +477,7 @@ export const ChatThread = ({
                         <div className="border rounded-md w-full shadow-sm">
                             <CustomSlateEditor>
                                 <ChatBox
+                                    ref={chatBoxRef}
                                     onSubmit={onSubmit}
                                     className="min-h-[80px]"
                                     preferredSuggestionsBoxPlacement="top-start"
@@ -520,6 +528,7 @@ export const ChatThread = ({
                     </div>
                 )}
             </div>
+        </ChatPaneDropzone>
         </McpServerIconContext.Provider>
         </ToolApprovalProvider>
     );
diff --git a/packages/web/src/ee/features/chat/components/chatThread/chatThreadListItem.tsx b/packages/web/src/ee/features/chat/components/chatThread/chatThreadListItem.tsx
index 6b79de0e6..b296dde53 100644
--- a/packages/web/src/ee/features/chat/components/chatThread/chatThreadListItem.tsx
+++ b/packages/web/src/ee/features/chat/components/chatThread/chatThreadListItem.tsx
@@ -8,8 +8,9 @@ import { CSSProperties, forwardRef, memo, useCallback, useEffect, useMemo, useRe
 import scrollIntoView from 'scroll-into-view-if-needed';
 import { Reference, referenceSchema, SBChatMessage, Source } from "@/features/chat/types";
 import { useExtractReferences } from '../../useExtractReferences';
-import { getAnswerPartFromAssistantMessage, getLastStepParts, getUserMessageText, groupMessageIntoSteps, isSBChatToolPart, repairReferences, tryResolveFileReference } from '@/features/chat/utils';
+import { getAnswerPartFromAssistantMessage, getLastStepParts, getUserMessageAttachments, getUserMessageText, groupMessageIntoSteps, isSBChatToolPart, repairReferences, tryResolveFileReference } from '@/features/chat/utils';
 import { AnswerCard } from './answerCard';
+import { MessageAttachments } from './messageAttachments';
 import { DetailsCard } from './detailsCard';
 import { ApprovalRequestedToolPart, ToolApprovalBanner } from './toolApprovalBanner';
 import { MarkdownRenderer, REFERENCE_PAYLOAD_ATTRIBUTE } from './markdownRenderer';
@@ -52,6 +53,10 @@ const ChatThreadListItemComponent = forwardRef<HTMLDivElement, ChatThreadListIte
         return getUserMessageText(userMessage);
     }, [userMessage]);
 
+    const userAttachments = useMemo(() => {
+        return getUserMessageAttachments(userMessage);
+    }, [userMessage]);
+
     // Take the assistant message and repair any references that are not properly formatted.
     // This applies to parts that are text (i.e., text & reasoning).
     const assistantMessage = useMemo(() => {
@@ -370,27 +375,30 @@ const ChatThreadListItemComponent = forwardRef<HTMLDivElement, ChatThreadListIte
                         ref={leftPanelRef}
                         className="py-4 h-full"
                     >
-                        <div className="flex flex-row gap-2 mb-4">
-                            {isTurnInProgress ? (
-                                <Loader2 className="w-4 h-4 animate-spin flex-shrink-0 mt-1.5" />
-                            ) : (
-                                <CheckCircle className="w-4 h-4 text-green-700 flex-shrink-0 mt-1.5" />
+                        <div className="mb-4">
+                            {userAttachments.length > 0 && (
+                                <MessageAttachments attachments={userAttachments} className="mb-1.5 ml-6" />
                             )}
-                            <MarkdownRenderer
-                                content={userQuestion.trim()}
-                                className="prose-p:m-0"
-                                escapeHtml={true}
-                            />
+
+                            <div className="flex flex-row gap-2">
+                                {isTurnInProgress ? (
+                                    <Loader2 className="w-4 h-4 animate-spin flex-shrink-0 mt-1.5" />
+                                ) : (
+                                    <CheckCircle className="w-4 h-4 text-green-700 flex-shrink-0 mt-1.5" />
+                                )}
+                                <MarkdownRenderer
+                                    content={userQuestion.trim()}
+                                    className="prose-p:m-0"
+                                    escapeHtml={true}
+                                />
+                            </div>
                         </div>
 
                         {isThinking && (
-                            <div className="space-y-4 mb-4">
-                                <Skeleton className="h-4 max-w-32" />
-                                <div className="space-y-2">
-                                    <Skeleton className="h-3 max-w-72" />
-                                    <Skeleton className="h-3 max-w-64" />
-                                    <Skeleton className="h-3 max-w-56" />
-                                </div>
+                            <div className="space-y-2 mb-4">
+                                <Skeleton className="h-3 w-full max-w-80" />
+                                <Skeleton className="h-3 w-full max-w-72" />
+                                <Skeleton className="h-3 w-full max-w-56" />
                             </div>
                         )}
 
diff --git a/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx b/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx
index 7b6c7867f..c705b6be5 100644
--- a/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx
+++ b/packages/web/src/ee/features/chat/components/chatThread/detailsCard.tsx
@@ -537,6 +537,7 @@ export const StepPartRenderer = ({ part, toolTokenUsageMap }: { part: SBChatMess
         case 'data-source':
         case 'data-mcp-server':
         case 'data-mcp-failed-server':
+        case 'data-attachment':
         case 'file':
         case 'source-document':
         case 'source-url':
diff --git a/packages/web/src/ee/features/chat/components/chatThread/messageAttachments.tsx b/packages/web/src/ee/features/chat/components/chatThread/messageAttachments.tsx
new file mode 100644
index 000000000..7d2b5040e
--- /dev/null
+++ b/packages/web/src/ee/features/chat/components/chatThread/messageAttachments.tsx
@@ -0,0 +1,47 @@
+'use client';
+
+import { VscodeFileIcon } from "@/app/components/vscodeFileIcon";
+import { AttachmentViewerDialog } from "@/features/chat/components/chatBox/attachmentViewerDialog";
+import { AttachmentData } from "@/features/chat/types";
+import { cn } from "@/lib/utils";
+import { useState } from "react";
+
+interface MessageAttachmentsProps {
+    attachments: AttachmentData[];
+    className?: string;
+}
+// Renders attachments as a row of filename chips; clicking a chip opens it in AttachmentViewerDialog.
+export const MessageAttachments = ({ attachments, className }: MessageAttachmentsProps) => {
+    const [activeAttachment, setActiveAttachment] = useState<AttachmentData | null>(null);
+    // With no attachments, render nothing at all (the dialog included).
+    if (attachments.length === 0) {
+        return null;
+    }
+
+    return (
+        <>
+            <div className={cn("flex flex-row flex-wrap gap-1.5", className)}>
+                {attachments.map((attachment, index) => (
+                    <button
+                        key={`${attachment.filename}-${index}`}
+                        type="button"
+                        onClick={() => setActiveAttachment(attachment)}
+                        className="flex flex-row items-center gap-1 rounded bg-muted px-1.5 py-0.5 text-xs hover:bg-accent transition-colors"
+                        title={`View ${attachment.filename}`}
+                    >
+                        <VscodeFileIcon fileName={attachment.filename} className="w-3 h-3" />
+                        <span className="font-mono max-w-[160px] truncate">
+                            {attachment.filename}
+                        </span>
+                    </button>
+                ))}
+            </div>
+            <AttachmentViewerDialog
+                open={activeAttachment !== null}
+                onOpenChange={(open) => !open && setActiveAttachment(null)}
+                filename={activeAttachment?.filename}
+                text={activeAttachment?.kind === 'text' ? activeAttachment.text : undefined}
+            />
+        </>
+    )
+}
diff --git a/packages/web/src/features/chat/attachmentUtils.ts b/packages/web/src/features/chat/attachmentUtils.ts
new file mode 100644
index 000000000..75828cb32
--- /dev/null
+++ b/packages/web/src/features/chat/attachmentUtils.ts
@@ -0,0 +1,164 @@
+'use client';
+
+import {
+    ATTACHMENT_ALLOWED_TEXT_EXTENSIONS,
+    ATTACHMENT_ALLOWED_TEXT_MIME_TYPES,
+    ATTACHMENT_MAX_COUNT,
+    ATTACHMENT_MAX_FILENAME_LENGTH,
+    ATTACHMENT_MAX_TEXT_BYTES,
+} from "./constants";
+import { AttachmentData, TextAttachment } from "./types";
+
+// Normalizes an untrusted filename: keeps only the basename, drops ASCII control
+// characters (which could break the prompt's `<attachment filename="...">` tag
+// or the UI), collapses whitespace, and caps the length, keeping the extension
+// when it fits. Long/abusive names are truncated rather than rejected.
+export const sanitizeFilename = (name: string): string => {
+    const basename = name.split(/[\\/]/).pop() ?? name;
+    const cleaned = Array.from(basename)
+        .filter((char) => {
+            const code = char.charCodeAt(0);
+            return code >= 32 && code !== 127;
+        })
+        .join('')
+        .replace(/\s+/g, ' ')
+        .trim() || 'attachment';
+    if (cleaned.length <= ATTACHMENT_MAX_FILENAME_LENGTH) {
+        return cleaned;
+    }
+    // A suffix only counts as an extension if it leaves room for a stem character
+    // and the ellipsis. A lone high surrogate left at the cut point is dropped.
+    const dotIndex = cleaned.lastIndexOf('.');
+    const keepsExtension = dotIndex > 0 && cleaned.length - dotIndex <= ATTACHMENT_MAX_FILENAME_LENGTH - 2;
+    const extension = keepsExtension ? cleaned.slice(dotIndex) : '';
+    const stem = cleaned.slice(0, ATTACHMENT_MAX_FILENAME_LENGTH - extension.length - 1);
+    return `${stem.replace(/[\uD800-\uDBFF]$/, '')}…${extension}`;
+}
+
+// A text attachment selected in the chat box but not yet submitted. The `id`
+// is a client-only key for list rendering and removal; it is stripped before
+// the attachment becomes part of a message.
+export type PendingAttachment = TextAttachment & { id: string };
+
+// Builds the comma-separated `accept` value for a native `<input type=file>`:
+// `text/*`, then each allowed MIME type, then each allowed extension with a leading dot.
+export const getAttachmentAcceptAttribute = (): string => {
+    const dottedExtensions = ATTACHMENT_ALLOWED_TEXT_EXTENSIONS.map((ext) => '.' + ext);
+    const tokens = ['text/*']
+        .concat(ATTACHMENT_ALLOWED_TEXT_MIME_TYPES)
+        .concat(dottedExtensions);
+    return tokens.join(',');
+}
+
+// Builds the `accept` map handed to react-dropzone. `text/*` and every allowed
+// MIME type map to an empty extension list, while `text/plain` carries the full
+// dotted extension allowlist so code files that report an empty or unusual MIME
+// type are still selectable by extension.
+export const getAttachmentDropzoneAccept = (): Record<string, string[]> => {
+    const dottedExtensions = ATTACHMENT_ALLOWED_TEXT_EXTENSIONS.map((ext) => '.' + ext);
+    const entries: [string, string[]][] = [
+        ['text/*', []],
+        ['text/plain', dottedExtensions],
+        ...ATTACHMENT_ALLOWED_TEXT_MIME_TYPES.map((mime): [string, string[]] => [mime, []]),
+    ];
+
+    return Object.fromEntries(entries);
+}
+
+export const toAttachmentData = (attachment: PendingAttachment): AttachmentData => {
+    // Pick the message-level fields explicitly so the client-only `id` (and any
+    // other extra property) is not carried over into the submitted attachment.
+    const { kind, filename, mediaType, sizeBytes, text } = attachment;
+
+    const data: AttachmentData = { kind, filename, mediaType, sizeBytes, text };
+
+    return data;
+}
+
+const getExtension = (filename: string): string => {
+    const lowered = filename.toLowerCase();
+    return lowered.includes('.') ? lowered.slice(lowered.lastIndexOf('.') + 1) : '';
+}
+
+export const isAllowedTextFile = (file: File): boolean => {
+    const mimeType = file.type;
+
+    const acceptedByMime =
+        mimeType.startsWith('text/') ||
+        ATTACHMENT_ALLOWED_TEXT_MIME_TYPES.includes(mimeType);
+    if (acceptedByMime) {
+        return true;
+    }
+
+    // Source files frequently arrive with an empty or unusual MIME type, so
+    // the extension allowlist is consulted next. Names without a dot (e.g.
+    // "Dockerfile") have an empty extension, so the whole lowercased name is
+    // tried against the same list as a last resort.
+    const allowlist = ATTACHMENT_ALLOWED_TEXT_EXTENSIONS;
+    const candidates = [
+        getExtension(file.name),
+        file.name.toLowerCase(),
+    ];
+
+    return candidates.some((candidate) => allowlist.includes(candidate));
+}
+
+const readAsText = (file: File): Promise<string> => new Promise<string>((resolve, reject) => {
+    const fileReader = new FileReader();
+    fileReader.addEventListener('load', () => {
+        resolve(typeof fileReader.result === 'string' ? fileReader.result : '');
+    });
+    fileReader.addEventListener('error', () => reject(fileReader.error ?? new Error('Failed to read file')));
+    fileReader.readAsText(file);
+});
+
+export type ReadFilesResult = {
+    attachments: PendingAttachment[];
+    errors: string[];
+};
+
+// Reads and validates a set of files into pending text attachments, enforcing
+// the per-message count (`existingCount` plus files accepted here), per-file size,
+// and allowed-type caps. Rejected files yield a readable error instead of throwing.
+export const readFilesAsAttachments = async (
+    files: File[],
+    existingCount: number,
+): Promise<ReadFilesResult> => {
+    const attachments: PendingAttachment[] = [];
+    const errors: string[] = [];
+    // `crypto.randomUUID` exists only in secure contexts (HTTPS/localhost). The id is
+    // merely a client-side list key, so fall back instead of failing on plain HTTP.
+    const createId = (): string => typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function'
+        ? crypto.randomUUID()
+        : `${Date.now().toString(36)}-${Math.random().toString(36).slice(2)}-${attachments.length}`;
+
+    for (const file of files) {
+        if (existingCount + attachments.length >= ATTACHMENT_MAX_COUNT) {
+            errors.push(`You can attach at most ${ATTACHMENT_MAX_COUNT} files per message.`);
+            break;
+        }
+        if (!isAllowedTextFile(file)) {
+            errors.push(`${file.name}: unsupported file type (text files only).`);
+            continue;
+        }
+        if (file.size > ATTACHMENT_MAX_TEXT_BYTES) {
+            errors.push(`${file.name}: exceeds the ${Math.round(ATTACHMENT_MAX_TEXT_BYTES / 1024)}KB limit.`);
+            continue;
+        }
+        try {
+            const text = await readAsText(file);
+            attachments.push({
+                id: createId(),
+                kind: 'text',
+                filename: sanitizeFilename(file.name),
+                mediaType: file.type || 'text/plain',
+                sizeBytes: file.size,
+                text,
+            });
+        } catch {
+            errors.push(`${file.name}: failed to read file.`);
+        }
+    }
+
+    return { attachments, errors };
+}
diff --git a/packages/web/src/features/chat/components/chatBox/attachmentButton.tsx b/packages/web/src/features/chat/components/chatBox/attachmentButton.tsx
new file mode 100644
index 000000000..fef235c06
--- /dev/null
+++ b/packages/web/src/features/chat/components/chatBox/attachmentButton.tsx
@@ -0,0 +1,54 @@
+'use client';
+
+import { Button } from "@/components/ui/button";
+import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
+import { getAttachmentAcceptAttribute } from "@/features/chat/attachmentUtils";
+import { Paperclip } from "lucide-react";
+import { useRef } from "react";
+
+interface AttachmentButtonProps {
+    onAddFiles: (files: File[]) => void;
+    disabled?: boolean;
+}
+
+export const AttachmentButton = ({ onAddFiles, disabled }: AttachmentButtonProps) => {
+    const pickerRef = useRef<HTMLInputElement>(null);
+    const acceptedTypes = getAttachmentAcceptAttribute();
+    const openPicker = () => { pickerRef.current?.click(); };
+
+    return (
+        <>
+            <input
+                ref={pickerRef}
+                type="file"
+                multiple
+                accept={acceptedTypes}
+                className="hidden"
+                onChange={(event) => {
+                    const picked = Array.from(event.target.files ?? []);
+                    if (picked.length > 0) {
+                        onAddFiles(picked);
+                    }
+                    // Clear the value so picking the same file again still fires onChange.
+                    event.target.value = '';
+                }}
+            />
+            <Tooltip>
+                <TooltipTrigger asChild>
+                    <Button
+                        type="button"
+                        variant="ghost"
+                        size="icon"
+                        className="w-6 h-6 text-muted-foreground hover:text-foreground"
+                        disabled={disabled}
+                        onClick={openPicker}
+                        aria-label="Attach files"
+                    >
+                        <Paperclip className="w-4 h-4" />
+                    </Button>
+                </TooltipTrigger>
+                <TooltipContent>Attach text files</TooltipContent>
+            </Tooltip>
+        </>
+    )
+}
diff --git a/packages/web/src/features/chat/components/chatBox/attachmentTray.tsx b/packages/web/src/features/chat/components/chatBox/attachmentTray.tsx
new file mode 100644
index 000000000..090320e92
--- /dev/null
+++ b/packages/web/src/features/chat/components/chatBox/attachmentTray.tsx
@@ -0,0 +1,61 @@
+'use client';
+
+import { VscodeFileIcon } from "@/app/components/vscodeFileIcon";
+import { cn } from "@/lib/utils";
+import { X } from "lucide-react";
+import { useState } from "react";
+import { PendingAttachment } from "../../attachmentUtils";
+import { AttachmentViewerDialog } from "./attachmentViewerDialog";
+
+interface AttachmentTrayProps {
+    attachments: PendingAttachment[];
+    onRemove: (id: string) => void;
+    className?: string;
+}
+
+export const AttachmentTray = ({ attachments, onRemove, className }: AttachmentTrayProps) => {
+    // Staged attachment currently open in the viewer dialog (`null` = closed).
+    const [previewed, setPreviewed] = useState<PendingAttachment | null>(null);
+    if (attachments.length === 0) {
+        return null;
+    }
+
+    const chips = attachments.map((item) => {
+        const { id, filename } = item;
+        return (
+            <div
+                key={id}
+                className="flex flex-row items-center gap-1 rounded bg-muted px-1.5 py-0.5 text-xs"
+            >
+                <button
+                    type="button"
+                    onClick={() => setPreviewed(item)}
+                    className="flex flex-row items-center gap-1 hover:text-foreground"
+                    title={`View ${filename}`}
+                >
+                    <VscodeFileIcon fileName={filename} className="w-3 h-3" />
+                    <span className="font-mono max-w-[160px] truncate">{filename}</span>
+                </button>
+                <button
+                    type="button"
+                    onClick={() => onRemove(id)}
+                    className="text-muted-foreground hover:text-foreground"
+                    aria-label={`Remove ${filename}`}
+                >
+                    <X className="w-3 h-3" />
+                </button>
+            </div>
+        );
+    });
+    return (
+        <>
+            <div className={cn("flex flex-row flex-wrap gap-1.5", className)}>{chips}</div>
+            <AttachmentViewerDialog
+                open={previewed !== null}
+                onOpenChange={(isOpen) => { if (!isOpen) { setPreviewed(null); } }}
+                filename={previewed?.filename}
+                text={previewed?.text}
+            />
+        </>
+    )
+}
diff --git a/packages/web/src/features/chat/components/chatBox/attachmentViewerDialog.tsx b/packages/web/src/features/chat/components/chatBox/attachmentViewerDialog.tsx
new file mode 100644
index 000000000..588a142df
--- /dev/null
+++ b/packages/web/src/features/chat/components/chatBox/attachmentViewerDialog.tsx
@@ -0,0 +1,29 @@
+'use client';
+
+import { Dialog, DialogContent, DialogHeader, DialogTitle } from "@/components/ui/dialog";
+
+interface AttachmentViewerDialogProps {
+    filename?: string;
+    text?: string;
+    open: boolean;
+    onOpenChange: (open: boolean) => void;
+}
+
+// Shared read-only viewer for an inline-text attachment's contents. Used for
+// both staged (not-yet-sent) and sent attachments.
+export const AttachmentViewerDialog = ({ filename, text, open, onOpenChange }: AttachmentViewerDialogProps) => {
+    return (
+        <Dialog open={open} onOpenChange={onOpenChange}>
+            {/* No description is rendered, so opt out of `aria-describedby` explicitly.
+                NOTE(review): assumes the shadcn/Radix dialog wrapper; confirm. */}
+            <DialogContent className="max-w-3xl" aria-describedby={undefined}>
+                <DialogHeader>
+                    <DialogTitle className="font-mono text-sm break-all">{filename}</DialogTitle>
+                </DialogHeader>
+                <pre className="max-h-[60vh] overflow-auto rounded bg-muted p-3 text-xs whitespace-pre-wrap break-words">
+                    {text}
+                </pre>
+            </DialogContent>
+        </Dialog>
+    )
+}
diff --git a/packages/web/src/features/chat/components/chatBox/chatBox.tsx b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
index e405e8266..e8a624cb8 100644
--- a/packages/web/src/features/chat/components/chatBox/chatBox.tsx
+++ b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
@@ -3,13 +3,16 @@
 import { VscodeFileIcon } from "@/app/components/vscodeFileIcon";
 import { Button } from "@/components/ui/button";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
-import { CustomEditor, MentionElement, RenderElementPropsFor, SearchScope } from "@/features/chat/types";
+import { AttachmentData, CustomEditor, MentionElement, RenderElementPropsFor, SearchScope } from "@/features/chat/types";
 import { insertMention, slateContentToString } from "@/features/chat/utils";
+import { PendingAttachment, readFilesAsAttachments, toAttachmentData } from "@/features/chat/attachmentUtils";
+import { AttachmentButton } from "./attachmentButton";
+import { AttachmentTray } from "./attachmentTray";
 import { cn } from "@/lib/utils";
 import { useIsMac } from "@/hooks/useIsMac";
 import { computePosition, flip, offset, shift, VirtualElement } from "@floating-ui/react";
 import { ArrowUp, Loader2, StopCircleIcon } from "lucide-react";
-import { Fragment, KeyboardEvent, memo, useCallback, useEffect, useMemo, useRef, useState } from "react";
+import { forwardRef, Fragment, KeyboardEvent, memo, Ref, useCallback, useEffect, useImperativeHandle, useMemo, useRef, useState } from "react";
 import { useHotkeys } from "react-hotkeys-hook";
 import { Descendant, insertText } from "slate";
 import { Editable, ReactEditor, RenderElementProps, RenderLeafProps, useFocused, useSelected, useSlate } from "slate-react";
@@ -28,8 +31,12 @@ import useCaptureEvent from "@/hooks/useCaptureEvent";
 import { useHasEntitlement } from "@/features/entitlements/useHasEntitlement";
 import { UpsellDialog } from "@/features/billing/upsellDialog";
 
+export interface ChatBoxHandle {
+    addFiles: (files: File[]) => void;
+}
+
 interface ChatBoxProps {
-    onSubmit: (children: Descendant[], editor: CustomEditor) => void;
+    onSubmit: (children: Descendant[], editor: CustomEditor, attachments: AttachmentData[]) => void;
     onStop?: () => void;
     preferredSuggestionsBoxPlacement?: "top-start" | "bottom-start";
     className?: string;
@@ -56,7 +63,7 @@ const ChatBoxComponent = ({
     isAuthenticated,
     selectedSearchScopes,
     searchContexts,
-}: ChatBoxProps) => {
+}: ChatBoxProps, ref: Ref<ChatBoxHandle>) => {
     const suggestionsBoxRef = useRef<HTMLDivElement>(null);
     const [index, setIndex] = useState(0);
     const editor = useSlate();
@@ -85,8 +92,41 @@ const ChatBoxComponent = ({
     const isAskEnabled = useHasEntitlement('ask');
     const [isLoginDialogOpen, setIsLoginDialogOpen] = useState<boolean>(false);
     const [isUpsellDialogOpen, setIsUpsellDialogOpen] = useState<boolean>(false);
+    const [attachments, setAttachments] = useState<PendingAttachment[]>([]);
     const pathname = usePathname();
 
+    const onAddFiles = useCallback(async (files: File[]) => {
+        if (files.length === 0) {
+            return;
+        }
+
+        const { attachments: added, errors } = await readFilesAsAttachments(files, attachments.length);
+        if (added.length > 0) {
+            setAttachments((prev) => [...prev, ...added]);
+        }
+        if (errors.length > 0) {
+            toast({
+                description: `⚠️ ${errors.join(' ')}`,
+                variant: "destructive",
+            });
+        }
+
+        // Return focus to the prompt input so the user can keep typing.
+        ReactEditor.focus(editor);
+    }, [attachments.length, toast, editor]);
+
+    const removeAttachment = useCallback((id: string) => {
+        setAttachments((prev) => prev.filter((attachment) => attachment.id !== id));
+    }, []);
+
+    // Allow an ancestor pane-level drop zone to forward dropped files into this
+    // chat box (which owns attachment state). See `ChatPaneDropzone`.
+    useImperativeHandle(ref, () => ({
+        addFiles: (files: File[]) => {
+            void onAddFiles(files);
+        },
+    }), [onAddFiles]);
+
     // Reset the index when the suggestion mode changes.
     useEffect(() => {
         setIndex(0);
@@ -123,7 +163,7 @@ const ChatBoxComponent = ({
         isSubmitDisabled: false,
         isSubmitDisabledReason: undefined,
     } => {
-        if (slateContentToString(editor.children).trim().length === 0) {
+        if (slateContentToString(editor.children).trim().length === 0 && attachments.length === 0) {
             return {
                 isSubmitDisabled: true,
                 isSubmitDisabledReason: "empty",
@@ -157,7 +197,7 @@ const ChatBoxComponent = ({
             isSubmitDisabledReason: undefined,
         }
 
-    }, [editor.children, isRedirecting, isTurnInProgress, selectedLanguageModel])
+    }, [editor.children, isRedirecting, isTurnInProgress, selectedLanguageModel, attachments.length])
 
     const {
         requiresLogin,
@@ -202,7 +242,8 @@ const ChatBoxComponent = ({
             return;
         }
 
-        _onSubmit(editor.children, editor);
+        _onSubmit(editor.children, editor, attachments.map(toAttachmentData));
+        setAttachments([]);
     }, [
         isSubmitDisabled,
         requiresLogin,
@@ -212,7 +253,8 @@ const ChatBoxComponent = ({
         isSubmitDisabledReason,
         toast,
         pathname,
-        captureEvent
+        captureEvent,
+        attachments
     ]);
 
     useEffect(() => {
@@ -235,7 +277,8 @@ const ChatBoxComponent = ({
             }
 
             sessionStorage.removeItem(PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY);
-            _onSubmit(children, editor);
+            // Attachments are not persisted across the login/upgrade redirect.
+            _onSubmit(children, editor, []);
         } catch (error) {
             console.error('Failed to restore pending chat submission:', error);
             sessionStorage.removeItem(PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY);
@@ -364,6 +407,13 @@ const ChatBoxComponent = ({
             <div
                 className={cn("flex flex-col justify-between gap-0.5 w-full px-3 py-2", className)}
             >
+                {attachments.length > 0 && (
+                    <AttachmentTray
+                        attachments={attachments}
+                        onRemove={removeAttachment}
+                        className="mb-1.5"
+                    />
+                )}
                 <Editable
                     className="w-full focus-visible:outline-none focus-visible:ring-0 bg-background text-base disabled:cursor-not-allowed disabled:opacity-50 md:text-sm max-h-64 overflow-y-auto"
                     placeholder="Ask a question about your code. @mention files or select search scopes to refine your query."
@@ -371,8 +421,19 @@ const ChatBoxComponent = ({
                     renderLeaf={renderLeaf}
                     onKeyDown={onKeyDown}
                     readOnly={isDisabled}
+                    onPaste={(event) => {
+                        const files = event.clipboardData?.files ? Array.from(event.clipboardData.files) : [];
+                        if (files.length > 0) {
+                            event.preventDefault();
+                            void onAddFiles(files);
+                        }
+                    }}
                 />
-                <div className="ml-auto z-10">
+                <div className="flex flex-row items-center justify-end gap-1 z-10">
+                    <AttachmentButton
+                        onAddFiles={onAddFiles}
+                        disabled={isDisabled || isRedirecting || isTurnInProgress}
+                    />
                     {isRedirecting ? (
                         <Button
                             variant="default"
@@ -455,7 +516,7 @@ const ChatBoxComponent = ({
     )
 }
 
-export const ChatBox = memo(ChatBoxComponent, isEqual);
+export const ChatBox = memo(forwardRef(ChatBoxComponent), isEqual);
 
 const DefaultElement = (props: RenderElementProps) => {
     return <p {...props.attributes}>{props.children}</p>
diff --git a/packages/web/src/features/chat/components/chatBox/chatPaneDropzone.tsx b/packages/web/src/features/chat/components/chatBox/chatPaneDropzone.tsx
new file mode 100644
index 000000000..f7b5d0f59
--- /dev/null
+++ b/packages/web/src/features/chat/components/chatBox/chatPaneDropzone.tsx
@@ -0,0 +1,96 @@
+'use client';
+
+import { useToast } from "@/components/hooks/use-toast";
+import { getAttachmentDropzoneAccept } from "@/features/chat/attachmentUtils";
+import { cn } from "@/lib/utils";
+import { FileUp } from "lucide-react";
+import { ReactNode, useEffect, useState } from "react";
+import { useDropzone } from "react-dropzone";
+
+interface ChatPaneDropzoneProps {
+    onFilesDropped: (files: File[]) => void;
+    disabled?: boolean;
+    className?: string;
+    children: ReactNode;
+}
+
+// Turns a whole chat pane into a drop target for attachments. Accepted files are
+// handed to `onFilesDropped` (the chat box owns attachment state); rejected ones
+// raise a toast. `noClick`/`noKeyboard` stop the zone from reacting to clicks and
+// keys, because the file picker is opened from the attachment button instead.
+export const ChatPaneDropzone = ({ onFilesDropped, disabled, className, children }: ChatPaneDropzoneProps) => {
+    const { toast } = useToast();
+    // `hasFiles` is true only while the active drag carries files (rather than,
+    // say, a text selection dragged inside the editor) and gates the overlay.
+    // `fileCount` is the number of file items the drag reported, or 0 if none.
+    const [hasFiles, setHasFiles] = useState(false);
+    const [fileCount, setFileCount] = useState(0);
+
+    const { getRootProps, getInputProps, isDragActive, isDragReject } = useDropzone({
+        accept: getAttachmentDropzoneAccept(),
+        multiple: true,
+        noClick: true,
+        noKeyboard: true,
+        disabled,
+        onDrop: (accepted, rejected) => {
+            setHasFiles(false);
+            if (accepted.length > 0) {
+                onFilesDropped(accepted);
+            }
+
+            if (rejected.length === 0) {
+                return;
+            }
+            const rejectedNames = rejected.map(({ file }) => file.name).join(', ');
+            toast({
+                description: `⚠️ Unsupported file type: ${rejectedNames}. Text files only.`,
+                variant: "destructive",
+            });
+        },
+    });
+
+    // Once react-dropzone reports the drag is no longer active, reset the local
+    // drag state as well so the overlay cannot remain visible.
+    useEffect(() => {
+        if (isDragActive) {
+            return;
+        }
+        setHasFiles(false);
+        setFileCount(0);
+    }, [isDragActive]);
+
+    const overlayVisible = !disabled && isDragActive && hasFiles;
+    const frameTone = isDragReject ? "border-destructive bg-destructive/5" : "border-primary bg-primary/5";
+    const iconTone = isDragReject ? "text-destructive" : "text-primary";
+    const headline = isDragReject ? "Unsupported file type" : "Drop to attach";
+
+    return (
+        <div
+            {...getRootProps({
+                className: cn("relative", className),
+                onDragEnter: (event) => {
+                    const transfer = event.dataTransfer;
+                    setHasFiles((transfer?.types ?? []).includes('Files'));
+                    const entries = transfer?.items ? Array.from(transfer.items) : [];
+                    setFileCount(entries.filter((entry) => entry.kind === 'file').length);
+                },
+            })}
+        >
+            <input {...getInputProps()} />
+            {overlayVisible && (
+                <div className="absolute inset-0 z-30 flex items-center justify-center rounded-md bg-background/70 backdrop-blur-sm pointer-events-none animate-in fade-in-0 duration-150">
+                    <div className={cn("flex flex-col items-center gap-2 rounded-xl border-2 border-dashed px-8 py-6 shadow-lg animate-in fade-in-0 zoom-in-95 duration-150", frameTone)}>
+                        <FileUp className={cn("w-8 h-8", iconTone)} />
+                        <span className="text-lg font-medium text-foreground">{headline}</span>
+                        {fileCount > 0 && (
+                            <span className="text-xs text-muted-foreground">
+                                {fileCount} file{fileCount === 1 ? '' : 's'}
+                            </span>
+                        )}
+                    </div>
+                </div>
+            )}
+            {children}
+        </div>
+    )
+}
diff --git a/packages/web/src/features/chat/components/chatBox/index.ts b/packages/web/src/features/chat/components/chatBox/index.ts
index 94fcf52fc..b35b49675 100644
--- a/packages/web/src/features/chat/components/chatBox/index.ts
+++ b/packages/web/src/features/chat/components/chatBox/index.ts
@@ -1 +1,2 @@
-export { ChatBox } from "./chatBox";
\ No newline at end of file
+export { ChatBox } from "./chatBox";
+export type { ChatBoxHandle } from "./chatBox";
diff --git a/packages/web/src/features/chat/constants.ts b/packages/web/src/features/chat/constants.ts
index db518d2aa..4036e8c80 100644
--- a/packages/web/src/features/chat/constants.ts
+++ b/packages/web/src/features/chat/constants.ts
@@ -12,3 +12,32 @@ export const SET_CHAT_STATE_SESSION_STORAGE_KEY = 'setChatState';
 export const PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY = 'pendingChatSubmission';
 export const DISABLED_MCP_SERVER_IDS_LOCAL_STORAGE_KEY = 'disabledMcpServerIds';
 export const MCP_OAUTH_DRAFT_SESSION_STORAGE_KEY = 'mcpOAuthDraft';
+
+// Text attachment limits. Text is inlined into the message (and, for new
+// threads, into the sessionStorage stash), so caps are kept conservative to
+// bound `messages` JSON growth and stay well under the sessionStorage limit.
+export const ATTACHMENT_MAX_TEXT_BYTES = 256 * 1024; // 256KB per file
+export const ATTACHMENT_MAX_COUNT = 5; // per message
+export const ATTACHMENT_MAX_FILENAME_LENGTH = 200; // characters
+
+// Allowlist for inline-text attachments. Files are accepted if their MIME type
+// starts with `text/`, exactly matches an entry here, or their extension is in
+// ATTACHMENT_ALLOWED_TEXT_EXTENSIONS. Many code files report an empty MIME type
+// in the browser, hence the extension fallback.
+export const ATTACHMENT_ALLOWED_TEXT_MIME_TYPES = [
+    'application/json',
+    'application/xml',
+    'application/x-yaml',
+    'application/yaml',
+    'application/csv',
+    'application/toml',
+];
+
+export const ATTACHMENT_ALLOWED_TEXT_EXTENSIONS = [
+    'txt', 'md', 'markdown', 'log', 'csv', 'tsv', 'json', 'jsonl', 'yaml', 'yml',
+    'toml', 'ini', 'cfg', 'conf', 'env', 'xml', 'html', 'css', 'scss',
+    'js', 'jsx', 'ts', 'tsx', 'mjs', 'cjs', 'py', 'rb', 'go', 'rs', 'java',
+    'kt', 'kts', 'c', 'h', 'cpp', 'cc', 'hpp', 'cs', 'php', 'swift', 'scala',
+    'sh', 'bash', 'zsh', 'sql', 'graphql', 'gql', 'proto', 'dockerfile',
+    'gitignore', 'tf', 'tfvars', 'lua', 'r', 'pl', 'dart', 'vue', 'svelte',
+];
diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index 50792dcb8..5e05493c1 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -103,6 +103,24 @@ export type SBChatMessageToolTypes = {
     };
 };
 
+// A user-provided file attachment. The `text` variant carries the file's
+// extracted text inline (used for text/code/structured files); binary
+// attachments (images, PDFs) will later add a `blob` variant that references
+// stored bytes by id instead of inlining them.
+export const textAttachmentSchema = z.object({
+    kind: z.literal('text'),
+    filename: z.string(),
+    mediaType: z.string(),
+    sizeBytes: z.number(),
+    text: z.string(),
+});
+export type TextAttachment = z.infer<typeof textAttachmentSchema>;
+
+export const attachmentDataSchema = z.discriminatedUnion('kind', [
+    textAttachmentSchema,
+]);
+export type AttachmentData = z.infer<typeof attachmentDataSchema>;
+
 export type SBChatMessageDataParts = {
     // The `source` data type allows us to know what sources the LLM saw
     // during retrieval.
@@ -112,6 +130,8 @@ export type SBChatMessageDataParts = {
     "mcp-server": { sanitizedName: string; faviconUrl: string },
     // The `mcp-failed-server` data type surfaces MCP servers that failed to load their tools.
     "mcp-failed-server": { serverName: string },
+    // A user-provided file attachment included with the message.
+    "attachment": AttachmentData,
 }
 
 export type SBChatMessage = UIMessage<
diff --git a/packages/web/src/features/chat/useCreateNewChatThread.ts b/packages/web/src/features/chat/useCreateNewChatThread.ts
index f030f186d..ef5018ccf 100644
--- a/packages/web/src/features/chat/useCreateNewChatThread.ts
+++ b/packages/web/src/features/chat/useCreateNewChatThread.ts
@@ -9,7 +9,7 @@ import { useRouter } from "next/navigation";
 import { createChat } from "./actions";
 import { isServiceError } from "@/lib/utils";
 import { createPathWithQueryParams } from "@/lib/utils";
-import { SearchScope, SetChatStatePayload } from "./types";
+import { AttachmentData, SearchScope, SetChatStatePayload } from "./types";
 import { DISABLED_MCP_SERVER_IDS_LOCAL_STORAGE_KEY, SELECTED_SEARCH_SCOPES_LOCAL_STORAGE_KEY, SET_CHAT_STATE_SESSION_STORAGE_KEY } from "./constants";
 import { useSessionStorage } from "usehooks-ts";
 
@@ -19,7 +19,7 @@ export const useCreateNewChatThread = () => {
     const router = useRouter();
     const [, setChatState] = useSessionStorage<SetChatStatePayload | null>(SET_CHAT_STATE_SESSION_STORAGE_KEY, null);
 
-    const createNewChatThread = useCallback(async (children: Descendant[], overrideSearchScopes?: SearchScope[], overrideDisabledMcpServerIds?: string[]) => {
+    const createNewChatThread = useCallback(async (children: Descendant[], overrideSearchScopes?: SearchScope[], overrideDisabledMcpServerIds?: string[], attachments: AttachmentData[] = []) => {
         const text = slateContentToString(children);
         const mentions = getAllMentionElements(children);
 
@@ -41,7 +41,7 @@ export const useCreateNewChatThread = () => {
 
         const selectedSearchScopes = overrideSearchScopes ?? storedScopes;
         const disabledMcpServerIds = overrideDisabledMcpServerIds ?? storedDisabledMcpServerIds;
-        const inputMessage = createUIMessage(text, mentions.map((mention) => mention.data), selectedSearchScopes, disabledMcpServerIds);
+        const inputMessage = createUIMessage(text, mentions.map((mention) => mention.data), selectedSearchScopes, disabledMcpServerIds, attachments);
 
         setIsLoading(true);
         const response = await createChat({ source: 'sourcebot-web-client' });
diff --git a/packages/web/src/features/chat/utils.ts b/packages/web/src/features/chat/utils.ts
index 108bd9cc2..3f6742c19 100644
--- a/packages/web/src/features/chat/utils.ts
+++ b/packages/web/src/features/chat/utils.ts
@@ -2,8 +2,9 @@ import { BrowseHighlightRange, getBrowsePath } from "@/app/(app)/browse/hooks/ut
 import { CreateUIMessage, isToolUIPart, TextUIPart, UIMessagePart } from "ai";
 import type { ChatStatus, DynamicToolUIPart, ToolUIPart } from "ai";
 import { Descendant, Editor, Point, Range, Transforms } from "slate";
-import { ANSWER_TAG, FILE_REFERENCE_PREFIX, FILE_REFERENCE_REGEX } from "./constants";
+import { ANSWER_TAG, ATTACHMENT_MAX_FILENAME_LENGTH, FILE_REFERENCE_PREFIX, FILE_REFERENCE_REGEX } from "./constants";
 import {
+    AttachmentData,
     CustomEditor,
     CustomText,
     FileReference,
@@ -187,7 +188,7 @@ export const addLineNumbers = (source: string, lineOffset = 1) => {
     return source.split('\n').map((line, index) => `${index + lineOffset}: ${line}`).join('\n');
 }
 
-export const createUIMessage = (text: string, mentions: MentionData[], selectedSearchScopes: SearchScope[], disabledMcpServerIds: string[] = []): CreateUIMessage<SBChatMessage> => {
+export const createUIMessage = (text: string, mentions: MentionData[], selectedSearchScopes: SearchScope[], disabledMcpServerIds: string[] = [], attachments: AttachmentData[] = []): CreateUIMessage<SBChatMessage> => {
     // Converts applicable mentions into sources.
     const sources: Source[] = mentions
         .map((mention) => {
@@ -217,6 +218,10 @@ export const createUIMessage = (text: string, mentions: MentionData[], selectedS
                 type: 'data-source',
                 data,
             })) as UIMessagePart<{ source: Source }, SBChatMessageToolTypes>[],
+            ...attachments.map((data) => ({
+                type: 'data-attachment',
+                data,
+            })) as UIMessagePart<{ attachment: AttachmentData }, SBChatMessageToolTypes>[],
         ],
         metadata: {
             selectedSearchScopes,
@@ -405,6 +410,40 @@ export const getUserMessageText = (message: Pick<SBChatMessage, 'parts'>): strin
     return message.parts.find((part) => part.type === 'text')?.text ?? '';
 }
 
+// Collects the `data` of every `data-attachment` part on a message, in part order (no filtering by kind).
+export const getUserMessageAttachments = (message: Pick<SBChatMessage, 'parts'>): AttachmentData[] => {
+    return message.parts
+        .filter((part) => part.type === 'data-attachment')
+        .map((part) => part.data);
+}
+
+// Formats a user message's text attachments into a delimited `<attachments>`
+// block for inlining into that turn's content; returns '' when there are none.
+// `maxBytesPerAttachment` caps each attachment's text defensively. NOTE: the
+// cap is applied with String.prototype.slice, i.e. in UTF-16 code units.
+export const formatAttachmentsForPrompt = (attachments: AttachmentData[], maxBytesPerAttachment?: number): string => {
+    const textAttachments = attachments.filter((attachment) => attachment.kind === 'text');
+    if (textAttachments.length === 0) {
+        return '';
+    }
+
+    // Attribute values come from the client: keep them on one line and escape
+    // `&` first (so later entities stay intact), then `"`, `<` and `>`.
+    const escapeAttribute = (value: string) => value.replace(/\s+/g, ' ')
+        .replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
+    const blocks = textAttachments.map((attachment) => {
+        const text = maxBytesPerAttachment !== undefined
+            ? attachment.text.slice(0, maxBytesPerAttachment)
+            : attachment.text;
+        // Cap the escaped filename, then drop any entity the cap cut in half.
+        const filename = escapeAttribute(attachment.filename)
+            .slice(0, ATTACHMENT_MAX_FILENAME_LENGTH).replace(/&[a-z]*$/, '');
+        return `<attachment filename="${filename}" media-type="${escapeAttribute(String(attachment.mediaType))}">\n${text}\n</attachment>`;
+    });
+
+    return `<attachments>\n${blocks.join('\n')}\n</attachments>`;
+}
+
 // Attempts to find the part of the assistant's message
 // that contains the answer.
 export const getAnswerPartFromAssistantMessage = (message: SBChatMessage, isTurnInProgress: boolean): TextUIPart | undefined => {
diff --git a/yarn.lock b/yarn.lock
index be48b2e71..c24dd251d 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -9466,6 +9466,7 @@ __metadata:
     react-day-picker: "npm:^9.14.0"
     react-device-detect: "npm:^2.2.3"
     react-dom: "npm:19.2.4"
+    react-dropzone: "npm:^15.0.0"
     react-email: "npm:^6.1.4"
     react-grab: "npm:^0.1.23"
     react-hook-form: "npm:^7.53.0"
@@ -11346,6 +11347,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"attr-accept@npm:^2.2.4":
+  version: 2.2.5
+  resolution: "attr-accept@npm:2.2.5"
+  checksum: 10c0/9b4cb82213925cab2d568f71b3f1c7a7778f9192829aac39a281e5418cd00c04a88f873eb89f187e0bf786fa34f8d52936f178e62cbefb9254d57ecd88ada99b
+  languageName: node
+  linkType: hard
+
 "available-typed-arrays@npm:^1.0.7":
   version: 1.0.7
   resolution: "available-typed-arrays@npm:1.0.7"
@@ -14436,6 +14444,15 @@ __metadata:
   languageName: node
   linkType: hard
 
+"file-selector@npm:^2.1.0":
+  version: 2.1.2
+  resolution: "file-selector@npm:2.1.2"
+  dependencies:
+    tslib: "npm:^2.7.0"
+  checksum: 10c0/fe827e0e95410aacfcc3eabc38c29cc36055257f03c1c06b631a2b5af9730c142ad2c52f5d64724d02231709617bda984701f52bd1f4b7aca50fb6585a27c1d2
+  languageName: node
+  linkType: hard
+
 "fill-range@npm:^7.1.1":
   version: 7.1.1
   resolution: "fill-range@npm:7.1.1"
@@ -19632,6 +19649,19 @@ __metadata:
   languageName: node
   linkType: hard
 
+"react-dropzone@npm:^15.0.0":
+  version: 15.0.0
+  resolution: "react-dropzone@npm:15.0.0"
+  dependencies:
+    attr-accept: "npm:^2.2.4"
+    file-selector: "npm:^2.1.0"
+    prop-types: "npm:^15.8.1"
+  peerDependencies:
+    react: ">= 16.8 || 18.0.0"
+  checksum: 10c0/fb7b48a709fdd26273707f7aca5c0e77fce2b9c9201122645d3ecfb07ecfbb89e2495273ea141994f0ed0838ee79f27832c0855b2c598b377b342c3965608b54
+  languageName: node
+  linkType: hard
+
 "react-email@npm:^6.1.4":
   version: 6.1.4
   resolution: "react-email@npm:6.1.4"
@@ -22330,7 +22360,7 @@ __metadata:
   languageName: node
   linkType: hard
 
-"tslib@npm:^2.0.0, tslib@npm:^2.1.0, tslib@npm:^2.4.0, tslib@npm:^2.6.2, tslib@npm:^2.8.0, tslib@npm:^2.8.1":
+"tslib@npm:^2.0.0, tslib@npm:^2.1.0, tslib@npm:^2.4.0, tslib@npm:^2.6.2, tslib@npm:^2.7.0, tslib@npm:^2.8.0, tslib@npm:^2.8.1":
   version: 2.8.1
   resolution: "tslib@npm:2.8.1"
   checksum: 10c0/9c4759110a19c53f992d9aae23aac5ced636e99887b51b9e61def52611732872ff7668757d4e4c61f19691e36f4da981cd9485e869b4a7408d689f6bf1f14e62

From 5d69749df4ac19c9e449457186f390e5008f55af Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 16:33:22 -0700
Subject: [PATCH 08/19] fix(web): close attachment viewer dialog on Escape and
 add missing DialogDescription

---
 .../chatBox/attachmentViewerDialog.tsx        | 27 ++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/packages/web/src/features/chat/components/chatBox/attachmentViewerDialog.tsx b/packages/web/src/features/chat/components/chatBox/attachmentViewerDialog.tsx
index 588a142df..46dc236dc 100644
--- a/packages/web/src/features/chat/components/chatBox/attachmentViewerDialog.tsx
+++ b/packages/web/src/features/chat/components/chatBox/attachmentViewerDialog.tsx
@@ -1,6 +1,7 @@
 'use client';
 
-import { Dialog, DialogContent, DialogHeader, DialogTitle } from "@/components/ui/dialog";
+import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "@/components/ui/dialog";
+import { useEffect } from "react";
 
 interface AttachmentViewerDialogProps {
     filename?: string;
@@ -12,6 +13,27 @@ interface AttachmentViewerDialogProps {
 // Shared viewer for inspecting an inline-text attachment's contents. Used for
 // both staged (not-yet-sent) and sent attachments.
 export const AttachmentViewerDialog = ({ filename, text, open, onOpenChange }: AttachmentViewerDialogProps) => {
+    // The staged viewer is rendered inside the Slate `Editable` subtree, where
+    // Radix's built-in Escape-to-close can get swallowed by the editor's
+    // focus/key handling. A capture-phase listener guarantees Escape closes the
+    // dialog, matching every other modal in the app.
+    useEffect(() => {
+        if (!open) {
+            return;
+        }
+
+        const handleKeyDown = (event: KeyboardEvent) => {
+            if (event.key === 'Escape') {
+                onOpenChange(false);
+            }
+        };
+
+        document.addEventListener('keydown', handleKeyDown, true);
+        return () => {
+            document.removeEventListener('keydown', handleKeyDown, true);
+        };
+    }, [open, onOpenChange]);
+
     return (
         <Dialog open={open} onOpenChange={onOpenChange}>
             <DialogContent className="max-w-3xl">
@@ -19,6 +41,9 @@ export const AttachmentViewerDialog = ({ filename, text, open, onOpenChange }: A
                     <DialogTitle className="font-mono text-sm break-all">
                         {filename}
                     </DialogTitle>
+                    <DialogDescription className="sr-only">
+                        Preview of the attached file{filename ? ` ${filename}` : ''}.
+                    </DialogDescription>
                 </DialogHeader>
                 <pre className="max-h-[60vh] overflow-auto rounded bg-muted p-3 text-xs whitespace-pre-wrap break-words">
                     {text}

From de291cc8ab301b28067812186f534bcba5b6f69e Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 11:30:15 -0700
Subject: [PATCH 09/19] refactor(web): resolve model capabilities from
 models.dev, not config.json

Re-source language model input-modality / document capabilities from the
models.dev catalog instead of hand-declared config.json fields, aligning
with the move to de-emphasize on-disk config in favor of automatic
resolution (the same catalog already backs context-window resolution).

- Revert the inputModalities/supportedDocumentTypes additions to
  schemas/v3/languageModel.json and all regenerated artifacts; capabilities
  are no longer declared in config.json.
- Extract the shared models.dev catalog plumbing (fetch/TTL/negative-cache/
  stale-while-revalidate/provider-id overrides) into modelsDevCatalog.server.ts,
  now consumed by both context-window and capability resolution.
- Add models.dev-backed resolveModelCapabilities (modelCapabilities.server.ts),
  partitioning the catalog's modalities.input list into Sourcebot's
  inputModalities (channels) and supportedDocumentTypes (containers); falls back
  to text-only for uncatalogued / self-hosted models.

The client-safe LanguageModelInfo contract is unchanged; only the resolution
backend moved.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 CHANGELOG.md                                  |   2 +-
 docs/snippets/schemas/v3/index.schema.mdx     | 504 ------------------
 .../schemas/v3/languageModel.schema.mdx       | 504 ------------------
 packages/schemas/src/v3/index.schema.ts       | 504 ------------------
 packages/schemas/src/v3/index.type.ts         |  96 ----
 .../schemas/src/v3/languageModel.schema.ts    | 504 ------------------
 packages/schemas/src/v3/languageModel.type.ts |  96 ----
 .../web/src/ee/features/mcp/askCodebase.ts    |   7 +-
 .../chat/modelCapabilities.server.test.ts     | 126 +++++
 .../features/chat/modelCapabilities.server.ts |  64 +++
 .../src/features/chat/modelCapabilities.ts    |  24 -
 .../chat/modelContextWindow.server.ts         |  99 +---
 .../features/chat/modelsDevCatalog.server.ts  | 111 ++++
 .../web/src/features/chat/utils.server.ts     |  17 +-
 schemas/v3/languageModel.json                 | 254 +--------
 15 files changed, 322 insertions(+), 2590 deletions(-)
 create mode 100644 packages/web/src/features/chat/modelCapabilities.server.test.ts
 create mode 100644 packages/web/src/features/chat/modelCapabilities.server.ts
 delete mode 100644 packages/web/src/features/chat/modelCapabilities.ts
 create mode 100644 packages/web/src/features/chat/modelsDevCatalog.server.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27408c782..68e63d675 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added per-step token cost tracking and estimated tool call token usage to Ask Sourcebot chat history. [#1353](https://github.com/sourcebot-dev/sourcebot/pull/1353)
 - [EE] Added mermaid diagram rendering to Ask Sourcebot answers, with pan/zoom, copy/export, in-thread deep links, and an interleaved right-panel view. [#1369](https://github.com/sourcebot-dev/sourcebot/pull/1369)
 - [EE] Added a context-window usage gauge to the Ask Sourcebot chat details, showing how much of the selected model's context window each turn occupies. Window sizes are resolved from the models.dev catalog. [#1370](https://github.com/sourcebot-dev/sourcebot/pull/1370)
-- Added optional `inputModalities` and `supportedDocumentTypes` configuration for language models, exposing model input-modality and document capabilities (defaults to text-only, no documents). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
+- Added language model input-modality and document capability resolution, automatically resolved from the models.dev catalog (falls back to text-only for uncatalogued/self-hosted models). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
 
 ### Fixed
 - Send anonymous server-side PostHog events as personless so unauthenticated requests don't inflate person counts. [#1367](https://github.com/sourcebot-dev/sourcebot/pull/1367)
diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx
index 356da2009..864359251 100644
--- a/docs/snippets/schemas/v3/index.schema.mdx
+++ b/docs/snippets/schemas/v3/index.schema.mdx
@@ -1860,27 +1860,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2019,27 +1998,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2175,27 +2133,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2303,27 +2240,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2445,27 +2361,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2589,27 +2484,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2749,27 +2623,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2877,27 +2730,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3031,27 +2863,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3216,27 +3027,6 @@
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3345,27 +3135,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3477,27 +3246,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3678,27 +3426,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3837,27 +3564,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3993,27 +3699,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4121,27 +3806,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4263,27 +3927,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4407,27 +4050,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4567,27 +4189,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4695,27 +4296,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4849,27 +4429,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5034,27 +4593,6 @@
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5163,27 +4701,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5295,27 +4812,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
diff --git a/docs/snippets/schemas/v3/languageModel.schema.mdx b/docs/snippets/schemas/v3/languageModel.schema.mdx
index 5af4b3d96..90aee08af 100644
--- a/docs/snippets/schemas/v3/languageModel.schema.mdx
+++ b/docs/snippets/schemas/v3/languageModel.schema.mdx
@@ -174,27 +174,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -333,27 +312,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -489,27 +447,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -617,27 +554,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -759,27 +675,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -903,27 +798,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1063,27 +937,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1191,27 +1044,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1345,27 +1177,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1530,27 +1341,6 @@
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1659,27 +1449,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1791,27 +1560,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1992,27 +1740,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2151,27 +1878,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2307,27 +2013,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2435,27 +2120,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2577,27 +2241,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2721,27 +2364,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2881,27 +2503,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3009,27 +2610,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3163,27 +2743,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3348,27 +2907,6 @@
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3477,27 +3015,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3609,27 +3126,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts
index 123fd4a8b..8c1d64b52 100644
--- a/packages/schemas/src/v3/index.schema.ts
+++ b/packages/schemas/src/v3/index.schema.ts
@@ -1859,27 +1859,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2018,27 +1997,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2174,27 +2132,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2302,27 +2239,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2444,27 +2360,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2588,27 +2483,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2748,27 +2622,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2876,27 +2729,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3030,27 +2862,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3215,27 +3026,6 @@ const schema = {
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3344,27 +3134,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3476,27 +3245,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3677,27 +3425,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3836,27 +3563,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3992,27 +3698,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4120,27 +3805,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4262,27 +3926,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4406,27 +4049,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4566,27 +4188,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4694,27 +4295,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4848,27 +4428,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5033,27 +4592,6 @@ const schema = {
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5162,27 +4700,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5294,27 +4811,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts
index d6f555e8d..7fa7f5a17 100644
--- a/packages/schemas/src/v3/index.type.ts
+++ b/packages/schemas/src/v3/index.type.ts
@@ -762,14 +762,6 @@ export interface AmazonBedrockLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional headers to use with the model.
@@ -850,14 +842,6 @@ export interface AnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface AzureLanguageModel {
   /**
@@ -913,14 +897,6 @@ export interface AzureLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -960,14 +936,6 @@ export interface DeepSeekLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -1015,14 +983,6 @@ export interface GoogleGenerativeAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -1070,14 +1030,6 @@ export interface GoogleVertexAnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -1133,14 +1085,6 @@ export interface GoogleVertexLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface MistralLanguageModel {
   /**
@@ -1180,14 +1124,6 @@ export interface MistralLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -1235,14 +1171,6 @@ export interface OpenAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -1287,14 +1215,6 @@ export interface OpenAICompatibleLanguageModel {
    * Optional temperature setting to use with the model.
    */
   temperature?: number;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -1359,14 +1279,6 @@ export interface OpenRouterLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface XaiLanguageModel {
   /**
@@ -1406,14 +1318,6 @@ export interface XaiLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GitHubAppConfig {
   /**
diff --git a/packages/schemas/src/v3/languageModel.schema.ts b/packages/schemas/src/v3/languageModel.schema.ts
index 61cc0adf3..ab418ce79 100644
--- a/packages/schemas/src/v3/languageModel.schema.ts
+++ b/packages/schemas/src/v3/languageModel.schema.ts
@@ -173,27 +173,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -332,27 +311,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -488,27 +446,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -616,27 +553,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -758,27 +674,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -902,27 +797,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1062,27 +936,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1190,27 +1043,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1344,27 +1176,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1529,27 +1340,6 @@ const schema = {
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1658,27 +1448,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1790,27 +1559,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1991,27 +1739,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2150,27 +1877,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2306,27 +2012,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2434,27 +2119,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2576,27 +2240,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2720,27 +2363,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2880,27 +2502,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3008,27 +2609,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3162,27 +2742,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3347,27 +2906,6 @@ const schema = {
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3476,27 +3014,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3608,27 +3125,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/languageModel.type.ts b/packages/schemas/src/v3/languageModel.type.ts
index 90a53b423..5c3b25668 100644
--- a/packages/schemas/src/v3/languageModel.type.ts
+++ b/packages/schemas/src/v3/languageModel.type.ts
@@ -88,14 +88,6 @@ export interface AmazonBedrockLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional headers to use with the model.
@@ -176,14 +168,6 @@ export interface AnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface AzureLanguageModel {
   /**
@@ -239,14 +223,6 @@ export interface AzureLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -286,14 +262,6 @@ export interface DeepSeekLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -341,14 +309,6 @@ export interface GoogleGenerativeAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -396,14 +356,6 @@ export interface GoogleVertexAnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -459,14 +411,6 @@ export interface GoogleVertexLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface MistralLanguageModel {
   /**
@@ -506,14 +450,6 @@ export interface MistralLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -561,14 +497,6 @@ export interface OpenAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -613,14 +541,6 @@ export interface OpenAICompatibleLanguageModel {
    * Optional temperature setting to use with the model.
    */
   temperature?: number;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -685,14 +605,6 @@ export interface OpenRouterLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface XaiLanguageModel {
   /**
@@ -732,12 +644,4 @@ export interface XaiLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
diff --git a/packages/web/src/ee/features/mcp/askCodebase.ts b/packages/web/src/ee/features/mcp/askCodebase.ts
index 59f8a35ec..35337d29f 100644
--- a/packages/web/src/ee/features/mcp/askCodebase.ts
+++ b/packages/web/src/ee/features/mcp/askCodebase.ts
@@ -5,7 +5,7 @@ import { getAISDKLanguageModelAndOptions } from "@/features/chat/llm.server";
 import { resolveContextWindow } from "@/features/chat/modelContextWindow.server";
 import { LanguageModelInfo, SBChatMessage, SearchScope } from "@/features/chat/types";
 import { convertLLMOutputToPortableMarkdown, getAnswerPartFromAssistantMessage, getLanguageModelKey } from "@/features/chat/utils";
-import { resolveModelInputModalities, resolveModelSupportedDocumentTypes } from "@/features/chat/modelCapabilities";
+import { resolveModelCapabilities } from "@/features/chat/modelCapabilities.server";
 import { ErrorCode } from "@/lib/errorCodes";
 import { ServiceError, ServiceErrorException } from "@/lib/serviceError";
 import { withOptionalAuth } from "@/middleware/withAuth";
@@ -87,6 +87,7 @@ export const askCodebase = (params: AskCodebaseParams): Promise<AskCodebaseResul
             const { model, providerOptions, temperature } = await getAISDKLanguageModelAndOptions(languageModelConfig);
             const modelName = languageModelConfig.displayName ?? languageModelConfig.model;
             const contextWindow = await resolveContextWindow(languageModelConfig);
+            const { inputModalities, supportedDocumentTypes } = await resolveModelCapabilities(languageModelConfig);
 
             // No-op for non-Anthropic providers / when caching is disabled.
             const promptCacheStrategy = getPromptCacheStrategy(
@@ -247,8 +248,8 @@ export const askCodebase = (params: AskCodebaseParams): Promise<AskCodebaseResul
                     provider: languageModelConfig.provider,
                     model: languageModelConfig.model,
                     displayName: languageModelConfig.displayName,
-                    inputModalities: resolveModelInputModalities(languageModelConfig),
-                    supportedDocumentTypes: resolveModelSupportedDocumentTypes(languageModelConfig),
+                    inputModalities,
+                    supportedDocumentTypes,
                 },
             } satisfies AskCodebaseResult;
         })
diff --git a/packages/web/src/features/chat/modelCapabilities.server.test.ts b/packages/web/src/features/chat/modelCapabilities.server.test.ts
new file mode 100644
index 000000000..0a2e9a2ec
--- /dev/null
+++ b/packages/web/src/features/chat/modelCapabilities.server.test.ts
@@ -0,0 +1,126 @@
+import { afterEach, describe, expect, test, vi } from 'vitest';
+import type { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
+
+vi.mock('server-only', () => ({ default: vi.fn() }));
+
+vi.mock('@sourcebot/shared', () => ({
+    createLogger: () => ({
+        info: vi.fn(),
+        warn: vi.fn(),
+        error: vi.fn(),
+        debug: vi.fn(),
+    }),
+}));
+
+import { lookupModelCapabilities, resolveModelCapabilities } from './modelCapabilities.server';
+import type { ModelsDevCatalog } from './modelsDevCatalog.server';
+
+const catalog: ModelsDevCatalog = {
+    anthropic: {
+        id: 'anthropic',
+        models: {
+            // Text + image + a document (pdf) container format.
+            'claude-sonnet-4-5': {
+                id: 'claude-sonnet-4-5',
+                modalities: { input: ['text', 'image', 'pdf'], output: ['text'] },
+            },
+        },
+    },
+    // models.dev keys Gemini under 'google', whereas Sourcebot's provider id is
+    // 'google-generative-ai' — exercises the provider id override.
+    google: {
+        id: 'google',
+        models: {
+            'gemini-2.5-pro': {
+                id: 'gemini-2.5-pro',
+                modalities: { input: ['text', 'image', 'audio', 'video', 'pdf'], output: ['text'] },
+            },
+        },
+    },
+    openai: {
+        id: 'openai',
+        models: {
+            // Catalogued model that omits `text` from its input list.
+            'image-only': { id: 'image-only', modalities: { input: ['image'], output: ['text'] } },
+            // Catalogued model with no `modalities` object at all.
+            'no-modalities-model': { id: 'no-modalities-model' },
+        },
+    },
+};
+
+const model = (provider: string, modelId: string) =>
+    ({ provider, model: modelId }) as Pick<LanguageModel, 'provider' | 'model'>;
+
+describe('lookupModelCapabilities', () => {
+    test('splits modalities and document types for a direct provider/model hit', () => {
+        expect(lookupModelCapabilities(catalog, model('anthropic', 'claude-sonnet-4-5'))).toEqual({
+            inputModalities: ['text', 'image'],
+            supportedDocumentTypes: ['pdf'],
+        });
+    });
+
+    test('maps google-generative-ai to the catalog\'s google key', () => {
+        expect(lookupModelCapabilities(catalog, model('google-generative-ai', 'gemini-2.5-pro'))).toEqual({
+            inputModalities: ['text', 'image', 'audio', 'video'],
+            supportedDocumentTypes: ['pdf'],
+        });
+    });
+
+    test('always includes text even when the catalog omits it', () => {
+        expect(lookupModelCapabilities(catalog, model('openai', 'image-only'))).toEqual({
+            inputModalities: ['text', 'image'],
+            supportedDocumentTypes: [],
+        });
+    });
+
+    test('falls back to text-only for a catalogued model with no modalities', () => {
+        expect(lookupModelCapabilities(catalog, model('openai', 'no-modalities-model'))).toEqual({
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
+        });
+    });
+
+    test('falls back to text-only for an uncatalogued model (e.g. openai-compatible / self-hosted)', () => {
+        expect(lookupModelCapabilities(catalog, model('openai-compatible', 'my-local-model'))).toEqual({
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
+        });
+        expect(lookupModelCapabilities(catalog, model('anthropic', 'claude-unknown'))).toEqual({
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
+        });
+    });
+
+    test('falls back to text-only when the catalog is null (fetch failed / unreachable)', () => {
+        expect(lookupModelCapabilities(null, model('anthropic', 'claude-sonnet-4-5'))).toEqual({
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
+        });
+    });
+});
+
+describe('resolveModelCapabilities', () => {
+    afterEach(() => {
+        vi.unstubAllGlobals();
+    });
+
+    test('fetches the catalog once and resolves capabilities (incl. provider mapping)', async () => {
+        const fetchMock = vi.fn(async () => ({
+            ok: true,
+            json: async () => catalog,
+        }) as unknown as Response);
+        vi.stubGlobal('fetch', fetchMock);
+
+        expect(await resolveModelCapabilities(model('anthropic', 'claude-sonnet-4-5'))).toEqual({
+            inputModalities: ['text', 'image'],
+            supportedDocumentTypes: ['pdf'],
+        });
+        // Subsequent lookups reuse the cached catalog rather than refetching.
+        expect(await resolveModelCapabilities(model('google-generative-ai', 'gemini-2.5-pro'))).toEqual({
+            inputModalities: ['text', 'image', 'audio', 'video'],
+            supportedDocumentTypes: ['pdf'],
+        });
+
+        expect(fetchMock).toHaveBeenCalledTimes(1); // NOTE(review): presumes the module-level catalog cache is still empty when this test starts — confirm if other fetching tests are added to this file.
+    });
+});
diff --git a/packages/web/src/features/chat/modelCapabilities.server.ts b/packages/web/src/features/chat/modelCapabilities.server.ts
new file mode 100644
index 000000000..87d2cb131
--- /dev/null
+++ b/packages/web/src/features/chat/modelCapabilities.server.ts
@@ -0,0 +1,64 @@
+import 'server-only';
+
+import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
+import { DocumentType, InputModality } from './types';
+import { loadCatalog, resolveProviderId, type ModelsDevCatalog } from './modelsDevCatalog.server';
+
+// models.dev folds every accepted input — perceptual channels (text, image,
+// audio, video) AND container formats (pdf) — into a single `modalities.input`
+// list. Sourcebot keeps those two concepts apart: `inputModalities` are the
+// raw channels a model encodes, while `supportedDocumentTypes` are rich
+// compound formats providers decompose server-side. We partition the catalog's
+// input list into those two buckets here.
+const INPUT_MODALITY_VALUES = ['text', 'image', 'audio', 'video'] as const satisfies readonly InputModality[];
+const DOCUMENT_TYPE_VALUES = ['pdf'] as const satisfies readonly DocumentType[];
+
+const isInputModality = (value: string): value is InputModality =>
+    (INPUT_MODALITY_VALUES as readonly string[]).includes(value);
+
+const isDocumentType = (value: string): value is DocumentType =>
+    (DOCUMENT_TYPE_VALUES as readonly string[]).includes(value);
+
+export type ModelCapabilities = {
+    inputModalities: InputModality[];
+    supportedDocumentTypes: DocumentType[];
+};
+
+/**
+ * Pure lookup of a model's input capabilities in a models.dev catalog.
+ * Separated from the network fetch so it can be unit-tested directly.
+ *
+ * Capabilities come from the catalog, not from config.json. Falls back to
+ * text-only with no document support (fail-closed) when the catalog is null,
+ * the model isn't catalogued (e.g. a self-hosted / openai-compatible
+ * endpoint), or its `modalities.input` is missing, empty, or not an array —
+ * the catalog JSON is unvalidated, so a malformed entry must not throw here.
+ */
+export const lookupModelCapabilities = (
+    catalog: ModelsDevCatalog | null,
+    config: Pick<LanguageModel, 'provider' | 'model'>,
+): ModelCapabilities => {
+    const providerId = resolveProviderId(config.provider);
+    const inputs = catalog?.[providerId]?.models?.[config.model]?.modalities?.input;
+
+    if (!Array.isArray(inputs) || inputs.length === 0) { // absent or malformed -> text-only
+        return { inputModalities: ['text'], supportedDocumentTypes: [] };
+    }
+
+    const inputModalities = inputs.filter(isInputModality);
+    const supportedDocumentTypes = inputs.filter(isDocumentType);
+
+    // Every model accepts text, even if the catalog omits it from the list.
+    if (!inputModalities.includes('text')) {
+        inputModalities.unshift('text');
+    }
+
+    return { inputModalities, supportedDocumentTypes };
+};
+
+export const resolveModelCapabilities = async (
+    config: Pick<LanguageModel, 'provider' | 'model'>,
+): Promise<ModelCapabilities> => {
+    const catalog = await loadCatalog(); // typed as possibly null (no catalog available)
+    return lookupModelCapabilities(catalog, config); // a null catalog resolves to text-only, no documents
+};
diff --git a/packages/web/src/features/chat/modelCapabilities.ts b/packages/web/src/features/chat/modelCapabilities.ts
deleted file mode 100644
index 8b976af59..000000000
--- a/packages/web/src/features/chat/modelCapabilities.ts
+++ /dev/null
@@ -1,24 +0,0 @@
-import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
-import { DocumentType, InputModality } from './types';
-
-// Fail-closed: when a model does not declare input modalities, assume text-only.
-// NOTE: future work may add live provider capability probing (see
-// tryResolveAnthropicThinkingConfig in llm.server.ts for the precedent).
-export const resolveModelInputModalities = (config: LanguageModel): InputModality[] => {
-    const declared = config.inputModalities;
-    if (declared && declared.length > 0) {
-        return declared;
-    }
-    return ['text'];
-}
-
-// Fail-closed: when a model does not declare supported document types, assume none.
-// Document types (e.g. PDF) are container formats distinct from raw input
-// modalities, since providers decompose them into text/image internally.
-export const resolveModelSupportedDocumentTypes = (config: LanguageModel): DocumentType[] => {
-    const declared = config.supportedDocumentTypes;
-    if (declared && declared.length > 0) {
-        return declared;
-    }
-    return [];
-}
diff --git a/packages/web/src/features/chat/modelContextWindow.server.ts b/packages/web/src/features/chat/modelContextWindow.server.ts
index 0e70dc04f..f87bbcf3b 100644
--- a/packages/web/src/features/chat/modelContextWindow.server.ts
+++ b/packages/web/src/features/chat/modelContextWindow.server.ts
@@ -1,100 +1,11 @@
 import 'server-only';
 
 import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
-import { createLogger } from '@sourcebot/shared';
+import { loadCatalog, resolveProviderId, type ModelsDevCatalog } from './modelsDevCatalog.server';
 
-const logger = createLogger('model-context-window');
-
-// The same public, unauthenticated catalog the setup wizard already consumes
-// (see packages/setupWizard/src/models.ts). Each model entry exposes a
-// `limit.context` field holding the total context window in tokens.
-const MODELS_DEV_API_URL = 'https://models.dev/api.json';
-const FETCH_TIMEOUT_MS = 8000;
-// Re-fetch the (~2.4 MB) catalog at most once per this interval per server
-// process. New models trickle in daily; a stale window for a few hours is fine.
-const CATALOG_TTL_MS = 6 * 60 * 60 * 1000;
-// After a failed fetch, don't reattempt for this long. Without it, an outage in
-// models.dev would make every chat send pay the fetch timeout on the request path.
-const NEGATIVE_CACHE_MS = 60 * 1000;
-
-// Sourcebot provider id -> models.dev top-level catalog key. Only providers
-// whose Sourcebot id differs from the models.dev id need an entry; everything
-// else (anthropic, openai, azure, amazon-bedrock, mistral, deepseek, xai,
-// openrouter, google-vertex, google-vertex-anthropic) matches 1:1.
-const PROVIDER_ID_OVERRIDES: Record<string, string> = {
-    'google-generative-ai': 'google',
-};
-
-type ModelsDevModel = {
-    id: string;
-    limit?: {
-        context?: number;
-        output?: number;
-    };
-};
-
-type ModelsDevProvider = {
-    id: string;
-    models?: Record<string, ModelsDevModel>;
-};
-
-export type ModelsDevCatalog = Record<string, ModelsDevProvider>;
-
-// Last successfully-fetched catalog. Served while fresh, and kept as a fallback
-// when a later refresh fails. `catalogFetchedAt` is when it was fetched (TTL),
-// `lastFailedAt` the most recent fetch failure (negative-cache backoff), and
-// `inFlightFetch` dedupes concurrent fetches.
-let cachedCatalog: ModelsDevCatalog | null = null;
-let catalogFetchedAt = 0;
-let lastFailedAt = 0;
-let inFlightFetch: Promise<ModelsDevCatalog | null> | null = null;
-
-const fetchCatalog = async (): Promise<ModelsDevCatalog | null> => {
-    try {
-        const response = await fetch(MODELS_DEV_API_URL, {
-            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
-        });
-        if (!response.ok) {
-            logger.warn(`Failed to fetch models.dev catalog: ${response.status} ${response.statusText}`);
-            return null;
-        }
-        return await response.json() as ModelsDevCatalog;
-    } catch (error) {
-        logger.warn(`Failed to fetch models.dev catalog: ${error}`);
-        return null;
-    }
-};
-
-const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
-    const now = Date.now();
-    const isFresh = cachedCatalog !== null && now - catalogFetchedAt <= CATALOG_TTL_MS;
-    const isBackingOff = now - lastFailedAt < NEGATIVE_CACHE_MS;
-
-    // Kick off a (deduped) refresh when the cache is stale/empty and we're not
-    // within the post-failure backoff window. On success it replaces the cache;
-    // on failure it only records the failure time, leaving the last-known-good
-    // catalog intact.
-    if (!isFresh && !isBackingOff && !inFlightFetch) {
-        inFlightFetch = fetchCatalog().then((catalog) => {
-            if (catalog) {
-                cachedCatalog = catalog;
-                catalogFetchedAt = Date.now();
-            } else {
-                lastFailedAt = Date.now();
-            }
-            inFlightFetch = null;
-            return catalog;
-        });
-    }
-
-    // Once a catalog has loaded once, never block the request path on the
-    // network: serve the last-known-good value (even if stale) and let any
-    // refresh settle in the background. Only the very first load awaits.
-    if (cachedCatalog !== null) {
-        return cachedCatalog;
-    }
-    return inFlightFetch ?? null;
-};
+// Re-exported so existing consumers/tests can keep importing the catalog type
+// from here.
+export type { ModelsDevCatalog } from './modelsDevCatalog.server';
 
 /**
  * Pure lookup of a model's context window in a models.dev catalog. Separated
@@ -110,7 +21,7 @@ export const lookupContextWindow = (
     if (!catalog) {
         return undefined;
     }
-    const providerId = PROVIDER_ID_OVERRIDES[config.provider] ?? config.provider;
+    const providerId = resolveProviderId(config.provider);
     const context = catalog[providerId]?.models?.[config.model]?.limit?.context;
     // `limit` is schema-optional, and models.dev reports a 0 context window for
     // non-text models (image/audio/etc.). Treat both as "unknown" so the UI
diff --git a/packages/web/src/features/chat/modelsDevCatalog.server.ts b/packages/web/src/features/chat/modelsDevCatalog.server.ts
new file mode 100644
index 000000000..8f6b35043
--- /dev/null
+++ b/packages/web/src/features/chat/modelsDevCatalog.server.ts
@@ -0,0 +1,111 @@
+import 'server-only';
+
+import { createLogger } from '@sourcebot/shared';
+
+const logger = createLogger('models-dev-catalog');
+
+// The same public, unauthenticated catalog the setup wizard already consumes
+// (see packages/setupWizard/src/models.ts). Each model entry exposes a
+// `limit.context` field (total context window in tokens) and a `modalities`
+// field describing the inputs/outputs the model supports natively.
+const MODELS_DEV_API_URL = 'https://models.dev/api.json';
+const FETCH_TIMEOUT_MS = 8000;
+// Re-fetch the (~2.4 MB) catalog at most once per this interval per server
+// process. New models trickle in daily; data that is a few hours stale is fine.
+const CATALOG_TTL_MS = 6 * 60 * 60 * 1000;
+// After a failed fetch, don't reattempt for this long. Without it, an outage in
+// models.dev would make every chat send pay the fetch timeout on the request path.
+const NEGATIVE_CACHE_MS = 60 * 1000;
+
+// Sourcebot provider id -> models.dev top-level catalog key. Only providers
+// whose Sourcebot id differs from the models.dev id need an entry; everything
+// else (anthropic, openai, azure, amazon-bedrock, mistral, deepseek, xai,
+// openrouter, google-vertex, google-vertex-anthropic) matches 1:1.
+const PROVIDER_ID_OVERRIDES: Record<string, string> = {
+    'google-generative-ai': 'google',
+};
+
+export const resolveProviderId = (provider: string): string =>
+    PROVIDER_ID_OVERRIDES[provider] ?? provider;
+
+type ModelsDevModel = {
+    id: string;
+    limit?: {
+        context?: number;
+        output?: number;
+    };
+    modalities?: {
+        // e.g. ["text", "image", "pdf", "audio", "video"]
+        input?: string[];
+        output?: string[];
+    };
+};
+
+type ModelsDevProvider = {
+    id: string;
+    models?: Record<string, ModelsDevModel>;
+};
+
+export type ModelsDevCatalog = Record<string, ModelsDevProvider>;
+
+// Last successfully-fetched catalog. Served while fresh, and kept as a fallback
+// when a later refresh fails. `catalogFetchedAt` is when it was fetched (TTL),
+// `lastFailedAt` the most recent fetch failure (negative-cache backoff), and
+// `inFlightFetch` dedupes concurrent fetches.
+let cachedCatalog: ModelsDevCatalog | null = null;
+let catalogFetchedAt = 0;
+let lastFailedAt = 0;
+let inFlightFetch: Promise<ModelsDevCatalog | null> | null = null;
+
+const fetchCatalog = async (): Promise<ModelsDevCatalog | null> => { // Failures are logged and returned as null rather than thrown.
+    try {
+        const response = await fetch(MODELS_DEV_API_URL, {
+            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS), // abort the request once FETCH_TIMEOUT_MS elapses
+        });
+        if (!response.ok) {
+            logger.warn(`Failed to fetch models.dev catalog: ${response.status} ${response.statusText}`);
+            return null;
+        }
+        return await response.json() as ModelsDevCatalog; // unchecked cast: the payload shape is not validated here
+    } catch (error) { // reached for anything thrown above: fetch rejection (incl. the timeout abort) or a JSON parse failure
+        logger.warn(`Failed to fetch models.dev catalog: ${error}`);
+        return null;
+    }
+};
+
+/**
+ * Returns the cached models.dev catalog, refreshing it in the background when
+ * stale. Calls await the network only until a first fetch succeeds; after that
+ * the last-known-good catalog is served immediately (even if stale). Resolves
+ * to null when no catalog has loaded and the fetch failed or is backing off.
+ */
+export const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
+    const now = Date.now();
+    const isFresh = cachedCatalog !== null && now - catalogFetchedAt <= CATALOG_TTL_MS;
+    const isBackingOff = now - lastFailedAt < NEGATIVE_CACHE_MS;
+
+    // Kick off a (deduped) refresh when the cache is stale/empty and we're not
+    // within the post-failure backoff window. On success it replaces the cache;
+    // on failure it only records the failure time, leaving the last-known-good
+    // catalog intact.
+    if (!isFresh && !isBackingOff && !inFlightFetch) {
+        inFlightFetch = fetchCatalog().then((catalog) => {
+            if (catalog) {
+                cachedCatalog = catalog;
+                catalogFetchedAt = Date.now();
+            } else {
+                lastFailedAt = Date.now();
+            }
+            inFlightFetch = null;
+            return catalog;
+        });
+    }
+
+    // Once a catalog has loaded, never block the request path on the network:
+    // serve the last-known-good value (even if stale) while any refresh settles
+    // in the background. Before that, await the in-flight fetch (null if none).
+    if (cachedCatalog !== null) {
+        return cachedCatalog;
+    }
+    return inFlightFetch ?? null;
+};
diff --git a/packages/web/src/features/chat/utils.server.ts b/packages/web/src/features/chat/utils.server.ts
index 0b04226d8..a458c41d8 100644
--- a/packages/web/src/features/chat/utils.server.ts
+++ b/packages/web/src/features/chat/utils.server.ts
@@ -7,7 +7,7 @@ import { env, loadConfig } from '@sourcebot/shared';
 import fs from 'fs';
 import path from 'path';
 import { LanguageModelInfo, SBChatMessage } from './types';
-import { resolveModelInputModalities, resolveModelSupportedDocumentTypes } from './modelCapabilities';
+import { resolveModelCapabilities } from './modelCapabilities.server';
 import { hasEntitlement } from '@/lib/entitlements';
 import { ServiceError } from '@/lib/serviceError';
 import { ErrorCode } from '@/lib/errorCodes';
@@ -128,11 +128,14 @@ export const getConfiguredLanguageModels = async (): Promise<LanguageModel[]> =>
  */
 export const getConfiguredLanguageModelsInfo = async () => {
     const models = await getConfiguredLanguageModels();
-    return models.map((model): LanguageModelInfo => ({
-        provider: model.provider,
-        model: model.model,
-        displayName: model.displayName,
-        inputModalities: resolveModelInputModalities(model),
-        supportedDocumentTypes: resolveModelSupportedDocumentTypes(model),
+    return Promise.all(models.map(async (model): Promise<LanguageModelInfo> => {
+        const { inputModalities, supportedDocumentTypes } = await resolveModelCapabilities(model);
+        return {
+            provider: model.provider,
+            model: model.model,
+            displayName: model.displayName,
+            inputModalities,
+            supportedDocumentTypes,
+        };
     }));
 };
diff --git a/schemas/v3/languageModel.json b/schemas/v3/languageModel.json
index a952554b9..3f1d13d52 100644
--- a/schemas/v3/languageModel.json
+++ b/schemas/v3/languageModel.json
@@ -50,27 +50,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -114,27 +93,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -202,27 +160,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -262,27 +199,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -336,27 +252,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -412,27 +307,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -504,27 +378,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -564,27 +417,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -650,27 +482,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -726,27 +537,6 @@
                 "temperature": {
                     "type": "number",
                     "description": "Optional temperature setting to use with the model."
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -787,27 +577,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -851,27 +620,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -919,4 +667,4 @@
             "$ref": "#/definitions/XaiLanguageModel"
         }
     ]
-}
+}
\ No newline at end of file

From fb8843b36db9837bea432cea3a38d7d79d8a3e75 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 12:46:47 -0700
Subject: [PATCH 10/19] paste-to-attachment handling, raw paste keychord
 enabled, toast with fallback

---
 .../web/src/features/chat/attachmentUtils.ts  | 73 ++++++++++++++
 .../chat/components/chatBox/chatBox.tsx       | 96 ++++++++++++++++++-
 packages/web/src/features/chat/constants.ts   |  5 +
 3 files changed, 170 insertions(+), 4 deletions(-)

diff --git a/packages/web/src/features/chat/attachmentUtils.ts b/packages/web/src/features/chat/attachmentUtils.ts
index 75828cb32..ed04fe278 100644
--- a/packages/web/src/features/chat/attachmentUtils.ts
+++ b/packages/web/src/features/chat/attachmentUtils.ts
@@ -6,6 +6,8 @@ import {
     ATTACHMENT_MAX_COUNT,
     ATTACHMENT_MAX_FILENAME_LENGTH,
     ATTACHMENT_MAX_TEXT_BYTES,
+    ATTACHMENT_PASTE_AUTO_CONVERT_MIN_CHARS,
+    ATTACHMENT_PASTE_AUTO_CONVERT_MIN_LINES,
 } from "./constants";
 import { AttachmentData, TextAttachment } from "./types";
 
@@ -112,6 +114,77 @@ const readAsText = (file: File): Promise<string> => {
     });
 }
 
+// Decides whether a plain-text paste should become a text attachment instead
+// of being inserted inline. A paste qualifies when it reaches EITHER the
+// character threshold or the line threshold, so a multi-line snippet is
+// converted even when it contains few characters.
+export const shouldAutoConvertPaste = (text: string): boolean => {
+    const isLongEnough = text.length >= ATTACHMENT_PASTE_AUTO_CONVERT_MIN_CHARS;
+    // `||` short-circuits: lines are only counted for pastes under the character threshold.
+    return isLongEnough || countLines(text) >= ATTACHMENT_PASTE_AUTO_CONVERT_MIN_LINES;
+}
+
+export const countLines = (text: string): number => {
+    // An empty string has no lines. Otherwise lines = newline count + 1, so a
+    // trailing newline contributes one final (empty) line.
+    const newlineCount = text.match(/\n/g)?.length ?? 0;
+    return text.length === 0 ? 0 : newlineCount + 1;
+}
+
+// Picks a filename for an auto-converted paste that no existing attachment
+// already uses: `pasted.txt` first, then `pasted-2.txt`, `pasted-3.txt`, ...
+const getPastedAttachmentFilename = (existing: PendingAttachment[]): string => {
+    const takenNames = new Set(existing.map(({ filename }) => filename));
+    // The unsuffixed name is the first candidate. When it is taken, numbered
+    // suffixes are tried in order starting at 2 (there is no `pasted-1.txt`).
+    let candidate = 'pasted.txt';
+    let suffix = 2;
+    while (takenNames.has(candidate)) {
+        candidate = `pasted-${suffix}.txt`;
+        suffix += 1;
+    }
+    return candidate;
+}
+
+export type CreatePastedAttachmentResult =
+    | { ok: true; attachment: PendingAttachment } // success: the pending attachment that was built
+    | { ok: false; error: string }; // failure: human-readable reason the paste was rejected
+
+// Turns a pasted string into a pending `text/plain` attachment. The paste is
+// rejected (as an `ok: false` result carrying a user-facing message, never a
+// thrown error) when the message is already at the attachment count cap or
+// the text's byte size exceeds the per-attachment text cap.
+export const createPastedTextAttachment = (
+    text: string,
+    existing: PendingAttachment[],
+): CreatePastedAttachmentResult => {
+    const reject = (error: string): CreatePastedAttachmentResult => ({ ok: false, error });
+
+    // Count cap: checked first, before the text is measured.
+    const hasRoom = existing.length < ATTACHMENT_MAX_COUNT;
+    if (!hasRoom) {
+        return reject(`You can attach at most ${ATTACHMENT_MAX_COUNT} files per message.`);
+    }
+
+    // Size cap: measured in bytes via Blob rather than `text.length`, which
+    // counts UTF-16 code units.
+    const sizeBytes = new Blob([text]).size;
+    if (sizeBytes > ATTACHMENT_MAX_TEXT_BYTES) {
+        // The limit is reported to the user in whole kilobytes.
+        return reject(`Pasted text exceeds the ${Math.round(ATTACHMENT_MAX_TEXT_BYTES / 1024)}KB limit.`);
+    }
+
+    const attachment: PendingAttachment = {
+        id: crypto.randomUUID(),
+        kind: 'text',
+        filename: getPastedAttachmentFilename(existing),
+        mediaType: 'text/plain',
+        sizeBytes,
+        text,
+    };
+    return { ok: true, attachment };
+}
+
 export type ReadFilesResult = {
     attachments: PendingAttachment[];
     errors: string[];
diff --git a/packages/web/src/features/chat/components/chatBox/chatBox.tsx b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
index e8a624cb8..9ec5ffd73 100644
--- a/packages/web/src/features/chat/components/chatBox/chatBox.tsx
+++ b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
@@ -5,7 +5,7 @@ import { Button } from "@/components/ui/button";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { AttachmentData, CustomEditor, MentionElement, RenderElementPropsFor, SearchScope } from "@/features/chat/types";
 import { insertMention, slateContentToString } from "@/features/chat/utils";
-import { PendingAttachment, readFilesAsAttachments, toAttachmentData } from "@/features/chat/attachmentUtils";
+import { createPastedTextAttachment, PendingAttachment, readFilesAsAttachments, shouldAutoConvertPaste, toAttachmentData } from "@/features/chat/attachmentUtils";
 import { AttachmentButton } from "./attachmentButton";
 import { AttachmentTray } from "./attachmentTray";
 import { cn } from "@/lib/utils";
@@ -14,7 +14,7 @@ import { computePosition, flip, offset, shift, VirtualElement } from "@floating-
 import { ArrowUp, Loader2, StopCircleIcon } from "lucide-react";
 import { forwardRef, Fragment, KeyboardEvent, memo, Ref, useCallback, useEffect, useImperativeHandle, useMemo, useRef, useState } from "react";
 import { useHotkeys } from "react-hotkeys-hook";
-import { Descendant, insertText } from "slate";
+import { Descendant, Editor, insertText, Transforms } from "slate";
 import { Editable, ReactEditor, RenderElementProps, RenderLeafProps, useFocused, useSelected, useSlate } from "slate-react";
 import { useSelectedLanguageModel } from "../../useSelectedLanguageModel";
 import { SuggestionBox } from "./suggestionsBox";
@@ -22,6 +22,7 @@ import { Suggestion } from "./types";
 import { useSuggestionModeAndQuery } from "./useSuggestionModeAndQuery";
 import { useSuggestionsData } from "./useSuggestionsData";
 import { useToast } from "@/components/hooks/use-toast";
+import { ToastAction } from "@/components/ui/toast";
 import { SearchContextQuery } from "@/lib/types";
 import isEqual from "fast-deep-equal/react";
 import { LoginDialog } from "./loginDialog";
@@ -89,12 +90,70 @@ const ChatBoxComponent = ({
     });
     const { selectedLanguageModel } = useSelectedLanguageModel();
     const { toast } = useToast();
+    const isMac = useIsMac();
     const isAskEnabled = useHasEntitlement('ask');
     const [isLoginDialogOpen, setIsLoginDialogOpen] = useState<boolean>(false);
     const [isUpsellDialogOpen, setIsUpsellDialogOpen] = useState<boolean>(false);
     const [attachments, setAttachments] = useState<PendingAttachment[]>([]);
     const pathname = usePathname();
 
+    // Set when the user triggers a paste with the OS raw-paste chord
+    // (⌘⇧V / Ctrl+Shift+V). The subsequent `paste` event reads (and clears)
+    // this so the large-paste auto-conversion is skipped for that one paste.
+    const rawPasteRequestedRef = useRef<boolean>(false);
+
+    // Inserts `text` at the current selection. The editor is focused first;
+    // if it still has no selection at that point (e.g. focus was lost after a
+    // toast action), the caret is moved to the end of the document.
+    const insertTextInline = useCallback((text: string) => {
+        ReactEditor.focus(editor);
+        if (!editor.selection) {
+            Transforms.select(editor, Editor.end(editor, []));
+        }
+        insertText(editor, text);
+    }, [editor]);
+
+    // Turns a large paste into a text attachment and shows a toast offering to
+    // undo that. If the attachment cannot be created, the error is toasted and
+    // the text is inserted inline instead, so the paste is never lost.
+    const onAddPastedText = useCallback((text: string) => {
+        const result = createPastedTextAttachment(text, attachments);
+        if (!result.ok) {
+            toast({ description: `⚠️ ${result.error}`, variant: "destructive" });
+            insertTextInline(text);
+            return;
+        }
+
+        const pasted = result.attachment;
+        setAttachments((current) => current.concat(pasted));
+        // Toast "undo": drop the attachment again and insert the text inline.
+        const insertInlineInstead = () => {
+            setAttachments((current) => current.filter(({ id }) => id !== pasted.id));
+            insertTextInline(text);
+        };
+
+        toast({
+            title: "Added your paste as an attachment",
+            duration: 10 * 1000,
+            className: "w-fit ml-auto",
+            description: (
+                <div className="mt-2 flex flex-col gap-1.5">
+                    <ToastAction
+                        altText="Insert the pasted text inline instead"
+                        className="w-full justify-center"
+                        onClick={insertInlineInstead}
+                    >
+                        Insert inline instead
+                    </ToastAction>
+                    <span className="text-xs text-muted-foreground">
+                        {`Tip: paste with ${isMac ? "⌘⇧V" : "Ctrl+Shift+V"} to insert inline`}
+                    </span>
+                </div>
+            ),
+        });
+        ReactEditor.focus(editor);
+    }, [attachments, editor, toast, isMac, insertTextInline]);
+
     const onAddFiles = useCallback(async (files: File[]) => {
         if (files.length === 0) {
             return;
@@ -317,6 +376,16 @@ const ChatBoxComponent = ({
     }, [editor, range]);
 
     const onKeyDown = useCallback((event: KeyboardEvent<HTMLDivElement>) => {
+        // Detect the OS raw-paste chord so the upcoming `paste` event can skip
+        // the large-paste auto-conversion and insert inline instead.
+        if (
+            (event.key === 'v' || event.key === 'V') &&
+            event.shiftKey &&
+            (isMac ? event.metaKey : event.ctrlKey)
+        ) {
+            rawPasteRequestedRef.current = true;
+        }
+
         if (suggestionMode === "none") {
             switch (event.key) {
                 case 'Enter': {
@@ -363,7 +432,7 @@ const ChatBoxComponent = ({
                 }
             }
         }
-    }, [suggestionMode, suggestions, onSubmit, editor, index, onInsertSuggestion]);
+    }, [suggestionMode, suggestions, onSubmit, editor, index, onInsertSuggestion, isMac]);
 
     useEffect(() => {
         if (!range || !suggestionsBoxRef.current) {
@@ -422,11 +491,30 @@ const ChatBoxComponent = ({
                     onKeyDown={onKeyDown}
                     readOnly={isDisabled}
                     onPaste={(event) => {
-                        const files = event.clipboardData?.files ? Array.from(event.clipboardData.files) : [];
+                        // A raw-paste chord (⌘⇧V / Ctrl+Shift+V) bypasses auto-conversion for
+                        // this one paste. Consume the flag before any early return so it never leaks.
+                        const rawPasteRequested = rawPasteRequestedRef.current;
+                        rawPasteRequestedRef.current = false;
+
+                        const clipboardData = event.clipboardData;
+                        const files = clipboardData?.files ? Array.from(clipboardData.files) : [];
                         if (files.length > 0) {
                             event.preventDefault();
                             void onAddFiles(files);
+                            return;
+                        }
+
+                        if (rawPasteRequested) {
+                            return;
                         }
+
+                        const text = clipboardData?.getData('text/plain') ?? '';
+                        if (!shouldAutoConvertPaste(text)) {
+                            return;
+                        }
+
+                        event.preventDefault();
+                        onAddPastedText(text);
                     }}
                 />
                 <div className="flex flex-row items-center justify-end gap-1 z-10">
diff --git a/packages/web/src/features/chat/constants.ts b/packages/web/src/features/chat/constants.ts
index 4036e8c80..95b89e26e 100644
--- a/packages/web/src/features/chat/constants.ts
+++ b/packages/web/src/features/chat/constants.ts
@@ -20,6 +20,11 @@ export const ATTACHMENT_MAX_TEXT_BYTES = 256 * 1024; // 256KB per file
 export const ATTACHMENT_MAX_COUNT = 5; // per message
 export const ATTACHMENT_MAX_FILENAME_LENGTH = 200; // characters
 
+// A plain-text paste at or above either of these thresholds is automatically
+// converted into a text attachment instead of being inserted inline
+export const ATTACHMENT_PASTE_AUTO_CONVERT_MIN_CHARS = 1500;
+export const ATTACHMENT_PASTE_AUTO_CONVERT_MIN_LINES = 15;
+
 // Allowlist for inline-text attachments. Files are accepted if their MIME type
 // starts with `text/`, exactly matches an entry here, or their extension is in
 // ATTACHMENT_ALLOWED_TEXT_EXTENSIONS. Many code files report an empty MIME type

From bf792601ee6a0da559aed28d354812bc18bc8f5c Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 12:57:45 -0700
Subject: [PATCH 11/19] stronger typing for contract

---
 packages/web/src/features/chat/types.ts | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index 50792dcb8..659551d4f 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -209,15 +209,18 @@ type _AssertAllProviders = LanguageModelProvider extends typeof languageModelPro
 const _assertAllProviders: _AssertAllProviders = true;
 void _assertAllProviders;
 
-export type InputModality = 'text' | 'image' | 'audio' | 'video';
-export type DocumentType = 'pdf';
+export const inputModalities = ['text', 'image', 'audio', 'video'] as const;
+export type InputModality = typeof inputModalities[number];
+
+export const documentTypes = ['pdf'] as const;
+export type DocumentType = typeof documentTypes[number];
 
 export const languageModelInfoSchema = z.object({
     provider: z.enum(languageModelProviders).describe("The model provider (e.g., 'anthropic', 'openai')"),
     model: z.string().describe("The model ID"),
     displayName: z.string().optional().describe("Optional display name for the model"),
-    inputModalities: z.array(z.enum(['text', 'image', 'audio', 'video'])).default(['text']).describe("The input modalities the model can accept (images, audio, video, text). Single-medium attachments are gated by these. Defaults to text-only."),
-    supportedDocumentTypes: z.array(z.enum(['pdf'])).default([]).describe("Rich compound document formats (e.g. PDF) the model can ingest natively, distinct from single-medium attachments gated by inputModalities. Defaults to none."),
+    inputModalities: z.array(z.enum(inputModalities)).default(['text']).describe("The input modalities the model can accept (images, audio, video, text). Single-medium attachments are gated by these. Defaults to text-only."),
+    supportedDocumentTypes: z.array(z.enum(documentTypes)).default([]).describe("Rich compound document formats (e.g. PDF) the model can ingest natively, distinct from single-medium attachments gated by inputModalities. Defaults to none."),
 });
 
 /**

From dbcfc8a6036429193194705352194fed248ceb79 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 13:28:33 -0700
Subject: [PATCH 12/19] remove blocking models.dev catalog request and add
 cache warm on startup

---
 .../chat/modelCapabilities.server.test.ts     | 17 ++++++++--
 .../features/chat/modelsDevCatalog.server.ts  | 32 +++++++++++--------
 packages/web/src/initialize.ts                | 12 +++++++
 3 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/packages/web/src/features/chat/modelCapabilities.server.test.ts b/packages/web/src/features/chat/modelCapabilities.server.test.ts
index 0a2e9a2ec..4cd4121bf 100644
--- a/packages/web/src/features/chat/modelCapabilities.server.test.ts
+++ b/packages/web/src/features/chat/modelCapabilities.server.test.ts
@@ -104,17 +104,28 @@ describe('resolveModelCapabilities', () => {
         vi.unstubAllGlobals();
     });
 
-    test('fetches the catalog once and resolves capabilities (incl. provider mapping)', async () => {
+    test('fetches the catalog once in the background and resolves capabilities (incl. provider mapping)', async () => {
         const fetchMock = vi.fn(async () => ({
             ok: true,
             json: async () => catalog,
         }) as unknown as Response);
         vi.stubGlobal('fetch', fetchMock);
 
+        // The request path never blocks on the fetch: the first lookup kicks off
+        // the background fetch and falls back to text-only while it's in flight.
         expect(await resolveModelCapabilities(model('anthropic', 'claude-sonnet-4-5'))).toEqual({
-            inputModalities: ['text', 'image'],
-            supportedDocumentTypes: ['pdf'],
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
         });
+
+        // Once the background fetch settles, lookups resolve from the cached catalog.
+        await vi.waitFor(async () => {
+            expect(await resolveModelCapabilities(model('anthropic', 'claude-sonnet-4-5'))).toEqual({
+                inputModalities: ['text', 'image'],
+                supportedDocumentTypes: ['pdf'],
+            });
+        });
+
         // Subsequent lookups reuse the cached catalog rather than refetching.
         expect(await resolveModelCapabilities(model('google-generative-ai', 'gemini-2.5-pro'))).toEqual({
             inputModalities: ['text', 'image', 'audio', 'video'],
diff --git a/packages/web/src/features/chat/modelsDevCatalog.server.ts b/packages/web/src/features/chat/modelsDevCatalog.server.ts
index 8f6b35043..f2344b6f7 100644
--- a/packages/web/src/features/chat/modelsDevCatalog.server.ts
+++ b/packages/web/src/features/chat/modelsDevCatalog.server.ts
@@ -13,8 +13,10 @@ const FETCH_TIMEOUT_MS = 8000;
 // Re-fetch the (~2.4 MB) catalog at most once per this interval per server
 // process. New models trickle in daily; a stale window for a few hours is fine.
 const CATALOG_TTL_MS = 6 * 60 * 60 * 1000;
-// After a failed fetch, don't reattempt for this long. Without it, an outage in
-// models.dev would make every chat send pay the fetch timeout on the request path.
+// After a failed fetch, don't reattempt for this long. Since the request path
+// never blocks on the fetch (see loadCatalog), this throttles background
+// refresh attempts to once per interval during a models.dev outage instead of
+// kicking one off on (nearly) every request.
 const NEGATIVE_CACHE_MS = 60 * 1000;
 
 // Sourcebot provider id -> models.dev top-level catalog key. Only providers
@@ -75,9 +77,16 @@ const fetchCatalog = async (): Promise<ModelsDevCatalog | null> => {
 
 /**
  * Returns the cached models.dev catalog, refreshing it in the background when
- * stale. Only the very first load blocks on the network; thereafter the
- * last-known-good catalog is served immediately (even if stale) so the request
- * path never waits on models.dev.
+ * stale. The request path NEVER blocks on the network: the last-known-good
+ * catalog is returned immediately (even if stale), or null before the first
+ * successful fetch lands, and any refresh settles in the background.
+ *
+ * Consequences of never awaiting:
+ * - For the brief window after a cold start (before the first fetch resolves),
+ *   capability resolution falls back to text-only; it self-heals on the next
+ *   request once the background fetch populates the cache.
+ * - An unreachable catalog (e.g. an airgapped deployment) costs nothing on the
+ *   request path instead of repeatedly paying the fetch timeout.
  */
 export const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
     const now = Date.now();
@@ -87,7 +96,8 @@ export const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
     // Kick off a (deduped) refresh when the cache is stale/empty and we're not
     // within the post-failure backoff window. On success it replaces the cache;
     // on failure it only records the failure time, leaving the last-known-good
-    // catalog intact.
+    // catalog intact. The promise is intentionally not awaited here so the
+    // request path never waits on models.dev.
     if (!isFresh && !isBackingOff && !inFlightFetch) {
         inFlightFetch = fetchCatalog().then((catalog) => {
             if (catalog) {
@@ -101,11 +111,7 @@ export const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
         });
     }
 
-    // Once a catalog has loaded once, never block the request path on the
-    // network: serve the last-known-good value (even if stale) and let any
-    // refresh settle in the background. Only the very first load awaits.
-    if (cachedCatalog !== null) {
-        return cachedCatalog;
-    }
-    return inFlightFetch ?? null;
+    // Serve whatever we currently have cached (possibly null on a cold start)
+    // and let any in-flight refresh settle in the background.
+    return cachedCatalog;
 };
diff --git a/packages/web/src/initialize.ts b/packages/web/src/initialize.ts
index 0a8eb90f9..a63581ad1 100644
--- a/packages/web/src/initialize.ts
+++ b/packages/web/src/initialize.ts
@@ -4,6 +4,8 @@ import { startChangelogPollingJob } from '@/features/changelog/pollChangelog';
 import { createLogger, env } from "@sourcebot/shared";
 import { hasEntitlement } from '@/lib/entitlements';
 import { SINGLE_TENANT_ORG_ID } from './lib/constants';
+import { getConfiguredLanguageModels } from '@/features/chat/utils.server';
+import { loadCatalog } from '@/features/chat/modelsDevCatalog.server';
 
 const logger = createLogger('web-initialize');
 
@@ -73,8 +75,18 @@ const init = async () => {
     }
 }
 
+const warmModelCapabilitiesCatalog = async () => {
+    const configuredModels = await getConfiguredLanguageModels();
+    if (configuredModels.length === 0) {
+        return;
+    }
+    logger.info(`Warming models.dev capability catalog for ${configuredModels.length} configured language model(s)`);
+    void loadCatalog();
+};
+
 (async () => {
     await init();
     startServicePingCronJob();
     startChangelogPollingJob();
+    await warmModelCapabilitiesCatalog();
 })();

From 7ba297bca6208a1f0cf26e5b6b4d19dd5d91886c Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 13:45:34 -0700
Subject: [PATCH 13/19] cleanup warming

---
 .../features/chat/modelContextWindow.test.ts  | 13 ++++++++--
 .../web/src/features/chat/utils.server.ts     | 26 ++++++++++++++++++-
 packages/web/src/initialize.ts                | 14 ++--------
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/packages/web/src/features/chat/modelContextWindow.test.ts b/packages/web/src/features/chat/modelContextWindow.test.ts
index 9476820ae..818251a3f 100644
--- a/packages/web/src/features/chat/modelContextWindow.test.ts
+++ b/packages/web/src/features/chat/modelContextWindow.test.ts
@@ -81,13 +81,19 @@ describe('resolveContextWindow', () => {
         vi.unstubAllGlobals();
     });
 
-    test('fetches the catalog once and resolves windows (incl. provider mapping)', async () => {
+    test('fetches the catalog once in the background and resolves windows (incl. provider mapping)', async () => {
         const fetchMock = vi.fn(async () => ({
             ok: true,
             json: async () => catalog,
         }) as unknown as Response);
         vi.stubGlobal('fetch', fetchMock);
 
+        // The request path never blocks on the fetch: the first lookup kicks off
+        // the background fetch and falls back to "unknown" while it's in flight.
+        expect(await resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBeUndefined();
+
+        // Once the background fetch settles, lookups resolve from the cached catalog.
+        await new Promise((resolve) => setTimeout(resolve, 0));
         expect(await resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBe(200000);
         // Subsequent lookups reuse the cached catalog rather than refetching.
         expect(await resolveContextWindow(model('google-generative-ai', 'gemini-2.5-pro'))).toBe(1048576);
@@ -141,7 +147,10 @@ describe('resolveContextWindow resilience', () => {
 
         const mod = await importFresh();
 
-        // First load populates the cache.
+        // First load kicks off the background fetch (returning the "unknown"
+        // fallback until it settles), which then populates the cache.
+        expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBeUndefined();
+        await new Promise((resolve) => setTimeout(resolve, 0));
         expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBe(200000);
         expect(fetchMock).toHaveBeenCalledTimes(1);
 
diff --git a/packages/web/src/features/chat/utils.server.ts b/packages/web/src/features/chat/utils.server.ts
index a458c41d8..90c83c859 100644
--- a/packages/web/src/features/chat/utils.server.ts
+++ b/packages/web/src/features/chat/utils.server.ts
@@ -3,16 +3,19 @@ import 'server-only';
 import { getAnonymousId } from '@/lib/anonymousId';
 import { Chat, Prisma, PrismaClient, User } from '@sourcebot/db';
 import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
-import { env, loadConfig } from '@sourcebot/shared';
+import { createLogger, env, loadConfig } from '@sourcebot/shared';
 import fs from 'fs';
 import path from 'path';
 import { LanguageModelInfo, SBChatMessage } from './types';
 import { resolveModelCapabilities } from './modelCapabilities.server';
+import { loadCatalog } from './modelsDevCatalog.server';
 import { hasEntitlement } from '@/lib/entitlements';
 import { ServiceError } from '@/lib/serviceError';
 import { ErrorCode } from '@/lib/errorCodes';
 import { StatusCodes } from 'http-status-codes';
 
+const logger = createLogger('chat-utils');
+
 /**
  * Returns a FORBIDDEN ServiceError when the deployment lacks the `ask`
  * entitlement, or null when Ask is available. Gates the generative chat
@@ -139,3 +142,24 @@ export const getConfiguredLanguageModelsInfo = async () => {
         };
     }));
 };
+
+/**
+ * Warms the models.dev capability catalog at server startup so the first request
+ * after a cold start can resolve real model capabilities instead of the text-only
+ * fallback. No-op when no language models are configured, which avoids an outbound
+ * call for deployments not using Ask. Fire-and-forget: returns immediately, and any
+ * error from the config lookup or from loadCatalog is logged rather than surfaced.
+ */
+export const warmModelCapabilitiesCatalog = (): void => {
+    void (async () => {
+        const configuredModels = await getConfiguredLanguageModels();
+        if (configuredModels.length === 0) {
+            return;
+        }
+        logger.info(`Warming models.dev capability catalog for ${configuredModels.length} configured language model(s)`);
+        // Awaited (not detached) so a rejection reaches the .catch below and is logged.
+        await loadCatalog();
+    })().catch((error) => {
+        logger.error(`Failed to warm models.dev capability catalog: ${error}`);
+    });
+};
diff --git a/packages/web/src/initialize.ts b/packages/web/src/initialize.ts
index a63581ad1..406116dee 100644
--- a/packages/web/src/initialize.ts
+++ b/packages/web/src/initialize.ts
@@ -4,8 +4,7 @@ import { startChangelogPollingJob } from '@/features/changelog/pollChangelog';
 import { createLogger, env } from "@sourcebot/shared";
 import { hasEntitlement } from '@/lib/entitlements';
 import { SINGLE_TENANT_ORG_ID } from './lib/constants';
-import { getConfiguredLanguageModels } from '@/features/chat/utils.server';
-import { loadCatalog } from '@/features/chat/modelsDevCatalog.server';
+import { warmModelCapabilitiesCatalog } from '@/features/chat/utils.server';
 
 const logger = createLogger('web-initialize');
 
@@ -75,18 +74,9 @@ const init = async () => {
     }
 }
 
-const warmModelCapabilitiesCatalog = async () => {
-    const configuredModels = await getConfiguredLanguageModels();
-    if (configuredModels.length === 0) {
-        return;
-    }
-    logger.info(`Warming models.dev capability catalog for ${configuredModels.length} configured language model(s)`);
-    void loadCatalog();
-};
-
 (async () => {
     await init();
     startServicePingCronJob();
     startChangelogPollingJob();
-    await warmModelCapabilitiesCatalog();
+    warmModelCapabilitiesCatalog();
 })();

From 070f8322844b01fcc4b74c8eb4d28a5668257e9f Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 13:55:26 -0700
Subject: [PATCH 14/19] remove button from toast

---
 .../chat/components/chatBox/chatBox.tsx       | 23 +++----------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/packages/web/src/features/chat/components/chatBox/chatBox.tsx b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
index 9ec5ffd73..a6212cc08 100644
--- a/packages/web/src/features/chat/components/chatBox/chatBox.tsx
+++ b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
@@ -22,7 +22,6 @@ import { Suggestion } from "./types";
 import { useSuggestionModeAndQuery } from "./useSuggestionModeAndQuery";
 import { useSuggestionsData } from "./useSuggestionsData";
 import { useToast } from "@/components/hooks/use-toast";
-import { ToastAction } from "@/components/ui/toast";
 import { SearchContextQuery } from "@/lib/types";
 import isEqual from "fast-deep-equal/react";
 import { LoginDialog } from "./loginDialog";
@@ -129,26 +128,10 @@ const ChatBoxComponent = ({
         setAttachments((prev) => [...prev, attachment]);
 
         toast({
-            title: "Added your paste as an attachment",
-            duration: 10 * 1000,
+            title: "Large paste added as an attachment",
+            duration: 5 * 1000,
             className: "w-fit ml-auto",
-            description: (
-                <div className="mt-2 flex flex-col gap-1.5">
-                    <ToastAction
-                        altText="Insert the pasted text inline instead"
-                        className="w-full justify-center"
-                        onClick={() => {
-                            setAttachments((prev) => prev.filter((item) => item.id !== attachment.id));
-                            insertTextInline(text);
-                        }}
-                    >
-                        Insert inline instead
-                    </ToastAction>
-                    <span className="text-xs text-muted-foreground">
-                        {`Tip: paste with ${isMac ? "⌘⇧V" : "Ctrl+Shift+V"} to insert inline`}
-                    </span>
-                </div>
-            ),
+            description: `Use ${isMac ? "⌘+⇧+V" : "Ctrl+Shift+V"} to paste inline instead`,
         });
 
         ReactEditor.focus(editor);

From a62a25bbd2f92e2f702aa2cd6f7f441c2906858a Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 14:05:34 -0700
Subject: [PATCH 15/19] add separate state to track pending attachments to
 avoid visual flicker of clear-on-submit

---
 .../web/src/features/chat/components/chatBox/chatBox.tsx    | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/web/src/features/chat/components/chatBox/chatBox.tsx b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
index a6212cc08..38ee39ee1 100644
--- a/packages/web/src/features/chat/components/chatBox/chatBox.tsx
+++ b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
@@ -94,6 +94,7 @@ const ChatBoxComponent = ({
     const [isLoginDialogOpen, setIsLoginDialogOpen] = useState<boolean>(false);
     const [isUpsellDialogOpen, setIsUpsellDialogOpen] = useState<boolean>(false);
     const [attachments, setAttachments] = useState<PendingAttachment[]>([]);
+    const [submittedAttachments, setSubmittedAttachments] = useState<PendingAttachment[]>([]);
     const pathname = usePathname();
 
     // Set when the user triggers a paste with the OS raw-paste chord
@@ -285,6 +286,7 @@ const ChatBoxComponent = ({
         }
 
         _onSubmit(editor.children, editor, attachments.map(toAttachmentData));
+        setSubmittedAttachments(attachments);
         setAttachments([]);
     }, [
         isSubmitDisabled,
@@ -459,9 +461,9 @@ const ChatBoxComponent = ({
             <div
                 className={cn("flex flex-col justify-between gap-0.5 w-full px-3 py-2", className)}
             >
-                {attachments.length > 0 && (
+                {(isRedirecting ? submittedAttachments : attachments).length > 0 && (
                     <AttachmentTray
-                        attachments={attachments}
+                        attachments={isRedirecting ? submittedAttachments : attachments}
                         onRemove={removeAttachment}
                         className="mb-1.5"
                     />

From af8a9db8db3ab11c2727a94b9e2b4584f6e54fda Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 14:14:54 -0700
Subject: [PATCH 16/19] explicitly import already hoisted uuid and change bad
 crypto call for uuid use

---
 packages/web/package.json                         | 1 +
 packages/web/src/features/chat/attachmentUtils.ts | 5 +++--
 yarn.lock                                         | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/packages/web/package.json b/packages/web/package.json
index be934fbe4..61f1dade0 100644
--- a/packages/web/package.json
+++ b/packages/web/package.json
@@ -201,6 +201,7 @@
     "tailwindcss-animate": "^1.0.7",
     "use-stick-to-bottom": "^1.1.3",
     "usehooks-ts": "^3.1.0",
+    "uuid": "^14.0.0",
     "vscode-icons-js": "^11.6.1",
     "zod": "^3.25.76",
     "zod-to-json-schema": "^3.24.5"
diff --git a/packages/web/src/features/chat/attachmentUtils.ts b/packages/web/src/features/chat/attachmentUtils.ts
index ed04fe278..7ff9fd2f6 100644
--- a/packages/web/src/features/chat/attachmentUtils.ts
+++ b/packages/web/src/features/chat/attachmentUtils.ts
@@ -10,6 +10,7 @@ import {
     ATTACHMENT_PASTE_AUTO_CONVERT_MIN_LINES,
 } from "./constants";
 import { AttachmentData, TextAttachment } from "./types";
+import { v4 as uuidv4 } from "uuid";
 
 // Normalizes an untrusted filename: keeps only the basename, drops control
 // characters (which could break the prompt's `<attachment filename="...">` tag
@@ -175,7 +176,7 @@ export const createPastedTextAttachment = (
     return {
         ok: true,
         attachment: {
-            id: crypto.randomUUID(),
+            id: uuidv4(),
             kind: 'text',
             filename: getPastedAttachmentFilename(existing),
             mediaType: 'text/plain',
@@ -220,7 +221,7 @@ export const readFilesAsAttachments = async (
         try {
             const text = await readAsText(file);
             attachments.push({
-                id: crypto.randomUUID(),
+                id: uuidv4(),
                 kind: 'text',
                 filename: sanitizeFilename(file.name),
                 mediaType: file.type || 'text/plain',
diff --git a/yarn.lock b/yarn.lock
index a75d9eb9d..c0c1cb7ad 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -9543,6 +9543,7 @@ __metadata:
     typescript-eslint: "npm:^8.56.1"
     use-stick-to-bottom: "npm:^1.1.3"
     usehooks-ts: "npm:^3.1.0"
+    uuid: "npm:^14.0.0"
     vite-tsconfig-paths: "npm:^5.1.3"
     vitest: "npm:^4.1.4"
     vitest-mock-extended: "npm:^4.0.0"

From 2222dba6ec772f53e2dc1668b09c6fa40cae1f45 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 14:14:59 -0700
Subject: [PATCH 17/19] add changelog entry

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b0725af08..e14eb5746 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [EE] Added mermaid diagram rendering to Ask Sourcebot answers, with pan/zoom, copy/export, in-thread deep links, and an interleaved right-panel view. [#1369](https://github.com/sourcebot-dev/sourcebot/pull/1369)
 - [EE] Added a context-window usage gauge to the Ask Sourcebot chat details, showing how much of the selected model's context window each turn occupies. Window sizes are resolved from the models.dev catalog. [#1370](https://github.com/sourcebot-dev/sourcebot/pull/1370)
 - Added language model input-modality and document capability resolution, automatically resolved from the models.dev catalog (falls back to text-only for uncatalogued/self-hosted models). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
+- [EE] Added text file attachments to Ask Sourcebot, letting users attach text/code/config files to a chat message via the paperclip button, drag-and-drop, or paste, with large pastes auto-converted to attachments. [#1374](https://github.com/sourcebot-dev/sourcebot/pull/1374)
 
 ### Fixed
 - Send anonymous server-side PostHog events as personless so unauthenticated requests don't inflate person counts. [#1367](https://github.com/sourcebot-dev/sourcebot/pull/1367)

From 14a237d2719bba6163764c4f5ba851a01302cb55 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 15:15:06 -0700
Subject: [PATCH 18/19] remove granular file attachment controls in favor of
 single per-message cap of around 60-80k tokens

---
 packages/web/src/ee/features/chat/agent.ts    |   8 +-
 .../web/src/features/chat/attachmentUtils.ts  | 100 ++++++------------
 .../components/chatBox/attachmentTray.tsx     |  22 ++--
 .../chat/components/chatBox/chatBox.tsx       |  85 ++++++++-------
 packages/web/src/features/chat/constants.ts   |  10 +-
 packages/web/src/features/chat/utils.ts       |  30 +++---
 6 files changed, 115 insertions(+), 140 deletions(-)

diff --git a/packages/web/src/ee/features/chat/agent.ts b/packages/web/src/ee/features/chat/agent.ts
index df95c52b0..e6cc49ca3 100644
--- a/packages/web/src/ee/features/chat/agent.ts
+++ b/packages/web/src/ee/features/chat/agent.ts
@@ -23,7 +23,6 @@ import _dedent from "dedent";
 import { ANSWER_TAG, FILE_REFERENCE_PREFIX } from "@/features/chat/constants";
 import { Source } from "@/features/chat/types";
 import { addLineNumbers, fileReferenceToString, formatAttachmentsForPrompt, getAnswerPartFromAssistantMessage, getTurnProgressState, getUserMessageAttachments, getUserMessageText } from "@/features/chat/utils";
-import { ATTACHMENT_MAX_TEXT_BYTES } from "@/features/chat/constants";
 import { createTools } from "./tools";
 import { getConnectedMcpClients } from "@/ee/features/chat/mcp/mcpClientFactory";
 import { getMcpTools, McpToolsResult } from "@/ee/features/chat/mcp/mcpToolSets";
@@ -106,13 +105,12 @@ export const createMessageStream = async ({
     let messageHistory: ModelMessage[] =
         messages.map((message, index): ModelMessage | undefined => {
             if (message.role === 'user') {
-                // Fold any inline-text attachments into this turn's content (not
-                // the system prompt) so they stay bound to the turn they were
-                // attached to and are re-emitted per turn from the persisted parts.
+                // Fold inline-text attachments into this turn's content (not the
+                // system prompt) so they stay bound to their turn, re-emitted from
+                // the persisted parts.
                 const text = getUserMessageText(message);
                 const attachmentsBlock = formatAttachmentsForPrompt(
                     getUserMessageAttachments(message),
-                    ATTACHMENT_MAX_TEXT_BYTES,
                 );
                 return {
                     role: 'user',
diff --git a/packages/web/src/features/chat/attachmentUtils.ts b/packages/web/src/features/chat/attachmentUtils.ts
index 7ff9fd2f6..4ab042532 100644
--- a/packages/web/src/features/chat/attachmentUtils.ts
+++ b/packages/web/src/features/chat/attachmentUtils.ts
@@ -3,22 +3,18 @@
 import {
     ATTACHMENT_ALLOWED_TEXT_EXTENSIONS,
     ATTACHMENT_ALLOWED_TEXT_MIME_TYPES,
-    ATTACHMENT_MAX_COUNT,
-    ATTACHMENT_MAX_FILENAME_LENGTH,
-    ATTACHMENT_MAX_TEXT_BYTES,
+    ATTACHMENT_MAX_TURN_TEXT_BYTES,
     ATTACHMENT_PASTE_AUTO_CONVERT_MIN_CHARS,
     ATTACHMENT_PASTE_AUTO_CONVERT_MIN_LINES,
 } from "./constants";
 import { AttachmentData, TextAttachment } from "./types";
 import { v4 as uuidv4 } from "uuid";
 
-// Normalizes an untrusted filename: keeps only the basename, drops control
-// characters (which could break the prompt's `<attachment filename="...">` tag
-// or the UI), collapses whitespace, and caps the length while preserving the
-// extension. Long/abusive names are truncated rather than rejected.
+// Normalizes an untrusted filename: basename only, strips control chars (which
+// could break the `<attachment filename="...">` tag or UI), collapses whitespace.
 export const sanitizeFilename = (name: string): string => {
     const basename = name.split(/[\\/]/).pop() ?? name;
-    const cleaned = Array.from(basename)
+    return Array.from(basename)
         .filter((char) => {
             const code = char.charCodeAt(0);
             return code >= 32 && code !== 127;
@@ -26,16 +22,6 @@ export const sanitizeFilename = (name: string): string => {
         .join('')
         .replace(/\s+/g, ' ')
         .trim() || 'attachment';
-
-    if (cleaned.length <= ATTACHMENT_MAX_FILENAME_LENGTH) {
-        return cleaned;
-    }
-
-    const dotIndex = cleaned.lastIndexOf('.');
-    const extension = dotIndex > 0 ? cleaned.slice(dotIndex) : '';
-    const stem = dotIndex > 0 ? cleaned.slice(0, dotIndex) : cleaned;
-    const keep = Math.max(1, ATTACHMENT_MAX_FILENAME_LENGTH - extension.length - 1);
-    return `${stem.slice(0, keep)}…${extension}`;
 }
 
 // A text attachment selected in the chat box but not yet submitted. The `id`
@@ -53,10 +39,8 @@ export const getAttachmentAcceptAttribute = (): string => {
     ].join(',');
 }
 
-// Builds the `accept` map for react-dropzone (and the native file picker) so
-// the OS dialog and drag overlay only surface supported text file types. The
-// extension list is attached to `text/plain` so code files that report an empty
-// or unusual MIME type are still selectable by extension.
+// Builds react-dropzone's `accept` map. Extensions are attached to `text/plain`
+// so code files that report an empty/unusual MIME type are still selectable.
 export const getAttachmentDropzoneAccept = (): Record<string, string[]> => {
     const accept: Record<string, string[]> = {
         'text/*': [],
@@ -68,6 +52,14 @@ export const getAttachmentDropzoneAccept = (): Record<string, string[]> => {
     return accept;
 }
 
+// Total UTF-8 byte size of a turn's submitted text (prompt + attachment bodies),
+// checked against ATTACHMENT_MAX_TURN_TEXT_BYTES at submit time.
+export const getSubmittedTextBytes = (text: string, attachments: PendingAttachment[]): number => {
+    const textBytes = new TextEncoder().encode(text).length;
+    const attachmentBytes = attachments.reduce((sum, attachment) => sum + attachment.sizeBytes, 0);
+    return textBytes + attachmentBytes;
+}
+
 export const toAttachmentData = (attachment: PendingAttachment): AttachmentData => {
     return {
         kind: attachment.kind,
@@ -115,9 +107,8 @@ const readAsText = (file: File): Promise<string> => {
     });
 }
 
-// Whether a plain-text paste is "large" enough to be automatically converted
-// into a text attachment rather than inserted inline. Gated on both length and
-// shape so a single long sentence isn't swept up, but a multi-line snippet is.
+// Whether a plain-text paste is large enough to auto-convert into an attachment
+// instead of being inserted inline. Gated on length or line count.
 export const shouldAutoConvertPaste = (text: string): boolean => {
     if (text.length >= ATTACHMENT_PASTE_AUTO_CONVERT_MIN_CHARS) {
         return true;
@@ -147,42 +138,19 @@ const getPastedAttachmentFilename = (existing: PendingAttachment[]): string => {
     return `pasted-${index}.txt`;
 }
 
-export type CreatePastedAttachmentResult =
-    | { ok: true; attachment: PendingAttachment }
-    | { ok: false; error: string };
-
-// Builds a pending text attachment from a pasted string, enforcing the same
-// per-message count and per-attachment size caps as file attachments. Returns
-// a human-readable error instead of throwing when a cap is exceeded.
+// Builds a pending text attachment from a pasted string. The per-turn text
+// budget is enforced once at submit time, not here, so this can't fail.
 export const createPastedTextAttachment = (
     text: string,
     existing: PendingAttachment[],
-): CreatePastedAttachmentResult => {
-    if (existing.length >= ATTACHMENT_MAX_COUNT) {
-        return {
-            ok: false,
-            error: `You can attach at most ${ATTACHMENT_MAX_COUNT} files per message.`,
-        };
-    }
-
-    const sizeBytes = new Blob([text]).size;
-    if (sizeBytes > ATTACHMENT_MAX_TEXT_BYTES) {
-        return {
-            ok: false,
-            error: `Pasted text exceeds the ${Math.round(ATTACHMENT_MAX_TEXT_BYTES / 1024)}KB limit.`,
-        };
-    }
-
+): PendingAttachment => {
     return {
-        ok: true,
-        attachment: {
-            id: uuidv4(),
-            kind: 'text',
-            filename: getPastedAttachmentFilename(existing),
-            mediaType: 'text/plain',
-            sizeBytes,
-            text,
-        },
+        id: uuidv4(),
+        kind: 'text',
+        filename: getPastedAttachmentFilename(existing),
+        mediaType: 'text/plain',
+        sizeBytes: new Blob([text]).size,
+        text,
     };
 }
 
@@ -191,30 +159,23 @@ export type ReadFilesResult = {
     errors: string[];
 };
 
-// Reads and validates a set of files into pending text attachments, enforcing
-// the per-message count, per-file size, and allowed-type caps. Rejected files
-// produce a human-readable error message instead of throwing.
+// Reads files into pending text attachments, rejecting non-text files and any
+// file larger than the per-turn budget (skipped before reading to avoid loading
+// a huge file into memory). The aggregate budget is enforced at submit time.
 export const readFilesAsAttachments = async (
     files: File[],
-    existingCount: number,
 ): Promise<ReadFilesResult> => {
     const attachments: PendingAttachment[] = [];
     const errors: string[] = [];
-    let count = existingCount;
 
     for (const file of files) {
-        if (count >= ATTACHMENT_MAX_COUNT) {
-            errors.push(`You can attach at most ${ATTACHMENT_MAX_COUNT} files per message.`);
-            break;
-        }
-
         if (!isAllowedTextFile(file)) {
             errors.push(`${file.name}: unsupported file type (text files only).`);
             continue;
         }
 
-        if (file.size > ATTACHMENT_MAX_TEXT_BYTES) {
-            errors.push(`${file.name}: exceeds the ${Math.round(ATTACHMENT_MAX_TEXT_BYTES / 1024)}KB limit.`);
+        if (file.size > ATTACHMENT_MAX_TURN_TEXT_BYTES) {
+            errors.push(`${file.name}: exceeds the ${Math.round(ATTACHMENT_MAX_TURN_TEXT_BYTES / 1024)}KB per-message limit.`);
             continue;
         }
 
@@ -228,7 +189,6 @@ export const readFilesAsAttachments = async (
                 sizeBytes: file.size,
                 text,
             });
-            count++;
         } catch {
             errors.push(`${file.name}: failed to read file.`);
         }
diff --git a/packages/web/src/features/chat/components/chatBox/attachmentTray.tsx b/packages/web/src/features/chat/components/chatBox/attachmentTray.tsx
index 090320e92..2646fa93b 100644
--- a/packages/web/src/features/chat/components/chatBox/attachmentTray.tsx
+++ b/packages/web/src/features/chat/components/chatBox/attachmentTray.tsx
@@ -9,7 +9,9 @@ import { AttachmentViewerDialog } from "./attachmentViewerDialog";
 
 interface AttachmentTrayProps {
     attachments: PendingAttachment[];
-    onRemove: (id: string) => void;
+    // Omitted when the tray is read-only (e.g. while a submission is
+    // redirecting); the remove control is hidden in that case.
+    onRemove?: (id: string) => void;
     className?: string;
 }
 
@@ -39,14 +41,16 @@ export const AttachmentTray = ({ attachments, onRemove, className }: AttachmentT
                                 {attachment.filename}
                             </span>
                         </button>
-                        <button
-                            type="button"
-                            onClick={() => onRemove(attachment.id)}
-                            className="text-muted-foreground hover:text-foreground"
-                            aria-label={`Remove ${attachment.filename}`}
-                        >
-                            <X className="w-3 h-3" />
-                        </button>
+                        {onRemove && (
+                            <button
+                                type="button"
+                                onClick={() => onRemove(attachment.id)}
+                                className="text-muted-foreground hover:text-foreground"
+                                aria-label={`Remove ${attachment.filename}`}
+                            >
+                                <X className="w-3 h-3" />
+                            </button>
+                        )}
                     </div>
                 ))}
             </div>
diff --git a/packages/web/src/features/chat/components/chatBox/chatBox.tsx b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
index 38ee39ee1..1e6bac099 100644
--- a/packages/web/src/features/chat/components/chatBox/chatBox.tsx
+++ b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
@@ -5,7 +5,7 @@ import { Button } from "@/components/ui/button";
 import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
 import { AttachmentData, CustomEditor, MentionElement, RenderElementPropsFor, SearchScope } from "@/features/chat/types";
 import { insertMention, slateContentToString } from "@/features/chat/utils";
-import { createPastedTextAttachment, PendingAttachment, readFilesAsAttachments, shouldAutoConvertPaste, toAttachmentData } from "@/features/chat/attachmentUtils";
+import { createPastedTextAttachment, getSubmittedTextBytes, PendingAttachment, readFilesAsAttachments, shouldAutoConvertPaste, toAttachmentData } from "@/features/chat/attachmentUtils";
 import { AttachmentButton } from "./attachmentButton";
 import { AttachmentTray } from "./attachmentTray";
 import { cn } from "@/lib/utils";
@@ -14,7 +14,7 @@ import { computePosition, flip, offset, shift, VirtualElement } from "@floating-
 import { ArrowUp, Loader2, StopCircleIcon } from "lucide-react";
 import { forwardRef, Fragment, KeyboardEvent, memo, Ref, useCallback, useEffect, useImperativeHandle, useMemo, useRef, useState } from "react";
 import { useHotkeys } from "react-hotkeys-hook";
-import { Descendant, Editor, insertText, Transforms } from "slate";
+import { Descendant, insertText } from "slate";
 import { Editable, ReactEditor, RenderElementProps, RenderLeafProps, useFocused, useSelected, useSlate } from "slate-react";
 import { useSelectedLanguageModel } from "../../useSelectedLanguageModel";
 import { SuggestionBox } from "./suggestionsBox";
@@ -26,7 +26,7 @@ import { SearchContextQuery } from "@/lib/types";
 import isEqual from "fast-deep-equal/react";
 import { LoginDialog } from "./loginDialog";
 import { usePathname } from "next/navigation";
-import { PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY } from "@/features/chat/constants";
+import { ATTACHMENT_MAX_TURN_TEXT_BYTES, PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY } from "@/features/chat/constants";
 import useCaptureEvent from "@/hooks/useCaptureEvent";
 import { useHasEntitlement } from "@/features/entitlements/useHasEntitlement";
 import { UpsellDialog } from "@/features/billing/upsellDialog";
@@ -102,61 +102,60 @@ const ChatBoxComponent = ({
     // this so the large-paste auto-conversion is skipped for that one paste.
     const rawPasteRequestedRef = useRef<boolean>(false);
 
-    // Inserts text at the current selection, falling back to the end of the
-    // document if the editor has no selection (e.g. focus was lost after a
-    // toast action).
-    const insertTextInline = useCallback((text: string) => {
-        ReactEditor.focus(editor);
-        if (!editor.selection) {
-            Transforms.select(editor, Editor.end(editor, []));
+    // Warning shown when prompt text + `nextAttachments` would exceed the per-turn
+    // budget, so an over-budget add surfaces immediately instead of just disabling submit.
+    const getOverBudgetWarning = useCallback((nextAttachments: PendingAttachment[]): string | null => {
+        const totalBytes = getSubmittedTextBytes(slateContentToString(editor.children), nextAttachments);
+        if (totalBytes <= ATTACHMENT_MAX_TURN_TEXT_BYTES) {
+            return null;
         }
-        insertText(editor, text);
+        return `Attachments exceed the ${Math.round(ATTACHMENT_MAX_TURN_TEXT_BYTES / 1024)}KB per-message limit. Remove a file or shorten your message to send.`;
     }, [editor]);
 
     const onAddPastedText = useCallback((text: string) => {
-        const result = createPastedTextAttachment(text, attachments);
-        if (!result.ok) {
+        const attachment = createPastedTextAttachment(text, attachments);
+        setAttachments((prev) => [...prev, attachment]);
+
+        const overBudgetWarning = getOverBudgetWarning([...attachments, attachment]);
+        if (overBudgetWarning) {
             toast({
-                description: `⚠️ ${result.error}`,
+                description: `⚠️ ${overBudgetWarning}`,
                 variant: "destructive",
             });
-            // Don't lose the user's paste: fall back to inserting it inline.
-            insertTextInline(text);
-            return;
+        } else {
+            toast({
+                title: "Large paste added as an attachment",
+                duration: 5 * 1000,
+                className: "w-fit ml-auto",
+                description: `Use ${isMac ? "⌘+⇧+V" : "Ctrl+Shift+V"} to paste inline instead`,
+            });
         }
 
-        const { attachment } = result;
-        setAttachments((prev) => [...prev, attachment]);
-
-        toast({
-            title: "Large paste added as an attachment",
-            duration: 5 * 1000,
-            className: "w-fit ml-auto",
-            description: `Use ${isMac ? "⌘+⇧+V" : "Ctrl+Shift+V"} to paste inline instead`,
-        });
-
         ReactEditor.focus(editor);
-    }, [attachments, editor, toast, isMac, insertTextInline]);
+    }, [attachments, editor, toast, isMac, getOverBudgetWarning]);
 
     const onAddFiles = useCallback(async (files: File[]) => {
         if (files.length === 0) {
             return;
         }
 
-        const { attachments: added, errors } = await readFilesAsAttachments(files, attachments.length);
+        const { attachments: added, errors } = await readFilesAsAttachments(files);
         if (added.length > 0) {
             setAttachments((prev) => [...prev, ...added]);
         }
-        if (errors.length > 0) {
+
+        const overBudgetWarning = added.length > 0 ? getOverBudgetWarning([...attachments, ...added]) : null;
+        const messages = [...errors, ...(overBudgetWarning ? [overBudgetWarning] : [])];
+        if (messages.length > 0) {
             toast({
-                description: `⚠️ ${errors.join(' ')}`,
+                description: `⚠️ ${messages.join(' ')}`,
                 variant: "destructive",
             });
         }
 
         // Return focus to the prompt input so the user can keep typing.
         ReactEditor.focus(editor);
-    }, [attachments.length, toast, editor]);
+    }, [attachments, toast, editor, getOverBudgetWarning]);
 
     const removeAttachment = useCallback((id: string) => {
         setAttachments((prev) => prev.filter((attachment) => attachment.id !== id));
@@ -201,18 +200,27 @@ const ChatBoxComponent = ({
 
     const { isSubmitDisabled, isSubmitDisabledReason } = useMemo((): {
         isSubmitDisabled: true,
-        isSubmitDisabledReason: "empty" | "redirecting" | "generating" | "no-language-model-selected"
+        isSubmitDisabledReason: "empty" | "too-large" | "redirecting" | "generating" | "no-language-model-selected"
     } | {
         isSubmitDisabled: false,
         isSubmitDisabledReason: undefined,
     } => {
-        if (slateContentToString(editor.children).trim().length === 0 && attachments.length === 0) {
+        const text = slateContentToString(editor.children);
+        if (text.trim().length === 0 && attachments.length === 0) {
             return {
                 isSubmitDisabled: true,
                 isSubmitDisabledReason: "empty",
             }
         }
 
+        // Single per-turn bound on the submitted text (prompt + attachments).
+        if (getSubmittedTextBytes(text, attachments) > ATTACHMENT_MAX_TURN_TEXT_BYTES) {
+            return {
+                isSubmitDisabled: true,
+                isSubmitDisabledReason: "too-large",
+            }
+        }
+
         if (isRedirecting) {
             return {
                 isSubmitDisabled: true,
@@ -240,7 +248,7 @@ const ChatBoxComponent = ({
             isSubmitDisabledReason: undefined,
         }
 
-    }, [editor.children, isRedirecting, isTurnInProgress, selectedLanguageModel, attachments.length])
+    }, [editor.children, isRedirecting, isTurnInProgress, selectedLanguageModel, attachments])
 
     const {
         requiresLogin,
@@ -261,6 +269,11 @@ const ChatBoxComponent = ({
                     description: "⚠️ You must select a language model",
                     variant: "destructive",
                 });
+            } else if (isSubmitDisabledReason === "too-large") {
+                toast({
+                    description: `⚠️ Message and attachments exceed the ${Math.round(ATTACHMENT_MAX_TURN_TEXT_BYTES / 1024)}KB per-message limit. Remove a file or shorten the text.`,
+                    variant: "destructive",
+                });
             }
 
             return;
@@ -464,7 +477,7 @@ const ChatBoxComponent = ({
                 {(isRedirecting ? submittedAttachments : attachments).length > 0 && (
                     <AttachmentTray
                         attachments={isRedirecting ? submittedAttachments : attachments}
-                        onRemove={removeAttachment}
+                        onRemove={isRedirecting ? undefined : removeAttachment}
                         className="mb-1.5"
                     />
                 )}
diff --git a/packages/web/src/features/chat/constants.ts b/packages/web/src/features/chat/constants.ts
index 95b89e26e..1306bb7a5 100644
--- a/packages/web/src/features/chat/constants.ts
+++ b/packages/web/src/features/chat/constants.ts
@@ -13,12 +13,10 @@ export const PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY = 'pendingChatSubmissio
 export const DISABLED_MCP_SERVER_IDS_LOCAL_STORAGE_KEY = 'disabledMcpServerIds';
 export const MCP_OAUTH_DRAFT_SESSION_STORAGE_KEY = 'mcpOAuthDraft';
 
-// Text attachment limits. Text is inlined into the message (and, for new
-// threads, into the sessionStorage stash), so caps are kept conservative to
-// bound `messages` JSON growth and stay well under the sessionStorage limit.
-export const ATTACHMENT_MAX_TEXT_BYTES = 256 * 1024; // 256KB per file
-export const ATTACHMENT_MAX_COUNT = 5; // per message
-export const ATTACHMENT_MAX_FILENAME_LENGTH = 200; // characters
+// Single upper bound on the total text (prompt + attachments) submitted per
+// turn; it is inlined and re-emitted every turn. ~256KB ≈ 65-85K tokens: enough
+// for a few files or a large log while leaving room for retrieval, history, and output.
+export const ATTACHMENT_MAX_TURN_TEXT_BYTES = 256 * 1024; // 256KB per turn
 
 // A plain-text paste at or above either of these thresholds is automatically
 // converted into a text attachment instead of being inserted inline
diff --git a/packages/web/src/features/chat/utils.ts b/packages/web/src/features/chat/utils.ts
index 3f6742c19..15a4907ff 100644
--- a/packages/web/src/features/chat/utils.ts
+++ b/packages/web/src/features/chat/utils.ts
@@ -2,7 +2,7 @@ import { BrowseHighlightRange, getBrowsePath } from "@/app/(app)/browse/hooks/ut
 import { CreateUIMessage, isToolUIPart, TextUIPart, UIMessagePart } from "ai";
 import type { ChatStatus, DynamicToolUIPart, ToolUIPart } from "ai";
 import { Descendant, Editor, Point, Range, Transforms } from "slate";
-import { ANSWER_TAG, ATTACHMENT_MAX_FILENAME_LENGTH, FILE_REFERENCE_PREFIX, FILE_REFERENCE_REGEX } from "./constants";
+import { ANSWER_TAG, FILE_REFERENCE_PREFIX, FILE_REFERENCE_REGEX } from "./constants";
 import {
     AttachmentData,
     CustomEditor,
@@ -417,27 +417,29 @@ export const getUserMessageAttachments = (message: Pick<SBChatMessage, 'parts'>)
         .map((part) => part.data);
 }
 
-// Formats a user message's attachments into a delimited block suitable for
-// inlining into that turn's content. Returns an empty string when there are no
-// (text) attachments. `maxBytesPerAttachment` defensively truncates each
-// attachment's text (defense-in-depth against an oversized client payload).
-export const formatAttachmentsForPrompt = (attachments: AttachmentData[], maxBytesPerAttachment?: number): string => {
+// Neutralizes `</attachment>`/`</attachments>` sequences in a body so it can't
+// close its own wrapper early. Unrelated markup (e.g. `</div>`) is left intact.
+const escapeAttachmentBody = (text: string): string => {
+    return text.replace(/<(\/attachments?>)/gi, '&lt;$1');
+}
+
+// Formats a user message's text attachments into a delimited block to inline
+// into the turn's content. Returns '' when there are none. Size is bounded at
+// submit, so nothing is truncated here.
+export const formatAttachmentsForPrompt = (attachments: AttachmentData[]): string => {
     const textAttachments = attachments.filter((attachment) => attachment.kind === 'text');
     if (textAttachments.length === 0) {
         return '';
     }
 
     const blocks = textAttachments.map((attachment) => {
-        const text = maxBytesPerAttachment !== undefined
-            ? attachment.text.slice(0, maxBytesPerAttachment)
-            : attachment.text;
-        // Defense-in-depth: keep the filename on a single line, escape quotes,
-        // and cap its length so a crafted client can't break the tag or bloat
-        // the prompt (the client also sanitizes via sanitizeFilename).
+        const text = escapeAttachmentBody(attachment.text);
+        // Keep the filename on a single line and escape quotes so it can't
+        // break out of the `filename="..."` attribute (the client also
+        // sanitizes via sanitizeFilename).
         const filename = attachment.filename
             .replace(/\s+/g, ' ')
-            .replace(/"/g, '&quot;')
-            .slice(0, ATTACHMENT_MAX_FILENAME_LENGTH);
+            .replace(/"/g, '&quot;');
         return `<attachment filename="${filename}" media-type="${attachment.mediaType}">\n${text}\n</attachment>`;
     });
 

From cb8181df5c0fbb16e429100d1a64a74e472634c2 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 15:43:55 -0700
Subject: [PATCH 19/19] pass attachments through the login/upgrade redirect

---
 .../features/chat/components/chatBox/chatBox.tsx    | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/packages/web/src/features/chat/components/chatBox/chatBox.tsx b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
index 1e6bac099..1bd33235f 100644
--- a/packages/web/src/features/chat/components/chatBox/chatBox.tsx
+++ b/packages/web/src/features/chat/components/chatBox/chatBox.tsx
@@ -282,7 +282,7 @@ const ChatBoxComponent = ({
         if (requiresLogin) {
             sessionStorage.setItem(
                 PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY,
-                JSON.stringify({ pathname, children: editor.children }),
+                JSON.stringify({ pathname, children: editor.children, attachments: attachments.map(toAttachmentData) }),
             );
             captureEvent('wa_askgh_login_wall_prompted', {});
             setIsLoginDialogOpen(true);
@@ -292,7 +292,7 @@ const ChatBoxComponent = ({
         if (requiresUpgrade) {
             sessionStorage.setItem(
                 PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY,
-                JSON.stringify({ pathname, children: editor.children }),
+                JSON.stringify({ pathname, children: editor.children, attachments: attachments.map(toAttachmentData) }),
             );
             setIsUpsellDialogOpen(true);
             return;
@@ -328,14 +328,17 @@ const ChatBoxComponent = ({
         }
 
         try {
-            const { pathname: storedPathname, children } = JSON.parse(stored) as { pathname: string; children: Descendant[] };
+            const { pathname: storedPathname, children, attachments: storedAttachments = [] } = JSON.parse(stored) as {
+                pathname: string;
+                children: Descendant[];
+                attachments?: AttachmentData[];
+            };
             if (storedPathname !== pathname) {
                 return;
             }
 
             sessionStorage.removeItem(PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY);
-            // Attachments are not persisted across the login/upgrade redirect.
-            _onSubmit(children, editor, []);
+            _onSubmit(children, editor, storedAttachments);
         } catch (error) {
             console.error('Failed to restore pending chat submission:', error);
             sessionStorage.removeItem(PENDING_CHAT_SUBMISSION_SESSION_STORAGE_KEY);