fix(omni): use input_audio instead of audio_url

hopelynd · hopelynd · commit e67acc118f85 · 2026-06-12T17:34:41.000+08:00
Fixes #54
diff --git a/packages/cli/src/commands/omni/chat.ts b/packages/cli/src/commands/omni/chat.ts
@@ -1,10 +1,13 @@
 import { writeFileSync } from "fs";
+import { extname } from "path";
 import {
   defineCommand,
   request,
   chatEndpoint,
   parseSSE,
   detectOutputFormat,
+  BailianError,
+  ExitCode,
   type Config,
   type GlobalFlags,
   type ChatMessage,
@@ -20,6 +23,46 @@ import { resolveOutputDir, resolveCredential } from "bailian-cli-core";
 
 const OMNI_VOICES = ["Chelsie", "Cherry", "Ethan", "Serena", "Tina"];
 
+/**
+ * Lowercase file extension (no dot) -> `format` value used in the `input_audio` part; m4a maps to aac.
+ */
+const OMNI_INPUT_AUDIO_EXT: Record<string, string> = {
+  wav: "wav",
+  mp3: "mp3",
+  amr: "amr",
+  aac: "aac",
+  m4a: "aac",
+  ogg: "ogg",
+  "3gp": "3gp",
+  "3gpp": "3gpp",
+};
+
+const audioExts = Object.keys(OMNI_INPUT_AUDIO_EXT); // extension list shown in usage-error messages
+
+/**
+ * Infer the input_audio format from a URL/path extension; throws a USAGE BailianError if unknown.
+ */
+function inferInputAudioFormat(source: string): string {
+  const pathPart = source.split("?")[0].split("#")[0]; // ignore URL query string / fragment
+  const ext = extname(pathPart).slice(1).toLowerCase();
+  if (!ext) {
+    throw new BailianError(
+      `Cannot infer audio format from "${source}". ` +
+        `Use a file/URL whose path ends with: ${audioExts.join(", ")}.`,
+      ExitCode.USAGE,
+    );
+  }
+  const format = audioExts.includes(ext) ? OMNI_INPUT_AUDIO_EXT[ext] : undefined; // own keys only
+  if (!format) {
+    throw new BailianError(
+      `Unsupported audio extension ".${ext}" for "${source}". ` +
+        `Supported extensions: ${audioExts.join(", ")}.`,
+      ExitCode.USAGE,
+    );
+  }
+  return format;
+}
+
 /**
  * Build a standard WAV file header for PCM 16-bit mono 24kHz audio.
  */
@@ -55,7 +98,11 @@ export default defineCommand({
     { flag: "--model <model>", description: "Model ID (default: qwen3.5-omni-plus)" },
     { flag: "--system <text>", description: "System prompt" },
     { flag: "--image <url>", description: "Image URL or local file (repeatable)", type: "array" },
-    { flag: "--audio <url>", description: "Audio URL or local file (repeatable)", type: "array" },
+    {
+      flag: "--audio <url>",
+      description: "Audio URL or local file (.wav/.mp3/.amr/.aac/.m4a/.3gp/.3gpp)",
+      type: "array",
+    },
     {
       flag: "--video <url>",
       description: "Video file URL / local path, or comma-separated frame URLs",
@@ -138,7 +185,7 @@ export default defineCommand({
 
     // Auto-upload local files
     const imageUrls: string[] = [];
-    const audioUrls: string[] = [];
+    const audioInputs: Array<{ source: string; data: string }> = [];
     const videoUrls: string[] = [];
 
     const needsResolve =
@@ -151,7 +198,7 @@ export default defineCommand({
       }
       for (const u of rawAudioUrls) {
         const resolved = await resolveFileUrl(u, credential.token, model);
-        audioUrls.push(resolved);
+        audioInputs.push({ source: u, data: resolved });
       }
       for (const u of rawVideoUrls) {
         // Detect: comma-separated = frame list, otherwise single video URL/file
@@ -173,7 +220,7 @@ export default defineCommand({
       }
     }
 
-    if (imageUrls.length > 0 || audioUrls.length > 0 || videoUrls.length > 0) {
+    if (imageUrls.length > 0 || audioInputs.length > 0 || videoUrls.length > 0) {
       // Find last user message and convert to multimodal content array
       for (let i = allMessages.length - 1; i >= 0; i--) {
         if (allMessages[i].role === "user") {
@@ -192,9 +239,11 @@ export default defineCommand({
             contentArray.push({ type: "image_url", image_url: { url } });
           }
 
-          // Add audio URLs
-          for (const url of audioUrls) {
-            contentArray.push({ type: "audio_url", audio_url: { url } });
+          for (const { source, data } of audioInputs) {
+            contentArray.push({
+              type: "input_audio",
+              input_audio: { data, format: inferInputAudioFormat(source) },
+            });
           }
 
           // Add video URLs: frame:xxx are frame list items, others are direct video URLs
diff --git a/packages/cli/tests/e2e/omni.e2e.test.ts b/packages/cli/tests/e2e/omni.e2e.test.ts
@@ -0,0 +1,131 @@
+import { describe, expect, test } from "vite-plus/test";
+import { join } from "node:path";
+import {
+  e2eLabelFromMetaUrl,
+  isBailianE2EMediaEnabled,
+  isDashScopeE2EReady,
+  makeE2eOutputDir,
+  parseStdoutJson,
+  runCli,
+} from "./helpers.ts";
+
+describe("e2e: omni", () => {
+  test("omni --help 正常退出", async () => {
+    const { stderr, exitCode } = await runCli(["omni", "--help"]);
+    expect(exitCode, stderr).toBe(0);
+    expect(stderr).toMatch(/omni|--message|--audio|text-only/i);
+  });
+});
+
+describe.skipIf(!isBailianE2EMediaEnabled() || !isDashScopeE2EReady())(
+  "e2e: omni（DashScope 媒体）",
+  () => {
+    test("omni 缺少 --message 时打印子命令帮助并退出 (0)", async () => {
+      const { stderr, exitCode } = await runCli([
+        "omni",
+        "--model",
+        "qwen3.5-omni-flash",
+        "--non-interactive",
+      ]);
+      expect(exitCode).toBe(0);
+      expect(stderr).toMatch(/--message|Usage:/i);
+    });
+
+    test("omni --audio 无法识别扩展名时退出为用法错误 (2)", async () => {
+      const { stderr, exitCode } = await runCli([
+        "omni",
+        "--model",
+        "qwen3.5-omni-flash",
+        "--audio",
+        "https://example.com/sample.flac",
+        "--text-only",
+        "--message",
+        "这段音频在说什么？",
+        "--non-interactive",
+      ]);
+      expect(exitCode).toBe(2);
+      expect(stderr).toMatch(/Unsupported audio extension|Cannot infer audio format/i);
+    });
+
+    test("omni --dry-run --audio 构造 input_audio 而非 audio_url", async () => {
+      const { stdout, stderr, exitCode } = await runCli([
+        "omni",
+        "--dry-run",
+        "--model",
+        "qwen3.5-omni-flash",
+        "--audio",
+        "https://example.com/sample.wav",
+        "--text-only",
+        "--message",
+        "这段音频在说什么？",
+        "--non-interactive",
+        "--output",
+        "json",
+      ]);
+      expect(exitCode, stderr).toBe(0);
+      const data = parseStdoutJson<{
+        request?: {
+          messages?: Array<{
+            content?: Array<{
+              type?: string;
+              audio_url?: unknown;
+              input_audio?: { data?: string; format?: string };
+            }>;
+          }>;
+        };
+      }>(stdout);
+      const parts = data.request?.messages?.flatMap((m) =>
+        Array.isArray(m.content) ? m.content : [],
+      );
+      const audioPart = parts?.find((p) => p.type === "input_audio" || p.type === "audio_url");
+      expect(audioPart?.type).toBe("input_audio");
+      expect(audioPart?.audio_url).toBeUndefined();
+      expect(audioPart?.input_audio?.data).toBe("https://example.com/sample.wav");
+      expect(audioPart?.input_audio?.format).toBe("wav");
+    });
+
+    test("【qwen3.5-omni-flash】本地音频理解", async () => {
+      const outDir = makeE2eOutputDir(e2eLabelFromMetaUrl(import.meta.url));
+      const clipText = "端到端Omni音频测试"; // matched case-insensitively against the transcript
+      const clipWav = join(outDir, "e2e-omni-input.wav");
+
+      const syn = await runCli([
+        "speech",
+        "synthesize",
+        "--model",
+        "cosyvoice-v3-flash",
+        "--voice",
+        "longxiaochun_v3",
+        "--text",
+        clipText,
+        "--format",
+        "wav",
+        "--out",
+        clipWav,
+        "--non-interactive",
+        "--output",
+        "json",
+      ]);
+      expect(syn.exitCode, syn.stderr).toBe(0); // synthesis must succeed before omni runs
+
+      const omni = await runCli([
+        "omni",
+        "--model",
+        "qwen3.5-omni-flash",
+        "--audio",
+        clipWav,
+        "--text-only",
+        "--system",
+        "请逐字转写用户提供的音频内容，不要添加解释。",
+        "--message",
+        "请转写这段音频。",
+        "--non-interactive",
+        "--output",
+        "json",
+      ]);
+      expect(omni.exitCode, omni.stderr).toBe(0);
+      const body = parseStdoutJson<{ content?: string }>(omni.stdout);
+      expect(body.content?.replace(/\s/g, "")).toMatch(new RegExp(clipText, "i"));
+    }, 180_000);
+  },
+);
diff --git a/skills/bailian-cli/reference/omni.md b/skills/bailian-cli/reference/omni.md
@@ -29,7 +29,7 @@ Index: [index.md](index.md)
 | `--model <model>`      | string  | no       | Model ID (default: qwen3.5-omni-plus)                                         |
 | `--system <text>`      | string  | no       | System prompt                                                                 |
 | `--image <url>`        | array   | no       | Image URL or local file (repeatable)                                          |
-| `--audio <url>`        | array   | no       | Audio URL or local file (repeatable)                                          |
+| `--audio <url>`        | array   | no       | Audio URL or local file (.wav/.mp3/.amr/.aac/.m4a/.ogg/.3gp/.3gpp)            |
 | `--video <url>`        | array   | no       | Video file URL / local path, or comma-separated frame URLs                    |
 | `--voice <voice>`      | string  | no       | Output voice (default: Cherry). Options: Chelsie, Cherry, Ethan, Serena, Tina |
 | `--audio-format <fmt>` | string  | no       | Audio output format (default: wav)                                            |