Skip to content

Commit e67acc1

Browse files
committed
fix(omni): use input_audio instead of audio_url
Fixes #54
1 parent a90a35d commit e67acc1

3 files changed

Lines changed: 188 additions & 8 deletions

File tree

packages/cli/src/commands/omni/chat.ts

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import { writeFileSync } from "fs";
2+
import { extname } from "path";
23
import {
34
defineCommand,
45
request,
56
chatEndpoint,
67
parseSSE,
78
detectOutputFormat,
9+
BailianError,
10+
ExitCode,
811
type Config,
912
type GlobalFlags,
1013
type ChatMessage,
@@ -20,6 +23,46 @@ import { resolveOutputDir, resolveCredential } from "bailian-cli-core";
2023

2124
// Voice names offered for the --voice option.
const OMNI_VOICES = ["Chelsie", "Cherry", "Ethan", "Serena", "Tina"];
2225

26+
/**
 * Maps a lowercase file extension (without the leading dot) to the `format`
 * value placed in an `input_audio` content part. Note that `m4a` is reported
 * as `aac`.
 */
const OMNI_INPUT_AUDIO_EXT: Record<string, string> = {
  wav: "wav",
  mp3: "mp3",
  amr: "amr",
  aac: "aac",
  m4a: "aac",
  ogg: "ogg",
  "3gp": "3gp",
  "3gpp": "3gpp",
};

/** Supported extensions, listed in the error messages below. */
const audioExts = Object.keys(OMNI_INPUT_AUDIO_EXT);

/**
 * Infer the input audio format from the source URL or local file path.
 *
 * For sources that start with a URL scheme (`https://`, `oss://`, ...) the
 * query string and fragment are dropped before the extension is read. Local
 * paths are used verbatim, because `?` and `#` are legal file-name characters
 * and stripping them would truncate names such as `take#2.mp3`.
 *
 * @param source - The audio location exactly as the user supplied it.
 * @returns The format mapped from the extension in `OMNI_INPUT_AUDIO_EXT`.
 * @throws BailianError with `ExitCode.USAGE` when the source has no extension
 *   or the extension is not a key of `OMNI_INPUT_AUDIO_EXT`.
 */
function inferInputAudioFormat(source: string): string {
  const looksLikeUrl = /^[a-z][a-z0-9+.-]*:\/\//i.test(source);
  const pathPart = looksLikeUrl ? source.replace(/[?#][\s\S]*$/, "") : source;
  const ext = extname(pathPart).slice(1).toLowerCase();
  if (!ext) {
    throw new BailianError(
      `Cannot infer audio format from "${source}". ` +
        `Use a file/URL whose path ends with: ${audioExts.join(", ")}.`,
      ExitCode.USAGE,
    );
  }
  // Own-property check: a plain index lookup would also find inherited members
  // (e.g. ".constructor", ".toString") and return a function as the format.
  const format = Object.prototype.hasOwnProperty.call(OMNI_INPUT_AUDIO_EXT, ext)
    ? OMNI_INPUT_AUDIO_EXT[ext]
    : undefined;
  if (!format) {
    throw new BailianError(
      `Unsupported audio extension ".${ext}" for "${source}". ` +
        `Supported extensions: ${audioExts.join(", ")}.`,
      ExitCode.USAGE,
    );
  }
  return format;
}
65+
2366
/**
2467
* Build a standard WAV file header for PCM 16-bit mono 24kHz audio.
2568
*/
@@ -55,7 +98,11 @@ export default defineCommand({
5598
{ flag: "--model <model>", description: "Model ID (default: qwen3.5-omni-plus)" },
5699
{ flag: "--system <text>", description: "System prompt" },
57100
{ flag: "--image <url>", description: "Image URL or local file (repeatable)", type: "array" },
58-
{ flag: "--audio <url>", description: "Audio URL or local file (repeatable)", type: "array" },
101+
{
102+
flag: "--audio <url>",
103+
description: "Audio URL or local file (.wav/.mp3/.amr/.aac/.m4a/.3gp/.3gpp)",
104+
type: "array",
105+
},
59106
{
60107
flag: "--video <url>",
61108
description: "Video file URL / local path, or comma-separated frame URLs",
@@ -138,7 +185,7 @@ export default defineCommand({
138185

139186
// Auto-upload local files
140187
const imageUrls: string[] = [];
141-
const audioUrls: string[] = [];
188+
const audioInputs: Array<{ source: string; data: string }> = [];
142189
const videoUrls: string[] = [];
143190

144191
const needsResolve =
@@ -151,7 +198,7 @@ export default defineCommand({
151198
}
152199
for (const u of rawAudioUrls) {
153200
const resolved = await resolveFileUrl(u, credential.token, model);
154-
audioUrls.push(resolved);
201+
audioInputs.push({ source: u, data: resolved });
155202
}
156203
for (const u of rawVideoUrls) {
157204
// Detect: comma-separated = frame list, otherwise single video URL/file
@@ -173,7 +220,7 @@ export default defineCommand({
173220
}
174221
}
175222

176-
if (imageUrls.length > 0 || audioUrls.length > 0 || videoUrls.length > 0) {
223+
if (imageUrls.length > 0 || audioInputs.length > 0 || videoUrls.length > 0) {
177224
// Find last user message and convert to multimodal content array
178225
for (let i = allMessages.length - 1; i >= 0; i--) {
179226
if (allMessages[i].role === "user") {
@@ -192,9 +239,11 @@ export default defineCommand({
192239
contentArray.push({ type: "image_url", image_url: { url } });
193240
}
194241

195-
// Add audio URLs
196-
for (const url of audioUrls) {
197-
contentArray.push({ type: "audio_url", audio_url: { url } });
242+
for (const { source, data } of audioInputs) {
243+
contentArray.push({
244+
type: "input_audio",
245+
input_audio: { data, format: inferInputAudioFormat(source) },
246+
});
198247
}
199248

200249
// Add video URLs: frame:xxx are frame list items, others are direct video URLs
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import { describe, expect, test } from "vite-plus/test";
2+
import { join } from "node:path";
3+
import {
4+
e2eLabelFromMetaUrl,
5+
isBailianE2EMediaEnabled,
6+
isDashScopeE2EReady,
7+
makeE2eOutputDir,
8+
parseStdoutJson,
9+
runCli,
10+
} from "./helpers.ts";
11+
12+
describe("e2e: omni", () => {
  // `omni --help` must exit with code 0 and print usage text on stderr.
  test("omni --help 正常退出", async () => {
    const result = await runCli(["omni", "--help"]);
    // stderr is passed as the assertion message so a failure shows the CLI output.
    expect(result.exitCode, result.stderr).toBe(0);
    expect(result.stderr).toMatch(/omni|--message|--audio|text-only/i);
  });
});
19+
20+
// Media e2e suite: skipped unless both helper checks report that the media
// e2e tests are enabled and DashScope is ready.
describe.skipIf(!isBailianE2EMediaEnabled() || !isDashScopeE2EReady())(
  "e2e: omni（DashScope 媒体）",
  () => {
    // Without --message the command prints the subcommand help and exits 0.
    test("omni 缺少 --message 时打印子命令帮助并退出 (0)", async () => {
      const { stderr, exitCode } = await runCli([
        "omni",
        "--model",
        "qwen3.5-omni-flash",
        "--non-interactive",
      ]);
      expect(exitCode).toBe(0);
      expect(stderr).toMatch(/--message|Usage:/i);
    });

    // An --audio source whose extension is not recognised is a usage error (exit 2).
    test("omni --audio 无法识别扩展名时退出为用法错误 (2)", async () => {
      const { stderr, exitCode } = await runCli([
        "omni",
        "--model",
        "qwen3.5-omni-flash",
        "--audio",
        "https://example.com/sample.flac",
        "--text-only",
        "--message",
        "这段音频在说什么？",
        "--non-interactive",
      ]);
      expect(exitCode).toBe(2);
      expect(stderr).toMatch(/Unsupported audio extension|Cannot infer audio format/i);
    });

    // --dry-run must emit an `input_audio` content part (with data + format)
    // rather than the old `audio_url` part.
    test("omni --dry-run --audio 构造 input_audio 而非 audio_url", async () => {
      const { stdout, stderr, exitCode } = await runCli([
        "omni",
        "--dry-run",
        "--model",
        "qwen3.5-omni-flash",
        "--audio",
        "https://example.com/sample.wav",
        "--text-only",
        "--message",
        "这段音频在说什么？",
        "--non-interactive",
        "--output",
        "json",
      ]);
      expect(exitCode, stderr).toBe(0);
      const data = parseStdoutJson<{
        request?: {
          messages?: Array<{
            content?: Array<{
              type?: string;
              audio_url?: unknown;
              input_audio?: { data?: string; format?: string };
            }>;
          }>;
        };
      }>(stdout);
      // Flatten every multimodal content array; non-array content contributes nothing.
      const parts = data.request?.messages?.flatMap((m) =>
        Array.isArray(m.content) ? m.content : [],
      );
      const audioPart = parts?.find((p) => p.type === "input_audio" || p.type === "audio_url");
      expect(audioPart?.type).toBe("input_audio");
      expect(audioPart?.audio_url).toBeUndefined();
      expect(audioPart?.input_audio?.data).toBe("https://example.com/sample.wav");
      expect(audioPart?.input_audio?.format).toBe("wav");
    });

    // Round trip: synthesize a short WAV clip with the speech command, then ask
    // omni to transcribe that local file.
    test("【qwen3.5-omni-flash】本地音频理解", async () => {
      const outDir = makeE2eOutputDir(e2eLabelFromMetaUrl(import.meta.url));
      const clipText = "端到端Omni音频测试";
      const clipWav = join(outDir, "e2e-omni-input.wav");

      const syn = await runCli([
        "speech",
        "synthesize",
        "--model",
        "cosyvoice-v3-flash",
        "--voice",
        "longxiaochun_v3",
        "--text",
        clipText,
        "--format",
        "wav",
        "--out",
        clipWav,
        "--non-interactive",
        "--output",
        "json",
      ]);
      expect(syn.exitCode, syn.stderr).toBe(0);

      const omni = await runCli([
        "omni",
        "--model",
        "qwen3.5-omni-flash",
        "--audio",
        clipWav,
        "--text-only",
        "--system",
        "请逐字转写用户提供的音频内容，不要添加解释。",
        "--message",
        "请转写这段音频。",
        "--non-interactive",
        "--output",
        "json",
      ]);
      expect(omni.exitCode, omni.stderr).toBe(0);
      const body = parseStdoutJson<{ content?: string }>(omni.stdout);
      // Case-insensitive: the transcription may spell the word "omni", "Omni" or "OMNI".
      expect(body.content?.replace(/\s/g, "")).toMatch(/omni/i);
    }, 180_000);
  },
);

skills/bailian-cli/reference/omni.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ Index: [index.md](index.md)
2929
| `--model <model>` | string | no | Model ID (default: qwen3.5-omni-plus) |
3030
| `--system <text>` | string | no | System prompt |
3131
| `--image <url>` | array | no | Image URL or local file (repeatable) |
32-
| `--audio <url>` | array | no | Audio URL or local file (repeatable) |
32+
| `--audio <url>` | array | no | Audio URL or local file (.wav/.mp3/.amr/.aac/.m4a/.ogg/.3gp/.3gpp; repeatable) |
3333
| `--video <url>` | array | no | Video file URL / local path, or comma-separated frame URLs |
3434
| `--voice <voice>` | string | no | Output voice (default: Cherry). Options: Chelsie, Cherry, Ethan, Serena, Tina |
3535
| `--audio-format <fmt>` | string | no | Audio output format (default: wav) |

0 commit comments

Comments
 (0)