Skip to content

Commit b8292ae

Browse files
authored
Merge pull request #105 from raylanlin/fix/speech-subtitles
fix(speech): correctly enable and download subtitles for TTS
2 parents a8d47ef + 7aea012 commit b8292ae

5 files changed

Lines changed: 103 additions & 14 deletions

File tree

skill/SKILL.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ mmx speech synthesize --text <text> [flags]
178178
| `--bitrate <bps>` | number | Bitrate (default: 128000) |
179179
| `--channels <n>` | number | Audio channels (default: 1) |
180180
| `--language <code>` | string | Language boost |
181-
| `--subtitles` | boolean | Include subtitle timing data |
181+
| `--subtitles` | boolean | Download and save subtitles as an `.srt` file (alongside the `--out` audio file). The selected model must support subtitles. |
182182
| `--pronunciation <from/to>` | string, repeatable | Custom pronunciation |
183183
| `--sound-effect <effect>` | string | Add sound effect |
184184
| `--out <path>` | string | Save audio to file |
@@ -188,6 +188,9 @@ mmx speech synthesize --text <text> [flags]
188188
mmx speech synthesize --text "Hello world" --out hello.mp3 --quiet
189189
# stdout: hello.mp3
190190

191+
mmx speech synthesize --text "Hello" --subtitles --out hello.mp3
192+
# saves hello.mp3 + hello.srt (SRT subtitle file)
193+
191194
echo "Breaking news." | mmx speech synthesize --text-file - --out news.mp3
192195
```
193196

src/commands/speech/synthesize.ts

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { speechEndpoint } from '../../client/endpoints';
66
import { parseSSE } from '../../client/stream';
77
import { detectOutputFormat, formatOutput } from '../../output/formatter';
88
import { saveAudioOutput } from '../../output/audio';
9+
import { writeFileSync } from 'fs';
910
import { readTextFromPathOrStdin } from '../../utils/fs';
1011
import type { Config } from '../../config/schema';
1112
import type { GlobalFlags } from '../../types/flags';
@@ -37,6 +38,7 @@ export default defineCommand({
3738
examples: [
3839
'mmx speech synthesize --text "Hello, world!"',
3940
'mmx speech synthesize --text "Hello, world!" --out hello.mp3',
41+
'mmx speech synthesize --text "Hello" --subtitles --out hello.mp3',
4042
'echo "Breaking news." | mmx speech synthesize --text-file - --out news.mp3',
4143
'mmx speech synthesize --text "Stream" --stream | mpv --no-terminal -',
4244
],
@@ -85,7 +87,7 @@ export default defineCommand({
8587
};
8688

8789
if (flags.language) body.language_boost = flags.language as string;
88-
if (flags.subtitles) body.subtitle = true;
90+
if (flags.subtitles) body.subtitle_enable = true; // Correct API parameter name
8991

9092
if (flags.pronunciation) {
9193
body.pronunciation_dict = (flags.pronunciation as string[]).map(p => {
@@ -122,5 +124,52 @@ export default defineCommand({
122124

123125
if (!config.quiet) process.stderr.write(`[Model: ${model}]\n`);
124126
saveAudioOutput(response, outPath, format, config.quiet);
127+
128+
// Download and save subtitle file when --subtitles is requested
129+
if (flags.subtitles && response.data.subtitle_file) {
130+
try {
131+
// Download the subtitle JSON file from the URL
132+
const subtitleRes = await fetch(response.data.subtitle_file);
133+
if (!subtitleRes.ok) {
134+
throw new CLIError(`Failed to download subtitle file: ${subtitleRes.status}`, ExitCode.GENERAL);
135+
}
136+
// API returns a flat array, not { subtitles: [...] }
137+
const subtitleArray = await subtitleRes.json() as Array<{ text: string; time_begin: number; time_end: number }>;
138+
139+
if (subtitleArray?.length) {
140+
// Convert to SRT format (API returns time in milliseconds)
141+
const subtitlePath = outPath.replace(/\.[^.]+$/, '') + '.srt';
142+
const srtContent = subtitleArray
143+
.map((s, i) => {
144+
// Format a millisecond offset (used as returned by the API) as an SRT timestamp: HH:MM:SS,mmm
145+
const fmt = (ms: number) => {
146+
const h = String(Math.floor(ms / 3600000)).padStart(2, '0');
147+
const m = String(Math.floor((ms % 3600000) / 60000)).padStart(2, '0');
148+
const sec = String(Math.floor((ms % 60000) / 1000)).padStart(2, '0');
149+
const mil = String(Math.round(ms % 1000)).padStart(3, '0');
150+
return `${h}:${m}:${sec},${mil}`;
151+
};
152+
return `${i + 1}\n${fmt(s.time_begin)} --> ${fmt(s.time_end)}\n${s.text}`;
153+
})
154+
.join('\n\n');
155+
writeFileSync(subtitlePath, srtContent, 'utf-8');
156+
if (!config.quiet) {
157+
console.log(formatOutput({ subtitles: subtitlePath }, format));
158+
} else {
159+
console.log(subtitlePath);
160+
}
161+
}
162+
} catch (err) {
163+
// Non-fatal: log a warning (suppressed when quiet) but don't fail the whole synthesis
164+
if (!config.quiet) {
165+
process.stderr.write(`Warning: failed to download subtitles: ${(err as Error).message}\n`);
166+
}
167+
}
168+
} else if (flags.subtitles && !response.data.subtitle_file) {
169+
// Warn if --subtitles was requested but API didn't return subtitle_file
170+
if (!config.quiet) {
171+
process.stderr.write(`Warning: subtitles requested but not returned by API\n`);
172+
}
173+
}
125174
},
126175
});

src/types/api.ts

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -108,15 +108,15 @@ export interface SpeechRequest {
108108
pronunciation_dict?: Array<{ tone: string; text: string }>;
109109
output_format?: 'url' | 'hex';
110110
stream?: boolean;
111-
subtitle?: boolean;
111+
subtitle_enable?: boolean; // Correct API parameter name (not 'subtitle')
112112
}
113113

114114
export interface SpeechResponse {
115115
base_resp: BaseResp;
116116
data: {
117117
audio?: string; // hex-encoded audio data
118118
audio_url?: string;
119-
subtitle_info?: SubtitleInfo;
119+
subtitle_file?: string; // URL to download subtitle JSON file (when subtitle_enable=true)
120120
status: number;
121121
};
122122
extra_info?: {
@@ -129,14 +129,6 @@ export interface SpeechResponse {
129129
};
130130
}
131131

132-
export interface SubtitleInfo {
133-
subtitles: Array<{
134-
text: string;
135-
start_time: number;
136-
end_time: number;
137-
}>;
138-
}
139-
140132
// ---- Voice List ----
141133

142134
export interface SystemVoiceInfo {

test/auth/timeout-fix.test.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ describe('refreshAccessToken: timeout and error handling', () => {
124124
// We test the real function against a mock server via a wrapper
125125
// that overrides the fetch to hit our local server instead.
126126
const origFetch = globalThis.fetch;
127-
globalThis.fetch = async (input: RequestInfo | URL, init?: RequestInit) => {
127+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
128+
(globalThis as any).fetch = async (input: RequestInfo | URL, init?: RequestInit) => {
128129
const url = typeof input === 'string' ? input : input.toString();
129130
if (url.includes('oauth/token')) {
130131
return origFetch(`${server.url}/v1/oauth/token`, init);
@@ -156,7 +157,8 @@ describe('refreshAccessToken: timeout and error handling', () => {
156157

157158
const mod = await import('../../src/auth/refresh');
158159
const origFetch = globalThis.fetch;
159-
globalThis.fetch = async (input: RequestInfo | URL, init?: RequestInit) => {
160+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
161+
(globalThis as any).fetch = async (input: RequestInfo | URL, init?: RequestInit) => {
160162
const url = typeof input === 'string' ? input : input.toString();
161163
if (url.includes('oauth/token')) {
162164
return origFetch(`${server.url}/v1/oauth/token`, init);

test/commands/speech/synthesize.test.ts

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,4 +159,47 @@ describe('speech synthesize command', () => {
159159
console.log = originalLog;
160160
}
161161
});
162+
163+
it('--subtitles sets subtitle_enable in dry-run output', async () => {
164+
const config = {
165+
apiKey: 'test-key',
166+
region: 'global' as const,
167+
baseUrl: 'https://api.mmx.io',
168+
output: 'json' as const,
169+
timeout: 10,
170+
verbose: false,
171+
quiet: false,
172+
noColor: true,
173+
yes: false,
174+
dryRun: true,
175+
nonInteractive: true,
176+
async: false,
177+
};
178+
179+
const originalLog = console.log;
180+
let output = '';
181+
console.log = (msg: string) => { output += msg; };
182+
183+
try {
184+
await synthesizeCommand.execute(config, {
185+
text: 'Hello',
186+
subtitles: true,
187+
quiet: false,
188+
verbose: false,
189+
noColor: true,
190+
yes: false,
191+
dryRun: true,
192+
help: false,
193+
nonInteractive: true,
194+
async: false,
195+
});
196+
197+
const parsed = JSON.parse(output);
198+
expect(parsed.request.subtitle_enable).toBe(true);
199+
// Verify the old incorrect parameter name is NOT used
200+
expect(parsed.request.subtitle).toBeUndefined();
201+
} finally {
202+
console.log = originalLog;
203+
}
204+
});
162205
});

0 commit comments

Comments
 (0)