Skip to content

Commit 18b05d2

Browse files
committed
fix(execution,connectors): offload large function inputs; harden KB connector size limits
Addresses a class of 10 MB limit failures:
- executor/variables: offload over-budget function block-output context values to durable large-value refs (lazy `sim.values.read`) so JS function blocks can merge medium files without exceeding the 10 MB inter-block request-body cap.
- connectors: stream downloads via `readBodyWithLimit` (memory-safe), and surface oversized files as visible `failed` KB documents instead of silently dropping them — listing-time for github/s3/dropbox/onedrive/sharepoint, fetch-time for gitlab/azure/google-drive via a shared `ConnectorFileTooLargeError`. Raise the per-file cap from a hardcoded 10 MB to the canonical 100 MB KB document limit (`CONNECTOR_MAX_FILE_BYTES`), except Google Drive's export path (Google's hard 10 MB export-API limit).
- sync-engine: `classifyExternalDoc` + bulk `skipDocuments` (failed rows with a reason, excluded from retry), byte-bounded batch concurrency to cap peak worker memory at the raised cap, and a `metadata.fileSize ?? size` fallback.
1 parent cbd3d22 commit 18b05d2

15 files changed

Lines changed: 1057 additions & 106 deletions

File tree

apps/sim/connectors/azure-devops/azure-devops.ts

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,15 @@ import { getErrorMessage, toError } from '@sim/utils/errors'
33
import { AzureDevOpsIcon } from '@/components/icons'
44
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
55
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
6-
import { htmlToPlainText, joinTagArray, parseTagDate, readBodyWithLimit } from '@/connectors/utils'
6+
import {
7+
CONNECTOR_MAX_FILE_BYTES,
8+
htmlToPlainText,
9+
joinTagArray,
10+
markSkipped,
11+
parseTagDate,
12+
readBodyWithLimit,
13+
sizeLimitSkipReason,
14+
} from '@/connectors/utils'
715

816
const logger = createLogger('AzureDevOpsConnector')
917

@@ -30,7 +38,7 @@ const FILE_BATCH_SIZE = 100
3038
* and aborts (returning null) the moment the cap is exceeded. Larger files are
3139
* skipped without being fully buffered.
3240
*/
33-
const MAX_FILE_SIZE = 10 * 1024 * 1024
41+
const MAX_FILE_SIZE = CONNECTOR_MAX_FILE_BYTES
3442
/** Bytes sniffed for a NUL byte when detecting binary files (matches git's heuristic). */
3543
const BINARY_SNIFF_BYTES = 8000
3644
/**
@@ -1090,7 +1098,27 @@ async function getFileDocument(
10901098
const buffer = await readBodyWithLimit(contentResponse, MAX_FILE_SIZE)
10911099
if (buffer === null) {
10921100
logger.info('Skipping oversized Azure DevOps file', { path })
1093-
return null
1101+
const skippedTitle = path.split('/').filter(Boolean).pop() || path
1102+
return markSkipped(
1103+
{
1104+
externalId,
1105+
title: skippedTitle,
1106+
content: '',
1107+
mimeType: 'text/plain',
1108+
sourceUrl: buildFileSourceUrl(repo?.webUrl, branch, path),
1109+
contentHash: buildFileContentHash(repoId, item.objectId),
1110+
metadata: {
1111+
kind: 'file',
1112+
organization,
1113+
project,
1114+
repository: repo?.name ?? '',
1115+
repositoryId: repoId,
1116+
branch,
1117+
path,
1118+
},
1119+
},
1120+
sizeLimitSkipReason(MAX_FILE_SIZE)
1121+
)
10941122
}
10951123
if (isBinaryBuffer(buffer)) {
10961124
logger.info('Skipping binary Azure DevOps file', { path })

apps/sim/connectors/dropbox/dropbox.ts

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,16 @@ import { getErrorMessage, toError } from '@sim/utils/errors'
33
import { DropboxIcon } from '@/components/icons'
44
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
55
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
6-
import { htmlToPlainText, parseTagDate } from '@/connectors/utils'
6+
import {
7+
CONNECTOR_MAX_FILE_BYTES,
8+
ConnectorFileTooLargeError,
9+
htmlToPlainText,
10+
markSkipped,
11+
parseTagDate,
12+
readBodyWithLimit,
13+
sizeLimitSkipReason,
14+
stubOrSkipBySize,
15+
} from '@/connectors/utils'
716

817
const logger = createLogger('DropboxConnector')
918

@@ -23,7 +32,7 @@ const SUPPORTED_EXTENSIONS = new Set([
2332
'.tsv',
2433
])
2534

26-
const MAX_FILE_SIZE = 10 * 1024 * 1024 // 10 MB
35+
const MAX_FILE_SIZE = CONNECTOR_MAX_FILE_BYTES
2736

2837
interface DropboxFileEntry {
2938
'.tag': 'file' | 'folder' | 'deleted'
@@ -44,16 +53,18 @@ interface DropboxListFolderResponse {
4453
has_more: boolean
4554
}
4655

47-
function isSupportedFile(entry: DropboxFileEntry): boolean {
48-
if (entry['.tag'] !== 'file') return false
49-
if (entry.is_downloadable === false) return false
50-
if (entry.size && entry.size > MAX_FILE_SIZE) return false
51-
52-
const name = entry.name.toLowerCase()
53-
const dotIndex = name.lastIndexOf('.')
56+
function hasSupportedExtension(name: string): boolean {
57+
const lower = name.toLowerCase()
58+
const dotIndex = lower.lastIndexOf('.')
5459
if (dotIndex === -1) return false
60+
return SUPPORTED_EXTENSIONS.has(lower.slice(dotIndex))
61+
}
5562

56-
return SUPPORTED_EXTENSIONS.has(name.slice(dotIndex))
63+
/** A downloadable file with a supported extension, regardless of size. */
64+
function isDownloadableFile(entry: DropboxFileEntry): boolean {
65+
return (
66+
entry['.tag'] === 'file' && entry.is_downloadable !== false && hasSupportedExtension(entry.name)
67+
)
5768
}
5869

5970
async function downloadFileContent(accessToken: string, filePath: string): Promise<string> {
@@ -69,7 +80,15 @@ async function downloadFileContent(accessToken: string, filePath: string): Promi
6980
throw new Error(`Failed to download file ${filePath}: ${response.status}`)
7081
}
7182

72-
const text = await response.text()
83+
// Stream with a hard byte cap so a file whose listing metadata under-reported
84+
// (or omitted) its size can never be fully buffered into memory. Oversize raises
85+
// so getDocument can surface it as a skipped (failed) row rather than dropping it.
86+
const buffer = await readBodyWithLimit(response, MAX_FILE_SIZE)
87+
if (!buffer) {
88+
throw new ConnectorFileTooLargeError(MAX_FILE_SIZE)
89+
}
90+
91+
const text = buffer.toString('utf8')
7392

7493
if (filePath.endsWith('.html') || filePath.endsWith('.htm')) {
7594
return htmlToPlainText(text)
@@ -190,12 +209,16 @@ export const dropboxConnector: ConnectorConfig = {
190209
data = await response.json()
191210
}
192211

193-
const supportedFiles = data.entries.filter(isSupportedFile)
212+
// Keep oversized files and surface them as skipped (failed) documents instead
213+
// of dropping them silently at listing time.
214+
const candidateFiles = data.entries.filter(isDownloadableFile)
194215

195216
const maxFiles = sourceConfig.maxFiles ? Number(sourceConfig.maxFiles) : 0
196217
const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0
197218

198-
let documents = supportedFiles.map(fileToStub)
219+
let documents = candidateFiles.map((entry) =>
220+
stubOrSkipBySize(fileToStub(entry), entry.size, MAX_FILE_SIZE)
221+
)
199222

200223
if (maxFiles > 0) {
201224
const remaining = maxFiles - previouslyFetched
@@ -238,12 +261,24 @@ export const dropboxConnector: ConnectorConfig = {
238261

239262
const entry = (await response.json()) as DropboxFileEntry
240263

241-
if (!isSupportedFile(entry)) return null
264+
if (!isDownloadableFile(entry)) return null
265+
266+
const stub = fileToStub(entry)
267+
if (entry.size && entry.size > MAX_FILE_SIZE) {
268+
return markSkipped(stub, sizeLimitSkipReason(MAX_FILE_SIZE))
269+
}
242270

243-
const content = await downloadFileContent(accessToken, entry.path_lower)
271+
let content: string
272+
try {
273+
content = await downloadFileContent(accessToken, entry.path_lower)
274+
} catch (error) {
275+
if (error instanceof ConnectorFileTooLargeError) {
276+
return markSkipped(stub, sizeLimitSkipReason(error.limitBytes))
277+
}
278+
throw error
279+
}
244280
if (!content.trim()) return null
245281

246-
const stub = fileToStub(entry)
247282
return { ...stub, content, contentDeferred: false }
248283
} catch (error) {
249284
logger.warn(`Failed to fetch document ${externalId}`, {

apps/sim/connectors/github/github.ts

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,20 @@ import { getErrorMessage, toError } from '@sim/utils/errors'
33
import { GithubIcon } from '@/components/icons'
44
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
55
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
6-
import { parseTagDate } from '@/connectors/utils'
6+
import {
7+
CONNECTOR_MAX_FILE_BYTES,
8+
markSkipped,
9+
parseTagDate,
10+
sizeLimitSkipReason,
11+
stubOrSkipBySize,
12+
} from '@/connectors/utils'
713

814
const logger = createLogger('GitHubConnector')
915

1016
const GITHUB_API_URL = 'https://api.github.com'
1117
const BATCH_SIZE = 30
1218
const GIT_SHA_PREFIX = 'git-sha:'
13-
const MAX_FILE_SIZE = 10 * 1024 * 1024 // 10 MB
19+
const MAX_FILE_SIZE = CONNECTOR_MAX_FILE_BYTES
1420
const BINARY_SNIFF_BYTES = 8000
1521

1622
/**
@@ -245,11 +251,11 @@ export const githubConnector: ConnectorConfig = {
245251
} else {
246252
const tree = await fetchTree(accessToken, owner, repo, branch)
247253

248-
// Filter by path prefix, extensions, and size
254+
// Filter by path prefix and extensions. Oversized files are kept here and
255+
// surfaced as skipped (failed) documents at stub time so they stay visible.
249256
const filtered = tree.filter((item) => {
250257
if (pathPrefix && !item.path.startsWith(pathPrefix)) return false
251258
if (!matchesExtension(item.path, extSet)) return false
252-
if (typeof item.size === 'number' && item.size > MAX_FILE_SIZE) return false
253259
return true
254260
})
255261

@@ -271,7 +277,9 @@ export const githubConnector: ConnectorConfig = {
271277
batchSize: batch.length,
272278
})
273279

274-
const documents = batch.map((item) => treeItemToStub(owner, repo, branch, item))
280+
const documents = batch.map((item) =>
281+
stubOrSkipBySize(treeItemToStub(owner, repo, branch, item), item.size, MAX_FILE_SIZE)
282+
)
275283

276284
const nextOffset = offset + BATCH_SIZE
277285
const hasMore = nextOffset < capped.length
@@ -329,7 +337,24 @@ export const githubConnector: ConnectorConfig = {
329337
size,
330338
limit: MAX_FILE_SIZE,
331339
})
332-
return null
340+
return markSkipped(
341+
{
342+
externalId,
343+
title: path.split('/').pop() || path,
344+
content: '',
345+
mimeType: 'text/plain',
346+
sourceUrl: `https://github.com/${owner}/${repo}/blob/${branch.split('/').map(encodeURIComponent).join('/')}/${path.split('/').map(encodeURIComponent).join('/')}`,
347+
contentHash: `${GIT_SHA_PREFIX}${data.sha as string}`,
348+
metadata: {
349+
path,
350+
sha: data.sha as string,
351+
size,
352+
branch,
353+
repository: `${owner}/${repo}`,
354+
},
355+
},
356+
sizeLimitSkipReason(MAX_FILE_SIZE)
357+
)
333358
}
334359

335360
const rawContent = (data.content as string) || ''

apps/sim/connectors/gitlab/gitlab.ts

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,21 @@ import { isSameOrigin } from '@/lib/core/utils/validation'
66
import { secureFetchWithRetry } from '@/lib/knowledge/documents/secure-fetch.server'
77
import { VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
88
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
9-
import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils'
9+
import {
10+
CONNECTOR_MAX_FILE_BYTES,
11+
computeContentHash,
12+
joinTagArray,
13+
markSkipped,
14+
parseTagDate,
15+
sizeLimitSkipReason,
16+
} from '@/connectors/utils'
1017

1118
const logger = createLogger('GitLabConnector')
1219

1320
const DEFAULT_HOST = 'gitlab.com'
1421
const PAGE_SIZE = 100
1522
/** Max repository file size to index. Larger blobs are skipped. */
16-
const MAX_FILE_SIZE = 10 * 1024 * 1024
23+
const MAX_FILE_SIZE = CONNECTOR_MAX_FILE_BYTES
1724
/** Bytes sniffed for NUL when detecting binary files (matches git's heuristic). */
1825
const BINARY_SNIFF_BYTES = 8000
1926

@@ -324,9 +331,25 @@ function fileToDocument(
324331
const blobSha = file.blob_id?.trim()
325332
if (!blobSha) return null
326333

334+
const title = path.split('/').pop() || path
335+
const skippedForSize = (size: number): ExternalDocument => {
336+
logger.info('Skipping oversized GitLab file', { path, size })
337+
return markSkipped(
338+
{
339+
externalId: `${FILE_PREFIX}${path}`,
340+
title,
341+
content: '',
342+
mimeType: 'text/plain',
343+
sourceUrl: buildFileSourceUrl(apiBase, encodedProject, host, projectPath, ref, path),
344+
contentHash: buildFileContentHash(encodedProject, path, blobSha),
345+
metadata: { contentType: 'file', title, path, size },
346+
},
347+
sizeLimitSkipReason(MAX_FILE_SIZE)
348+
)
349+
}
350+
327351
if (typeof file.size === 'number' && file.size > MAX_FILE_SIZE) {
328-
logger.info('Skipping oversized GitLab file', { path, size: file.size })
329-
return null
352+
return skippedForSize(file.size)
330353
}
331354

332355
const raw = typeof file.content === 'string' ? file.content : ''
@@ -336,12 +359,10 @@ function fileToDocument(
336359
return null
337360
}
338361
if (buffer.byteLength > MAX_FILE_SIZE) {
339-
logger.info('Skipping oversized GitLab file', { path, size: buffer.byteLength })
340-
return null
362+
return skippedForSize(buffer.byteLength)
341363
}
342364

343365
const content = buffer.toString('utf8')
344-
const title = path.split('/').pop() || path
345366
const body = composeBody(title, content)
346367
if (!body.trim()) return null
347368

apps/sim/connectors/google-drive/google-drive.ts

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,16 @@ import { getErrorMessage, toError } from '@sim/utils/errors'
33
import { GoogleDriveIcon } from '@/components/icons'
44
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
55
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
6-
import { htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils'
6+
import {
7+
CONNECTOR_MAX_FILE_BYTES,
8+
ConnectorFileTooLargeError,
9+
htmlToPlainText,
10+
joinTagArray,
11+
markSkipped,
12+
parseTagDate,
13+
readBodyWithLimit,
14+
sizeLimitSkipReason,
15+
} from '@/connectors/utils'
716

817
const logger = createLogger('GoogleDriveConnector')
918

@@ -22,7 +31,9 @@ const SUPPORTED_TEXT_MIME_TYPES = [
2231
'application/xml',
2332
]
2433

25-
const MAX_EXPORT_SIZE = 10 * 1024 * 1024 // 10 MB (Google export limit)
34+
// Google Drive's `files.export` API rejects exports over 10 MB (exportSizeLimitExceeded),
35+
// so this is a hard external limit for Google Workspace docs — not the connector cap.
36+
const MAX_EXPORT_SIZE = 10 * 1024 * 1024
2637

2738
function isGoogleWorkspaceFile(mimeType: string): boolean {
2839
return mimeType in GOOGLE_WORKSPACE_MIME_TYPES
@@ -50,10 +61,22 @@ async function exportGoogleWorkspaceFile(
5061
})
5162

5263
if (!response.ok) {
64+
// Google rejects exports over its 10 MB limit with a 403 exportSizeLimitExceeded
65+
// before streaming any bytes — surface that as an oversize skip, not a hard error.
66+
if (response.status === 403) {
67+
const body = await response.text().catch(() => '')
68+
if (body.includes('exportSizeLimitExceeded')) {
69+
throw new ConnectorFileTooLargeError(MAX_EXPORT_SIZE)
70+
}
71+
}
5372
throw new Error(`Failed to export file ${fileId}: ${response.status}`)
5473
}
5574

56-
return response.text()
75+
const buffer = await readBodyWithLimit(response, MAX_EXPORT_SIZE)
76+
if (!buffer) {
77+
throw new ConnectorFileTooLargeError(MAX_EXPORT_SIZE)
78+
}
79+
return buffer.toString('utf8')
5780
}
5881

5982
async function downloadTextFile(accessToken: string, fileId: string): Promise<string> {
@@ -68,15 +91,14 @@ async function downloadTextFile(accessToken: string, fileId: string): Promise<st
6891
throw new Error(`Failed to download file ${fileId}: ${response.status}`)
6992
}
7093

71-
const text = await response.text()
72-
if (Buffer.byteLength(text, 'utf8') > MAX_EXPORT_SIZE) {
73-
logger.warn(`File exceeds ${MAX_EXPORT_SIZE} bytes, truncating`)
74-
const buf = Buffer.from(text, 'utf8')
75-
let end = MAX_EXPORT_SIZE
76-
while (end > 0 && (buf[end] & 0xc0) === 0x80) end--
77-
return buf.subarray(0, end).toString('utf8')
94+
// Stream with a hard byte cap so a file with missing/under-reported listing
95+
// size metadata is never fully buffered into memory. Oversized files raise
96+
// DriveFileTooLargeError so getDocument can surface them as skipped (failed) rows.
97+
const buffer = await readBodyWithLimit(response, CONNECTOR_MAX_FILE_BYTES)
98+
if (!buffer) {
99+
throw new ConnectorFileTooLargeError(CONNECTOR_MAX_FILE_BYTES)
78100
}
79-
return text
101+
return buffer.toString('utf8')
80102
}
81103

82104
async function fetchFileContent(
@@ -327,6 +349,10 @@ export const googleDriveConnector: ConnectorConfig = {
327349
const stub = fileToStub(file)
328350
return { ...stub, content, contentDeferred: false }
329351
} catch (error) {
352+
if (error instanceof ConnectorFileTooLargeError) {
353+
logger.info('Skipping oversized Google Drive file', { fileId: file.id, name: file.name })
354+
return markSkipped(fileToStub(file), sizeLimitSkipReason(error.limitBytes))
355+
}
330356
logger.warn(`Failed to fetch content for file: ${file.name} (${file.id})`, {
331357
error: toError(error).message,
332358
})

0 commit comments

Comments
 (0)