simstudioai
diff --git a/‎apps/sim/connectors/azure-devops/azure-devops.ts‎
Lines changed: 31 additions & 3 deletions b/‎apps/sim/connectors/azure-devops/azure-devops.ts‎
Lines changed: 31 additions & 3 deletions
diff --git a/‎apps/sim/connectors/dropbox/dropbox.ts‎
Lines changed: 51 additions & 16 deletions b/‎apps/sim/connectors/dropbox/dropbox.ts‎
Lines changed: 51 additions & 16 deletions
diff --git a/‎apps/sim/connectors/github/github.ts‎
Lines changed: 31 additions & 6 deletions b/‎apps/sim/connectors/github/github.ts‎
Lines changed: 31 additions & 6 deletions
diff --git a/‎apps/sim/connectors/gitlab/gitlab.ts‎
Lines changed: 28 additions & 7 deletions b/‎apps/sim/connectors/gitlab/gitlab.ts‎
Lines changed: 28 additions & 7 deletions
diff --git a/‎apps/sim/connectors/google-drive/google-drive.ts‎
Lines changed: 37 additions & 11 deletions b/‎apps/sim/connectors/google-drive/google-drive.ts‎
Lines changed: 37 additions & 11 deletions
@@ -3,7 +3,15 @@ import { getErrorMessage, toError } from '@sim/utils/errors'
 import { AzureDevOpsIcon } from '@/components/icons'
 import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
 import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
-import { htmlToPlainText, joinTagArray, parseTagDate, readBodyWithLimit } from '@/connectors/utils'
+import {
+  CONNECTOR_MAX_FILE_BYTES,
+  htmlToPlainText,
+  joinTagArray,
+  markSkipped,
+  parseTagDate,
+  readBodyWithLimit,
+  sizeLimitSkipReason,
+} from '@/connectors/utils'
 
 const logger = createLogger('AzureDevOpsConnector')
 
@@ -30,7 +38,7 @@ const FILE_BATCH_SIZE = 100
  * and aborts (returning null) the moment the cap is exceeded. Larger files are
  * skipped without being fully buffered.
  */
-const MAX_FILE_SIZE = 10 * 1024 * 1024
+const MAX_FILE_SIZE = CONNECTOR_MAX_FILE_BYTES
 /** Bytes sniffed for a NUL byte when detecting binary files (matches git's heuristic). */
 const BINARY_SNIFF_BYTES = 8000
 /**
@@ -1090,7 +1098,27 @@ async function getFileDocument(
   const buffer = await readBodyWithLimit(contentResponse, MAX_FILE_SIZE)
   if (buffer === null) {
     logger.info('Skipping oversized Azure DevOps file', { path })
-    return null
+    const skippedTitle = path.split('/').filter(Boolean).pop() || path
+    return markSkipped(
+      {
+        externalId,
+        title: skippedTitle,
+        content: '',
+        mimeType: 'text/plain',
+        sourceUrl: buildFileSourceUrl(repo?.webUrl, branch, path),
+        contentHash: buildFileContentHash(repoId, item.objectId),
+        metadata: {
+          kind: 'file',
+          organization,
+          project,
+          repository: repo?.name ?? '',
+          repositoryId: repoId,
+          branch,
+          path,
+        },
+      },
+      sizeLimitSkipReason(MAX_FILE_SIZE)
+    )
   }
   if (isBinaryBuffer(buffer)) {
     logger.info('Skipping binary Azure DevOps file', { path })
 
@@ -3,7 +3,16 @@ import { getErrorMessage, toError } from '@sim/utils/errors'
 import { DropboxIcon } from '@/components/icons'
 import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
 import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
-import { htmlToPlainText, parseTagDate } from '@/connectors/utils'
+import {
+  CONNECTOR_MAX_FILE_BYTES,
+  ConnectorFileTooLargeError,
+  htmlToPlainText,
+  markSkipped,
+  parseTagDate,
+  readBodyWithLimit,
+  sizeLimitSkipReason,
+  stubOrSkipBySize,
+} from '@/connectors/utils'
 
 const logger = createLogger('DropboxConnector')
 
@@ -23,7 +32,7 @@ const SUPPORTED_EXTENSIONS = new Set([
   '.tsv',
 ])
 
-const MAX_FILE_SIZE = 10 * 1024 * 1024 // 10 MB
+const MAX_FILE_SIZE = CONNECTOR_MAX_FILE_BYTES
 
 interface DropboxFileEntry {
   '.tag': 'file' | 'folder' | 'deleted'
@@ -44,16 +53,18 @@ interface DropboxListFolderResponse {
   has_more: boolean
 }
 
-function isSupportedFile(entry: DropboxFileEntry): boolean {
-  if (entry['.tag'] !== 'file') return false
-  if (entry.is_downloadable === false) return false
-  if (entry.size && entry.size > MAX_FILE_SIZE) return false
-
-  const name = entry.name.toLowerCase()
-  const dotIndex = name.lastIndexOf('.')
+function hasSupportedExtension(name: string): boolean {
+  const lower = name.toLowerCase()
+  const dotIndex = lower.lastIndexOf('.')
   if (dotIndex === -1) return false
+  return SUPPORTED_EXTENSIONS.has(lower.slice(dotIndex))
+}
 
-  return SUPPORTED_EXTENSIONS.has(name.slice(dotIndex))
+/** A downloadable file with a supported extension, regardless of size. */
+function isDownloadableFile(entry: DropboxFileEntry): boolean {
+  return (
+    entry['.tag'] === 'file' && entry.is_downloadable !== false && hasSupportedExtension(entry.name)
+  )
 }
 
 async function downloadFileContent(accessToken: string, filePath: string): Promise<string> {
@@ -69,7 +80,15 @@ async function downloadFileContent(accessToken: string, filePath: string): Promi
     throw new Error(`Failed to download file ${filePath}: ${response.status}`)
   }
 
-  const text = await response.text()
+  // Stream with a hard byte cap so a file whose listing metadata under-reported
+  // (or omitted) its size can never be fully buffered into memory. An oversized file
+  // throws so getDocument can surface it as a skipped (failed) row rather than dropping it.
+  const buffer = await readBodyWithLimit(response, MAX_FILE_SIZE)
+  if (!buffer) {
+    throw new ConnectorFileTooLargeError(MAX_FILE_SIZE)
+  }
+
+  const text = buffer.toString('utf8')
 
   if (filePath.endsWith('.html') || filePath.endsWith('.htm')) {
     return htmlToPlainText(text)
@@ -190,12 +209,16 @@ export const dropboxConnector: ConnectorConfig = {
       data = await response.json()
     }
 
-    const supportedFiles = data.entries.filter(isSupportedFile)
+    // Keep oversized files and surface them as skipped (failed) documents instead
+    // of dropping them silently at listing time.
+    const candidateFiles = data.entries.filter(isDownloadableFile)
 
     const maxFiles = sourceConfig.maxFiles ? Number(sourceConfig.maxFiles) : 0
     const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0
 
-    let documents = supportedFiles.map(fileToStub)
+    let documents = candidateFiles.map((entry) =>
+      stubOrSkipBySize(fileToStub(entry), entry.size, MAX_FILE_SIZE)
+    )
 
     if (maxFiles > 0) {
       const remaining = maxFiles - previouslyFetched
@@ -238,12 +261,24 @@ export const dropboxConnector: ConnectorConfig = {
 
       const entry = (await response.json()) as DropboxFileEntry
 
-      if (!isSupportedFile(entry)) return null
+      if (!isDownloadableFile(entry)) return null
+
+      const stub = fileToStub(entry)
+      if (entry.size && entry.size > MAX_FILE_SIZE) {
+        return markSkipped(stub, sizeLimitSkipReason(MAX_FILE_SIZE))
+      }
 
-      const content = await downloadFileContent(accessToken, entry.path_lower)
+      let content: string
+      try {
+        content = await downloadFileContent(accessToken, entry.path_lower)
+      } catch (error) {
+        if (error instanceof ConnectorFileTooLargeError) {
+          return markSkipped(stub, sizeLimitSkipReason(error.limitBytes))
+        }
+        throw error
+      }
       if (!content.trim()) return null
 
-      const stub = fileToStub(entry)
       return { ...stub, content, contentDeferred: false }
     } catch (error) {
       logger.warn(`Failed to fetch document ${externalId}`, {
 
@@ -3,14 +3,20 @@ import { getErrorMessage, toError } from '@sim/utils/errors'
 import { GithubIcon } from '@/components/icons'
 import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
 import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
-import { parseTagDate } from '@/connectors/utils'
+import {
+  CONNECTOR_MAX_FILE_BYTES,
+  markSkipped,
+  parseTagDate,
+  sizeLimitSkipReason,
+  stubOrSkipBySize,
+} from '@/connectors/utils'
 
 const logger = createLogger('GitHubConnector')
 
 const GITHUB_API_URL = 'https://api.github.com'
 const BATCH_SIZE = 30
 const GIT_SHA_PREFIX = 'git-sha:'
-const MAX_FILE_SIZE = 10 * 1024 * 1024 // 10 MB
+const MAX_FILE_SIZE = CONNECTOR_MAX_FILE_BYTES
 const BINARY_SNIFF_BYTES = 8000
 
 /**
@@ -245,11 +251,11 @@ export const githubConnector: ConnectorConfig = {
     } else {
       const tree = await fetchTree(accessToken, owner, repo, branch)
 
-      // Filter by path prefix, extensions, and size
+      // Filter by path prefix and extensions. Oversized files are kept here and
+      // surfaced as skipped (failed) documents at stub time so they stay visible.
       const filtered = tree.filter((item) => {
         if (pathPrefix && !item.path.startsWith(pathPrefix)) return false
         if (!matchesExtension(item.path, extSet)) return false
-        if (typeof item.size === 'number' && item.size > MAX_FILE_SIZE) return false
         return true
       })
 
@@ -271,7 +277,9 @@ export const githubConnector: ConnectorConfig = {
       batchSize: batch.length,
     })
 
-    const documents = batch.map((item) => treeItemToStub(owner, repo, branch, item))
+    const documents = batch.map((item) =>
+      stubOrSkipBySize(treeItemToStub(owner, repo, branch, item), item.size, MAX_FILE_SIZE)
+    )
 
     const nextOffset = offset + BATCH_SIZE
     const hasMore = nextOffset < capped.length
@@ -329,7 +337,24 @@ export const githubConnector: ConnectorConfig = {
           size,
           limit: MAX_FILE_SIZE,
         })
-        return null
+        return markSkipped(
+          {
+            externalId,
+            title: path.split('/').pop() || path,
+            content: '',
+            mimeType: 'text/plain',
+            sourceUrl: `https://github.com/${owner}/${repo}/blob/${branch.split('/').map(encodeURIComponent).join('/')}/${path.split('/').map(encodeURIComponent).join('/')}`,
+            contentHash: `${GIT_SHA_PREFIX}${data.sha as string}`,
+            metadata: {
+              path,
+              sha: data.sha as string,
+              size,
+              branch,
+              repository: `${owner}/${repo}`,
+            },
+          },
+          sizeLimitSkipReason(MAX_FILE_SIZE)
+        )
       }
 
       const rawContent = (data.content as string) || ''
 
@@ -6,14 +6,21 @@ import { isSameOrigin } from '@/lib/core/utils/validation'
 import { secureFetchWithRetry } from '@/lib/knowledge/documents/secure-fetch.server'
 import { VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
 import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
-import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils'
+import {
+  CONNECTOR_MAX_FILE_BYTES,
+  computeContentHash,
+  joinTagArray,
+  markSkipped,
+  parseTagDate,
+  sizeLimitSkipReason,
+} from '@/connectors/utils'
 
 const logger = createLogger('GitLabConnector')
 
 const DEFAULT_HOST = 'gitlab.com'
 const PAGE_SIZE = 100
 /** Max repository file size to index. Larger blobs are skipped. */
-const MAX_FILE_SIZE = 10 * 1024 * 1024
+const MAX_FILE_SIZE = CONNECTOR_MAX_FILE_BYTES
 /** Bytes sniffed for NUL when detecting binary files (matches git's heuristic). */
 const BINARY_SNIFF_BYTES = 8000
 
@@ -324,9 +331,25 @@ function fileToDocument(
   const blobSha = file.blob_id?.trim()
   if (!blobSha) return null
 
+  const title = path.split('/').pop() || path
+  const skippedForSize = (size: number): ExternalDocument => {
+    logger.info('Skipping oversized GitLab file', { path, size })
+    return markSkipped(
+      {
+        externalId: `${FILE_PREFIX}${path}`,
+        title,
+        content: '',
+        mimeType: 'text/plain',
+        sourceUrl: buildFileSourceUrl(apiBase, encodedProject, host, projectPath, ref, path),
+        contentHash: buildFileContentHash(encodedProject, path, blobSha),
+        metadata: { contentType: 'file', title, path, size },
+      },
+      sizeLimitSkipReason(MAX_FILE_SIZE)
+    )
+  }
+
   if (typeof file.size === 'number' && file.size > MAX_FILE_SIZE) {
-    logger.info('Skipping oversized GitLab file', { path, size: file.size })
-    return null
+    return skippedForSize(file.size)
   }
 
   const raw = typeof file.content === 'string' ? file.content : ''
@@ -336,12 +359,10 @@ function fileToDocument(
     return null
   }
   if (buffer.byteLength > MAX_FILE_SIZE) {
-    logger.info('Skipping oversized GitLab file', { path, size: buffer.byteLength })
-    return null
+    return skippedForSize(buffer.byteLength)
   }
 
   const content = buffer.toString('utf8')
-  const title = path.split('/').pop() || path
   const body = composeBody(title, content)
   if (!body.trim()) return null
 
 
@@ -3,7 +3,16 @@ import { getErrorMessage, toError } from '@sim/utils/errors'
 import { GoogleDriveIcon } from '@/components/icons'
 import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
 import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
-import { htmlToPlainText, joinTagArray, parseTagDate } from '@/connectors/utils'
+import {
+  CONNECTOR_MAX_FILE_BYTES,
+  ConnectorFileTooLargeError,
+  htmlToPlainText,
+  joinTagArray,
+  markSkipped,
+  parseTagDate,
+  readBodyWithLimit,
+  sizeLimitSkipReason,
+} from '@/connectors/utils'
 
 const logger = createLogger('GoogleDriveConnector')
 
@@ -22,7 +31,9 @@ const SUPPORTED_TEXT_MIME_TYPES = [
   'application/xml',
 ]
 
-const MAX_EXPORT_SIZE = 10 * 1024 * 1024 // 10 MB (Google export limit)
+// Google Drive's `files.export` API rejects exports over 10 MB (exportSizeLimitExceeded),
+// so this is a hard external limit for Google Workspace docs — not the connector cap.
+const MAX_EXPORT_SIZE = 10 * 1024 * 1024
 
 function isGoogleWorkspaceFile(mimeType: string): boolean {
   return mimeType in GOOGLE_WORKSPACE_MIME_TYPES
@@ -50,10 +61,22 @@ async function exportGoogleWorkspaceFile(
   })
 
   if (!response.ok) {
+    // Google rejects exports over its 10 MB limit with a 403 exportSizeLimitExceeded
+    // before streaming any bytes — surface that as an oversize skip, not a hard error.
+    if (response.status === 403) {
+      const body = await response.text().catch(() => '')
+      if (body.includes('exportSizeLimitExceeded')) {
+        throw new ConnectorFileTooLargeError(MAX_EXPORT_SIZE)
+      }
+    }
     throw new Error(`Failed to export file ${fileId}: ${response.status}`)
   }
 
-  return response.text()
+  const buffer = await readBodyWithLimit(response, MAX_EXPORT_SIZE)
+  if (!buffer) {
+    throw new ConnectorFileTooLargeError(MAX_EXPORT_SIZE)
+  }
+  return buffer.toString('utf8')
 }
 
 async function downloadTextFile(accessToken: string, fileId: string): Promise<string> {
@@ -68,15 +91,14 @@ async function downloadTextFile(accessToken: string, fileId: string): Promise<st
     throw new Error(`Failed to download file ${fileId}: ${response.status}`)
   }
 
-  const text = await response.text()
-  if (Buffer.byteLength(text, 'utf8') > MAX_EXPORT_SIZE) {
-    logger.warn(`File exceeds ${MAX_EXPORT_SIZE} bytes, truncating`)
-    const buf = Buffer.from(text, 'utf8')
-    let end = MAX_EXPORT_SIZE
-    while (end > 0 && (buf[end] & 0xc0) === 0x80) end--
-    return buf.subarray(0, end).toString('utf8')
+  // Stream with a hard byte cap so a file with missing/under-reported listing
+  // size metadata is never fully buffered into memory. Oversized files raise
+  // ConnectorFileTooLargeError so getDocument can surface them as skipped (failed) rows.
+  const buffer = await readBodyWithLimit(response, CONNECTOR_MAX_FILE_BYTES)
+  if (!buffer) {
+    throw new ConnectorFileTooLargeError(CONNECTOR_MAX_FILE_BYTES)
   }
-  return text
+  return buffer.toString('utf8')
 }
 
 async function fetchFileContent(
@@ -327,6 +349,10 @@ export const googleDriveConnector: ConnectorConfig = {
       const stub = fileToStub(file)
       return { ...stub, content, contentDeferred: false }
     } catch (error) {
+      if (error instanceof ConnectorFileTooLargeError) {
+        logger.info('Skipping oversized Google Drive file', { fileId: file.id, name: file.name })
+        return markSkipped(fileToStub(file), sizeLimitSkipReason(error.limitBytes))
+      }
       logger.warn(`Failed to fetch content for file: ${file.name} (${file.id})`, {
         error: toError(error).message,
       })