@@ -1997,9 +1997,9 @@ export class ContentProcessor {
19971997 async chunkMarkdown ( markdown : string , sourceConfig : SourceConfig , url : string ) : Promise < DocumentChunk [ ] > {
19981998 const logger = this . logger . child ( 'chunker' ) ;
19991999
2000- // --- Configuration ---
2001- const MAX_TOKENS = 1000 ;
2002- const MIN_TOKENS = 150 ; // 💡 Merges "OpenAI-compatible" sentence into the next block
2000+ // --- Configuration (character-based, ~4 chars ≈ 1 token) ---
2001+ const MAX_CHARS = 4000 ;
2002+ const MIN_CHARS = 600 ; // 💡 Merges short sections into the next block
20032003 const OVERLAP_PERCENT = 0.1 ; // 10% overlap for large splits
20042004
20052005 const chunks : DocumentChunk [ ] = [ ] ;
@@ -2071,39 +2071,37 @@ export class ContentProcessor {
20712071
/**
 * Flushes the current buffer into the `chunks` array.
 *
 * Behaviour, in order:
 *  - An empty (whitespace-only) buffer is ignored.
 *  - A buffer shorter than MIN_CHARS is kept (not flushed) so it can be merged
 *    with the content that follows, unless `force` is true.
 *  - A buffer longer than MAX_CHARS is split into overlapping windows of at
 *    most MAX_CHARS characters; every window gets the same topic hierarchy.
 *  - Otherwise the trimmed buffer is pushed as a single chunk.
 *
 * After a flush, `buffer` and `bufferHeadings` are reset.
 *
 * @param force When true, flush even if the buffer is below MIN_CHARS
 *              (e.g. at the end of the file).
 */
const flushBuffer = (force = false): void => {
  const trimmedBuffer = buffer.trim();
  if (!trimmedBuffer) return;

  const charCount = trimmedBuffer.length;

  // Semantic merging: a section that is too short (just a title or a
  // one-liner) stays in the buffer so it merges with what comes next.
  if (charCount < MIN_CHARS && !force) {
    return;
  }

  // Compute the appropriate topic hierarchy for the (possibly merged) content.
  const topicHierarchy = computeTopicHierarchy();

  if (charCount > MAX_CHARS) {
    // Overlap splitting: cut an oversized section into windows of MAX_CHARS
    // that overlap by OVERLAP_PERCENT, keeping the hierarchy on every piece.
    const overlapSize = Math.floor(MAX_CHARS * OVERLAP_PERCENT);
    const step = MAX_CHARS - overlapSize;

    // NOTE(review): slice() works on UTF-16 code units, so a window boundary
    // can fall inside a surrogate pair (e.g. an emoji) — confirm whether
    // downstream consumers tolerate that.
    for (let start = 0; start < charCount; start += step) {
      const end = start + MAX_CHARS;
      chunks.push(createDocumentChunk(trimmedBuffer.slice(start, end), topicHierarchy));

      // Once a window reaches the end of the buffer we are done. Without this
      // check the loop could emit one more chunk consisting solely of text
      // already contained in the previous window (a pure-overlap duplicate).
      if (end >= charCount) break;
    }
  } else {
    chunks.push(createDocumentChunk(trimmedBuffer, topicHierarchy));
  }

  buffer = ""; // Reset buffer after successful flush
  bufferHeadings = []; // Reset tracked headings
};
@@ -2124,9 +2122,9 @@ export class ContentProcessor {
21242122 . trim ( ) ;
21252123
21262124 // Check if we should merge with previous content
2127- const currentTokenCount = Utils . tokenize ( buffer . trim ( ) ) . length ;
2128- const hasBufferContent = currentTokenCount > 0 ;
2129- const bufferIsSmall = currentTokenCount < MIN_TOKENS ;
2125+ const currentCharCount = buffer . trim ( ) . length ;
2126+ const hasBufferContent = currentCharCount > 0 ;
2127+ const bufferIsSmall = currentCharCount < MIN_CHARS ;
21302128
21312129 // Only merge if:
21322130 // 1. Buffer has content and is small
@@ -2157,7 +2155,7 @@ export class ContentProcessor {
21572155 buffer += `${ line } \n` ;
21582156
21592157 // Safety valve: if a single section is huge, flush it periodically
2160- if ( Utils . tokenize ( buffer ) . length >= MAX_TOKENS ) {
2158+ if ( buffer . length >= MAX_CHARS ) {
21612159 flushBuffer ( ) ;
21622160 }
21632161 }
0 commit comments