Skip to content

Commit 089212f

Browse files
authored
Using character-based token calculation (#70)
Signed-off-by: Denis Jannot <denis.jannot@solo.io>
1 parent 7319fe3 commit 089212f

3 files changed

Lines changed: 26 additions & 27 deletions

File tree

content-processor.ts

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1997,9 +1997,9 @@ export class ContentProcessor {
19971997
async chunkMarkdown(markdown: string, sourceConfig: SourceConfig, url: string): Promise<DocumentChunk[]> {
19981998
const logger = this.logger.child('chunker');
19991999

2000-
// --- Configuration ---
2001-
const MAX_TOKENS = 1000;
2002-
const MIN_TOKENS = 150; // 💡 Merges "OpenAI-compatible" sentence into the next block
2000+
// --- Configuration (character-based, ~4 chars ≈ 1 token) ---
2001+
const MAX_CHARS = 4000;
2002+
const MIN_CHARS = 600; // 💡 Merges short sections into the next block
20032003
const OVERLAP_PERCENT = 0.1; // 10% overlap for large splits
20042004

20052005
const chunks: DocumentChunk[] = [];
@@ -2071,39 +2071,37 @@ export class ContentProcessor {
20712071

20722072
/**
20732073
* Flushes the current buffer into the chunks array.
2074-
* Uses sub-splitting logic if the buffer exceeds MAX_TOKENS.
2074+
* Uses sub-splitting logic if the buffer exceeds MAX_CHARS.
20752075
*/
20762076
const flushBuffer = (force = false) => {
20772077
const trimmedBuffer = buffer.trim();
20782078
if (!trimmedBuffer) return;
2079-
2080-
const tokenCount = Utils.tokenize(trimmedBuffer).length;
2081-
2079+
2080+
const charCount = trimmedBuffer.length;
2081+
20822082
// 💡 SEMANTIC MERGING
20832083
// If the current section is too short (like just a title or a one-liner),
20842084
// we don't flush yet unless it's the end of the file (force=true).
2085-
if (tokenCount < MIN_TOKENS && !force) {
2086-
return;
2085+
if (charCount < MIN_CHARS && !force) {
2086+
return;
20872087
}
2088-
2088+
20892089
// Compute the appropriate topic hierarchy for merged content
20902090
const topicHierarchy = computeTopicHierarchy();
2091-
2092-
if (tokenCount > MAX_TOKENS) {
2093-
// 💡 RECURSIVE OVERLAP SPLITTING
2091+
2092+
if (charCount > MAX_CHARS) {
2093+
// 💡 OVERLAP SPLITTING
20942094
// If the section is a massive guide, split it but keep headers on every sub-piece.
2095-
const tokens = Utils.tokenize(trimmedBuffer);
2096-
const overlapSize = Math.floor(MAX_TOKENS * OVERLAP_PERCENT);
2097-
2098-
for (let i = 0; i < tokens.length; i += (MAX_TOKENS - overlapSize)) {
2099-
const subTokens = tokens.slice(i, i + MAX_TOKENS);
2100-
const subContent = subTokens.join("");
2095+
const overlapSize = Math.floor(MAX_CHARS * OVERLAP_PERCENT);
2096+
2097+
for (let i = 0; i < trimmedBuffer.length; i += (MAX_CHARS - overlapSize)) {
2098+
const subContent = trimmedBuffer.slice(i, i + MAX_CHARS);
21012099
chunks.push(createDocumentChunk(subContent, topicHierarchy));
21022100
}
21032101
} else {
21042102
chunks.push(createDocumentChunk(trimmedBuffer, topicHierarchy));
21052103
}
2106-
2104+
21072105
buffer = ""; // Reset buffer after successful flush
21082106
bufferHeadings = []; // Reset tracked headings
21092107
};
@@ -2124,9 +2122,9 @@ export class ContentProcessor {
21242122
.trim();
21252123

21262124
// Check if we should merge with previous content
2127-
const currentTokenCount = Utils.tokenize(buffer.trim()).length;
2128-
const hasBufferContent = currentTokenCount > 0;
2129-
const bufferIsSmall = currentTokenCount < MIN_TOKENS;
2125+
const currentCharCount = buffer.trim().length;
2126+
const hasBufferContent = currentCharCount > 0;
2127+
const bufferIsSmall = currentCharCount < MIN_CHARS;
21302128

21312129
// Only merge if:
21322130
// 1. Buffer has content and is small
@@ -2157,7 +2155,7 @@ export class ContentProcessor {
21572155
buffer += `${line}\n`;
21582156

21592157
// Safety valve: if a single section is huge, flush it periodically
2160-
if (Utils.tokenize(buffer).length >= MAX_TOKENS) {
2158+
if (buffer.length >= MAX_CHARS) {
21612159
flushBuffer();
21622160
}
21632161
}

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "doc2vec",
3-
"version": "2.9.1",
3+
"version": "2.9.2",
44
"type": "commonjs",
55
"description": "",
66
"main": "dist/doc2vec.js",

tests/doc2vec.test.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,7 @@ sources:
651651
expect(mockEmbeddingsCreate).toHaveBeenCalledWith({
652652
model: 'text-embedding-3-large',
653653
input: ['test text'],
654-
});
654+
}, { timeout: 60000 });
655655
});
656656

657657
it('should return multiple embeddings for multiple texts', async () => {
@@ -1369,7 +1369,8 @@ sources:
13691369
await (instance as any).createEmbeddings(['hello']);
13701370

13711371
expect(mockCreate).toHaveBeenCalledWith(
1372-
expect.objectContaining({ model: 'text-embedding-3-large' })
1372+
expect.objectContaining({ model: 'text-embedding-3-large' }),
1373+
{ timeout: 60000 }
13731374
);
13741375
});
13751376
});

0 commit comments

Comments (0)