@@ -25,7 +25,8 @@ import {
2525 ZendeskSourceConfig ,
2626 DatabaseConnection ,
2727 DocumentChunk ,
28- BrokenLink
28+ BrokenLink ,
29+ EmbeddingConfig
2930} from './types' ;
3031
3132const GITHUB_TOKEN = process . env . GITHUB_PERSONAL_ACCESS_TOKEN ;
@@ -37,6 +38,7 @@ export class Doc2Vec {
3738 private config : Config ;
3839 private openai : OpenAI | AzureOpenAI ;
3940 private embeddingModel : string ;
41+ private embeddingDimension : number ;
4042 private contentProcessor : ContentProcessor ;
4143 private logger : Logger ;
4244 private configDir : string ;
@@ -58,6 +60,7 @@ export class Doc2Vec {
5860 // Check environment variable if not specified in config
5961 const embeddingProvider = this . config . embedding ?. provider || ( process . env . EMBEDDING_PROVIDER as 'openai' | 'azure' ) || 'openai' ;
6062 const embeddingConfig = this . config . embedding || { provider : embeddingProvider } ;
63+ this . embeddingDimension = this . resolveEmbeddingDimension ( embeddingConfig ) ;
6164
6265 if ( embeddingProvider === 'azure' ) {
6366 const azureApiKey = embeddingConfig . azure ?. api_key || process . env . AZURE_OPENAI_KEY ;
@@ -77,7 +80,7 @@ export class Doc2Vec {
7780 apiVersion : azureApiVersion ,
7881 } ) ;
7982 this . embeddingModel = azureDeploymentName ;
80- this . logger . info ( `Using Azure OpenAI with deployment: ${ azureDeploymentName } ` ) ;
83+ this . logger . info ( `Using Azure OpenAI with deployment: ${ azureDeploymentName } ( ${ this . embeddingDimension } dimensions) ` ) ;
8184 } else {
8285 const openaiApiKey = embeddingConfig . openai ?. api_key || process . env . OPENAI_API_KEY ;
8386 const openaiModel = embeddingConfig . openai ?. model || process . env . OPENAI_MODEL || 'text-embedding-3-large' ;
@@ -89,7 +92,7 @@ export class Doc2Vec {
8992
9093 this . openai = new OpenAI ( { apiKey : openaiApiKey } ) ;
9194 this . embeddingModel = openaiModel ;
92- this . logger . info ( `Using OpenAI with model: ${ openaiModel } ` ) ;
95+ this . logger . info ( `Using OpenAI with model: ${ openaiModel } ( ${ this . embeddingDimension } dimensions) ` ) ;
9396 }
9497
9598 this . contentProcessor = new ContentProcessor ( this . logger ) ;
@@ -138,6 +141,25 @@ export class Doc2Vec {
138141 }
139142 }
140143
/**
 * Resolves the embedding vector dimension to use.
 *
 * Precedence: `embeddingConfig.dimension`, then the `EMBEDDING_DIMENSION`
 * environment variable, then the built-in default of 3072. A supplied value
 * that is not a finite, positive integer is rejected: a warning is logged and
 * the default is returned instead.
 *
 * @param embeddingConfig - Embedding section of the configuration; may be undefined.
 * @returns The validated dimension, or 3072 when nothing is supplied or the
 *          supplied value is invalid.
 */
private resolveEmbeddingDimension(embeddingConfig: EmbeddingConfig | undefined): number {
    const defaultDimension = 3072;
    const rawConfigValue = embeddingConfig?.dimension;
    const rawEnvValue = process.env.EMBEDDING_DIMENSION;

    // Keep the value exactly as supplied. Coercing the environment string here
    // would make the warning below report "NaN" instead of what was actually set.
    // An empty environment string is treated as "not set".
    const rawValue = rawConfigValue ?? (rawEnvValue ? rawEnvValue : undefined);
    if (rawValue === undefined) {
        return defaultDimension;
    }

    const parsedValue = typeof rawValue === 'string' ? Number(rawValue) : rawValue;
    const isValidDimension = Number.isInteger(parsedValue) && parsedValue > 0;
    if (!isValidDimension) {
        this.logger.warn(`Invalid embedding dimension provided (${rawValue}), falling back to ${defaultDimension}`);
        return defaultDimension;
    }

    return parsedValue;
}
162+
141163 public async run ( ) : Promise < void > {
142164 this. logger . section ( 'PROCESSING SOURCES' ) ;
143165
@@ -388,7 +410,7 @@ export class Doc2Vec {
388410 }
389411
390412 // Update the last run date in the database after processing all issues
391- await DatabaseManager . updateLastRunDate ( dbConnection , repo , logger ) ;
413+ await DatabaseManager . updateLastRunDate ( dbConnection , repo , logger , this . embeddingDimension ) ;
392414
393415 logger . info ( `Successfully processed ${ issues . length } issues` ) ;
394416 }
@@ -397,7 +419,7 @@ export class Doc2Vec {
397419 const logger = parentLogger . child ( 'process' ) ;
398420 logger . info ( `Starting processing for GitHub repo: ${ config . repo } ` ) ;
399421
400- const dbConnection = await DatabaseManager . initDatabase ( config , logger ) ;
422+ const dbConnection = await DatabaseManager . initDatabase ( config , logger , this . embeddingDimension ) ;
401423
402424 // Initialize metadata storage
403425 await DatabaseManager . initDatabaseMetadata ( dbConnection , logger ) ;
@@ -414,8 +436,8 @@ export class Doc2Vec {
414436 const logger = parentLogger . child ( 'process' ) ;
415437 logger . info ( `Starting processing for website: ${ config . url } ` ) ;
416438
417- const dbConnection = await DatabaseManager . initDatabase ( config , logger ) ;
418- await DatabaseManager . initDatabaseMetadata ( dbConnection , logger ) ;
439+ const dbConnection = await DatabaseManager . initDatabase ( config , logger , this . embeddingDimension ) ;
440+ await DatabaseManager . initDatabaseMetadata ( dbConnection , logger ) ;
419441 const validChunkIds : Set < string > = new Set ( ) ;
420442 const visitedUrls : Set < string > = new Set ( ) ;
421443 const urlPrefix = Utils . getUrlPrefix ( config . url ) ;
@@ -446,7 +468,7 @@ export class Doc2Vec {
446468 return DatabaseManager . getMetadataValue ( dbConnection , `etag:${ url } ` , undefined , logger ) ;
447469 } ,
448470 set : async ( url : string , etag : string ) : Promise < void > => {
449- await DatabaseManager . setMetadataValue ( dbConnection , `etag:${ url } ` , etag , logger ) ;
471+ await DatabaseManager . setMetadataValue ( dbConnection , `etag:${ url } ` , etag , logger , this . embeddingDimension ) ;
450472 } ,
451473 } ;
452474
@@ -455,7 +477,7 @@ export class Doc2Vec {
455477 return DatabaseManager . getMetadataValue ( dbConnection , `lastmod:${ url } ` , undefined , logger ) ;
456478 } ,
457479 set : async ( url : string , lastmod : string ) : Promise < void > => {
458- await DatabaseManager . setMetadataValue ( dbConnection , `lastmod:${ url } ` , lastmod , logger ) ;
480+ await DatabaseManager . setMetadataValue ( dbConnection , `lastmod:${ url } ` , lastmod , logger , this . embeddingDimension ) ;
459481 } ,
460482 } ;
461483
@@ -539,7 +561,7 @@ export class Doc2Vec {
539561 const logger = parentLogger.child('process');
540562 logger.info(` Starting processing for local directory : ${config . path } `);
541563
542- const dbConnection = await DatabaseManager . initDatabase ( config , logger ) ;
564+ const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension );
543565 const validChunkIds: Set<string> = new Set();
544566 const processedFiles: Set<string> = new Set();
545567
@@ -611,7 +633,7 @@ export class Doc2Vec {
611633 const logger = parentLogger.child('process');
612634 logger.info(` Starting processing for code source ( $ { config . source } ) `);
613635
614- const dbConnection = await DatabaseManager . initDatabase ( config , logger ) ;
636+ const dbConnection = await DatabaseManager.initDatabase(config, logger, this.embeddingDimension );
615637 const validChunkIds: Set<string> = new Set();
616638 const processedFiles: Set<string> = new Set();
617639
@@ -765,10 +787,10 @@ export class Doc2Vec {
765787 }
766788 }
767789
768- await DatabaseManager . setMetadataValue ( dbConnection , fileListKey , JSON . stringify ( currentList ) , logger ) ;
790+ await DatabaseManager.setMetadataValue(dbConnection, fileListKey, JSON.stringify(currentList), logger, this.embeddingDimension );
769791 if (lastMtimeKey) {
770792 const nextMtime = maxObservedMtime > 0 ? maxObservedMtime : Date.now();
771- await DatabaseManager . setMetadataValue ( dbConnection , lastMtimeKey , `${ nextMtime } ` , logger ) ;
793+ await DatabaseManager.setMetadataValue(dbConnection, lastMtimeKey, ` $ { nextMtime } `, logger, this.embeddingDimension );
772794 }
773795 }
774796 } else {
@@ -785,7 +807,7 @@ export class Doc2Vec {
785807 const headSha = await this.getRepoHeadSha(basePath, logger);
786808 if (headSha) {
787809 const shaKey = this.buildCodeShaMetadataKey(config.repo as string, repoBranch);
788- await DatabaseManager . setMetadataValue ( dbConnection , shaKey , headSha , logger ) ;
810+ await DatabaseManager.setMetadataValue(dbConnection, shaKey, headSha, logger, this.embeddingDimension );
789811 }
790812 }
791813
@@ -974,7 +996,7 @@ export class Doc2Vec {
974996 const logger = parentLogger . child ( 'process' ) ;
975997 logger . info ( `Starting processing for Zendesk: ${ config . zendesk_subdomain } .zendesk.com` ) ;
976998
977- const dbConnection = await DatabaseManager . initDatabase ( config , logger ) ;
999+ const dbConnection = await DatabaseManager . initDatabase ( config , logger , this . embeddingDimension ) ;
9781000
9791001 // Initialize metadata storage
9801002 await DatabaseManager . initDatabaseMetadata ( dbConnection , logger ) ;
@@ -1180,7 +1202,7 @@ export class Doc2Vec {
11801202 }
11811203
11821204 // Update the last run date in the database
1183- await DatabaseManager . updateLastRunDate ( dbConnection , `zendesk_tickets_$ { config . zendesk_subdomain } `, logger);
1205+ await DatabaseManager . updateLastRunDate ( dbConnection , `zendesk_tickets_${ config . zendesk_subdomain } ` , logger , this . embeddingDimension ) ;
11841206
11851207 logger . info ( `Successfully processed ${ totalTickets } tickets` ) ;
11861208 }
0 commit comments