diff --git a/.vitepress/config/apiReferenceSidebar.ts b/.vitepress/config/apiReferenceSidebar.ts
index 6b8c35ed..428d3aae 100644
--- a/.vitepress/config/apiReferenceSidebar.ts
+++ b/.vitepress/config/apiReferenceSidebar.ts
@@ -53,6 +53,7 @@ const chatWrappersOrder = [
     "Llama3ChatWrapper",
     "Llama2ChatWrapper",
     "MistralChatWrapper",
+    "Gemma4ChatWrapper",
     "GemmaChatWrapper",
     "ChatMLChatWrapper",
     "FalconChatWrapper",
diff --git a/llama/CMakeLists.txt b/llama/CMakeLists.txt
index efd8fce4..49d0d638 100644
--- a/llama/CMakeLists.txt
+++ b/llama/CMakeLists.txt
@@ -120,8 +120,8 @@ list(REMOVE_DUPLICATES GPU_INFO_HEADERS)
 list(REMOVE_DUPLICATES GPU_INFO_SOURCES)
 list(REMOVE_DUPLICATES GPU_INFO_EXTRA_LIBS)

-addVariantSuffix(llama ${NLC_VARIANT})
-addVariantSuffix(ggml ${NLC_VARIANT})
+addVariantSuffix(llama "${NLC_VARIANT}")
+addVariantSuffix(ggml "${NLC_VARIANT}")

 file(GLOB SOURCE_FILES "addon/*.cpp" "addon/**/*.cpp" ${GPU_INFO_SOURCES})

@@ -129,7 +129,7 @@ add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC} ${GPU_INFO_HE
 set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
 target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB})
 target_link_libraries(${PROJECT_NAME} "llama")
-target_link_libraries(${PROJECT_NAME} "common")
+target_link_libraries(${PROJECT_NAME} "llama-common")

 if (DEFINED GPU_INFO_EXTRA_LIBS)
     target_link_libraries(${PROJECT_NAME} ${GPU_INFO_EXTRA_LIBS})
diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
index 017c6967..f4395a44 100644
--- a/llama/addon/AddonContext.cpp
+++ b/llama/addon/AddonContext.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include "common/common.h"
+#include "llama-context.h"
 #include "llama-vocab.h"
 #include "llama.h"

@@ -107,7 +108,7 @@ class AddonContextLoadContextWorker : public Napi::AsyncWorker {
         try {
             context->ctx = llama_init_from_model(context->model->model, context->context_params);

-            context->contextLoaded = context->ctx != nullptr && context->ctx != NULL;
+            context->contextLoaded = context->ctx != nullptr;
         } catch (const std::exception& e) {
             SetError(e.what());
         } catch(...)
{ @@ -115,7 +116,7 @@ class AddonContextLoadContextWorker : public Napi::AsyncWorker { } } void OnOK() { - if (context->contextLoaded) { + if (context->contextLoaded && !context->model->model_params.no_alloc) { uint64_t contextMemorySize = llama_state_get_size(context->ctx); adjustNapiExternalMemoryAdd(Env(), contextMemorySize); context->loadedContextMemorySize = contextMemorySize; @@ -173,8 +174,10 @@ class AddonContextUnloadContextWorker : public Napi::AsyncWorker { } } void OnOK() { - adjustNapiExternalMemorySubtract(Env(), context->loadedContextMemorySize); - context->loadedContextMemorySize = 0; + if (!context->model->model_params.no_alloc) { + adjustNapiExternalMemorySubtract(Env(), context->loadedContextMemorySize); + context->loadedContextMemorySize = 0; + } adjustNapiExternalMemorySubtract(Env(), context->batchMemorySize); context->batchMemorySize = 0; @@ -251,22 +254,8 @@ class AddonContextSampleTokenWorker : public Napi::AsyncWorker { sampler->rebuildChainIfNeeded(); - const auto * logits = llama_get_logits_ith(ctx->ctx, batchLogitIndex); - const int n_vocab = llama_vocab_n_tokens(ctx->model->vocab); - - auto & candidates = sampler->tokenCandidates; - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; - } - - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(sampler->chain, &cur_p); + llama_token_data_array cur_p; + sampler->sample(ctx->ctx, batchLogitIndex, cur_p, returnProbabilities || returnConfidence); if (!(cur_p.selected >= 0 && cur_p.selected < (int32_t)cur_p.size)) { no_output = true; @@ -397,13 +386,13 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap 1 && info[1].IsObject()) { - Napi::Object options = info[1].As(); + const auto options = info[1].As(); if (options.Has("contextSize")) { context_params.n_ctx = options.Get("contextSize").As().Uint32Value(); @@ -427,16 +416,26 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value(); - context_params.flash_attn_type = flashAttention ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED; + const auto flashAttention = options.Get("flashAttention"); + + if (flashAttention.IsString() && flashAttention.As().Utf8Value() == "auto") { + context_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; + } else { + const bool flashAttentionEnabled = flashAttention.As().Value(); + context_params.flash_attn_type = flashAttentionEnabled + ? LLAMA_FLASH_ATTN_TYPE_ENABLED + : LLAMA_FLASH_ATTN_TYPE_DISABLED; + } } if (options.Has("threads")) { - const auto n_threads = options.Get("threads").As().Int32Value(); - const auto resolved_n_threads = n_threads == 0 ? std::max((int32_t)std::thread::hardware_concurrency(), context_params.n_threads) : n_threads; + const auto threads = options.Get("threads").As().Int32Value(); + const auto resolvedThreads = threads == 0 + ? 
std::max((int32_t)std::thread::hardware_concurrency(), context_params.n_threads) + : threads; - context_params.n_threads = resolved_n_threads; - context_params.n_threads_batch = resolved_n_threads; + context_params.n_threads = resolvedThreads; + context_params.n_threads_batch = resolvedThreads; } if (options.Has("performanceTracking")) { @@ -444,14 +443,14 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Int32Value(); + const auto keyType = options.Get("kvCacheKeyType").As().Int32Value(); if (keyType >= 0 && keyType < GGML_TYPE_COUNT) { context_params.type_k = static_cast(keyType); } } if (options.Has("kvCacheValueType") && options.Get("kvCacheValueType").IsNumber()) { - auto valueType = options.Get("kvCacheValueType").As().Int32Value(); + const auto valueType = options.Get("kvCacheValueType").As().Int32Value(); if (valueType >= 0 && valueType < GGML_TYPE_COUNT) { context_params.type_v = static_cast(valueType); } @@ -476,8 +475,10 @@ void AddonContext::dispose() { contextLoaded = false; llama_free(ctx); - adjustNapiExternalMemorySubtract(Env(), loadedContextMemorySize); - loadedContextMemorySize = 0; + if (!model->model_params.no_alloc) { + adjustNapiExternalMemorySubtract(Env(), loadedContextMemorySize); + loadedContextMemorySize = 0; + } } model->Unref(); @@ -728,6 +729,49 @@ Napi::Value AddonContext::GetStateSize(const Napi::CallbackInfo& info) { return Napi::Number::From(info.Env(), llama_state_get_size(ctx)); } +Napi::Value AddonContext::GetMemoryBreakdown(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + if (!contextLoaded || ctx == nullptr) { + Napi::Error::New(info.Env(), "Context is not loaded").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + std::size_t cpuRam = 0; + std::size_t gpuVram = 0; + + for (const auto& [bufferType, memoryBreakdown] : ctx->memory_breakdown()) { + const std::size_t size = memoryBreakdown.context + memoryBreakdown.compute; + if (size == 0) { + continue; + } + + if (ggml_backend_buft_is_host(bufferType)) { + cpuRam += size; + } else { + ggml_backend_dev_t device = ggml_backend_buft_get_device(bufferType); + if (device != nullptr) { + auto deviceType = ggml_backend_dev_type(device); + if (deviceType == GGML_BACKEND_DEVICE_TYPE_GPU || deviceType == GGML_BACKEND_DEVICE_TYPE_IGPU) { + gpuVram += size; + } else { + cpuRam += size; + } + } else { + cpuRam += size; + } + } + } + + Napi::Object result = Napi::Object::New(info.Env()); + result.Set("cpuRam", Napi::Number::New(info.Env(), cpuRam)); + result.Set("gpuVram", Napi::Number::New(info.Env(), gpuVram)); + return result; +} + Napi::Value AddonContext::GetThreads(const Napi::CallbackInfo& info) { if (disposed) { Napi::Error::New(info.Env(), "Context is disposed").ThrowAsJavaScriptException(); @@ -744,9 +788,8 @@ Napi::Value AddonContext::SetThreads(const Napi::CallbackInfo& info) { } const auto threads = info[0].As().Int32Value(); - const auto resolvedThreads = threads == 0 - ? std::max((int32_t)std::thread::hardware_concurrency(), std::max(cpu_get_num_math(), 1)) - : threads; + const auto resolvedThreads = + threads == 0 ? 
std::max((int32_t)std::thread::hardware_concurrency(), std::max(common_cpu_get_num_math(), 1)) : threads; if (llama_n_threads(ctx) != resolvedThreads) { llama_set_n_threads(ctx, resolvedThreads, resolvedThreads); @@ -1062,6 +1105,7 @@ void AddonContext::init(Napi::Object exports) { InstanceMethod("sampleToken", &AddonContext::SampleToken), InstanceMethod("getEmbedding", &AddonContext::GetEmbedding), InstanceMethod("getStateSize", &AddonContext::GetStateSize), + InstanceMethod("getMemoryBreakdown", &AddonContext::GetMemoryBreakdown), InstanceMethod("getThreads", &AddonContext::GetThreads), InstanceMethod("setThreads", &AddonContext::SetThreads), InstanceMethod("printTimings", &AddonContext::PrintTimings), diff --git a/llama/addon/AddonContext.h b/llama/addon/AddonContext.h index 2e677203..6955c658 100644 --- a/llama/addon/AddonContext.h +++ b/llama/addon/AddonContext.h @@ -46,6 +46,7 @@ class AddonContext : public Napi::ObjectWrap { Napi::Value GetEmbedding(const Napi::CallbackInfo& info); Napi::Value GetStateSize(const Napi::CallbackInfo& info); + Napi::Value GetMemoryBreakdown(const Napi::CallbackInfo& info); Napi::Value GetThreads(const Napi::CallbackInfo& info); Napi::Value SetThreads(const Napi::CallbackInfo& info); diff --git a/llama/addon/AddonGgufMetadata.cpp b/llama/addon/AddonGgufMetadata.cpp new file mode 100644 index 00000000..366ea924 --- /dev/null +++ b/llama/addon/AddonGgufMetadata.cpp @@ -0,0 +1,162 @@ +#include + +#include "AddonGgufMetadata.h" +#include "gguf.h" + + +AddonGgufMetadata::AddonGgufMetadata(const Napi::CallbackInfo& info) + : Napi::ObjectWrap(info), + ggufMetadata(gguf_init_empty()) { + if (ggufMetadata.get() == nullptr) { + throw std::runtime_error("Failed to create an empty GGUF context"); + } +} +AddonGgufMetadata::~AddonGgufMetadata() { + dispose(); +} + +void AddonGgufMetadata::dispose() { + if (disposed) { + return; + } + + disposed = true; + ggufMetadata.reset(); +} + +Napi::Value AddonGgufMetadata::Dispose(const Napi::CallbackInfo& info) { + dispose(); + return info.Env().Undefined(); +} + +class AddonGgufMetadataInitWorker : public Napi::AsyncWorker { + public: + AddonGgufMetadata* addonGgufMetadata; + std::vector sources; + std::vector>> bufferRefs; + + AddonGgufMetadataInitWorker(const Napi::Env& env, AddonGgufMetadata* addonGgufMetadata) + : Napi::AsyncWorker(env, "AddonGgufMetadataInitWorker"), + addonGgufMetadata(addonGgufMetadata), + deferred(Napi::Promise::Deferred::New(env)) { + addonGgufMetadata->Ref(); + } + ~AddonGgufMetadataInitWorker() { + addonGgufMetadata->Unref(); + } + + Napi::Promise GetPromise() { + return deferred.Promise(); + } + + protected: + Napi::Promise::Deferred deferred; + + void Execute() { + try { + gguf_context_ptr& ggufMetadata = addonGgufMetadata->ggufMetadata; + + bool hasCopiedMetadata = false; + for (const auto& itemSource : sources) { + struct ggml_context* tensorContext = nullptr; + struct gguf_init_params ggufParams = { + /* .no_alloc = */ true, + /* .ctx = */ &tensorContext, + }; + gguf_context_ptr metadata( + itemSource.type == AddonGgufMetadataSourceType::buffer + ? 
gguf_init_from_buffer(itemSource.buffer.data, itemSource.buffer.length, ggufParams) + : gguf_init_from_file(itemSource.path.c_str(), ggufParams) + ); + ggml_context_ptr tensorContextGuard(tensorContext); + + if (metadata.get() == nullptr || tensorContext == nullptr) { + throw std::runtime_error("Failed to parse GGUF metadata buffer"); + } + + if (!hasCopiedMetadata) { + gguf_set_kv(ggufMetadata.get(), metadata.get()); + hasCopiedMetadata = true; + } + + for (ggml_tensor* tensor = ggml_get_first_tensor(tensorContext); tensor != nullptr; + tensor = ggml_get_next_tensor(tensorContext, tensor)) { + gguf_add_tensor(ggufMetadata.get(), tensor); + } + } + } catch (const std::exception& e) { + SetError(e.what()); + } catch (...) { + SetError("Unknown error when loading GGUF metadata from the given sources"); + } + } + void OnOK() { + deferred.Resolve(Env().Undefined()); + } + void OnError(const Napi::Error& err) { + deferred.Reject(err.Value()); + } +}; + +Napi::Value AddonGgufMetadata::Init(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Metadata is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + std::vector metadataSources; + std::vector>> bufferRefs; + + if (info.Length() == 0 || !info[0].IsArray()) { + Napi::TypeError::New(info.Env(), "Expected an array of sources as the first argument").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + const auto sourceBufferValues = info[0].As(); + const uint32_t sourcesCount = sourceBufferValues.Length(); + + if (sourcesCount == 0) { + Napi::TypeError::New(info.Env(), "Expected source array to contain at least one item").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + metadataSources.reserve(sourcesCount); + bufferRefs.reserve(sourcesCount); + + for (uint32_t i = 0; i < sourcesCount; i++) { + const auto sourceBufferValue = sourceBufferValues.Get(i); + if (sourceBufferValue.IsBuffer()) { + const auto sourceBuffer = sourceBufferValue.As>(); + metadataSources.emplace_back(AddonGgufMetadataSource(AddonGgufMetadataSourceBuffer(sourceBuffer.Data(), sourceBuffer.Length()))); + bufferRefs.emplace_back(Napi::Persistent(sourceBuffer)); + } else if (sourceBufferValue.IsString()) { + const auto sourcePath = sourceBufferValue.As().Utf8Value(); + metadataSources.emplace_back(AddonGgufMetadataSource(sourcePath)); + } else { + Napi::TypeError::New(info.Env(), "Expected every source array item to be a Buffer or a string").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + } + + + AddonGgufMetadataInitWorker* worker = new AddonGgufMetadataInitWorker(info.Env(), this); + worker->sources.swap(metadataSources); + worker->bufferRefs.swap(bufferRefs); + + worker->Queue(); + return worker->GetPromise(); +} + +void AddonGgufMetadata::init(Napi::Object exports) { + exports.Set( + "AddonGgufMetadata", + DefineClass( + exports.Env(), + "AddonGgufMetadata", + { + InstanceMethod("init", &AddonGgufMetadata::Init), + InstanceMethod("dispose", &AddonGgufMetadata::Dispose), + } + ) + ); +} \ No newline at end of file diff --git a/llama/addon/AddonGgufMetadata.h b/llama/addon/AddonGgufMetadata.h new file mode 100644 index 00000000..11521543 --- /dev/null +++ b/llama/addon/AddonGgufMetadata.h @@ -0,0 +1,48 @@ +#pragma once + +#include + +#include "ggml-cpp.h" +#include "napi.h" + + +enum class AddonGgufMetadataSourceType { + path = 0, + buffer = 1 +}; + +struct AddonGgufMetadataSourceBuffer { + const uint8_t* data = nullptr; + std::size_t length = 0; + + 
AddonGgufMetadataSourceBuffer() = default; + AddonGgufMetadataSourceBuffer(const uint8_t* data, std::size_t length) : data(data), length(length) { + } +}; + +struct AddonGgufMetadataSource { + AddonGgufMetadataSourceType type = AddonGgufMetadataSourceType::path; + std::string path; + AddonGgufMetadataSourceBuffer buffer; + + AddonGgufMetadataSource() = default; + explicit AddonGgufMetadataSource(std::string path) : type(AddonGgufMetadataSourceType::path), path(std::move(path)) { + } + explicit AddonGgufMetadataSource(AddonGgufMetadataSourceBuffer buffer) : type(AddonGgufMetadataSourceType::buffer), buffer(buffer) { + } +}; + +class AddonGgufMetadata : public Napi::ObjectWrap { + public: + gguf_context_ptr ggufMetadata; + bool disposed = false; + + AddonGgufMetadata(const Napi::CallbackInfo& info); + ~AddonGgufMetadata(); + void dispose(); + + Napi::Value Init(const Napi::CallbackInfo& info); + Napi::Value Dispose(const Napi::CallbackInfo& info); + + static void init(Napi::Object exports); +}; diff --git a/llama/addon/AddonModel.cpp b/llama/addon/AddonModel.cpp index 94b4e576..b7e938fe 100644 --- a/llama/addon/AddonModel.cpp +++ b/llama/addon/AddonModel.cpp @@ -4,9 +4,12 @@ #include "globals/addonProgress.h" #include "common/common.h" #include "llama.h" +#include "llama-model.h" +#include "gguf.h" #include "AddonModel.h" #include "AddonModelData.h" #include "AddonModelLora.h" +#include "AddonGgufMetadata.h" static Napi::Value getNapiToken(const Napi::CallbackInfo& info, const llama_vocab* vocab, llama_token token) { if (token < 0 || token == LLAMA_TOKEN_NULL) { @@ -69,18 +72,37 @@ static bool llamaModelParamsProgressCallback(float progress, void * user_data) { return !(addonModel->abortModelLoad); } +struct ModelEstimatorTensorAccessState { + bool accessedTensorData = false; +}; + +static void markUnexpectedTensorDataAccess(struct ggml_tensor * /* tensor */, void * userData) { + auto * tensorAccessState = static_cast(userData); + if (tensorAccessState != nullptr) { + tensorAccessState->accessedTensorData = true; + } +} + class AddonModelLoadModelWorker : public Napi::AsyncWorker { public: AddonModel* model; + AddonGgufMetadata* ggufMetadata = nullptr; - AddonModelLoadModelWorker(const Napi::Env& env, AddonModel* model) + AddonModelLoadModelWorker(const Napi::Env& env, AddonModel* model, AddonGgufMetadata* ggufMetadata) : Napi::AsyncWorker(env, "AddonModelLoadModelWorker"), model(model), + ggufMetadata(ggufMetadata), deferred(Napi::Promise::Deferred::New(env)) { model->Ref(); + if (ggufMetadata != nullptr) { + ggufMetadata->Ref(); + } } ~AddonModelLoadModelWorker() { model->Unref(); + if (ggufMetadata != nullptr) { + ggufMetadata->Unref(); + } } Napi::Promise GetPromise() { @@ -92,10 +114,41 @@ class AddonModelLoadModelWorker : public Napi::AsyncWorker { void Execute() { try { - model->model = llama_model_load_from_file(model->modelPath.c_str(), model->model_params); - model->vocab = llama_model_get_vocab(model->model); + if (model->modelPath != "" && ggufMetadata == nullptr) { + model->model = llama_model_load_from_file(model->modelPath.c_str(), model->model_params); + } else { + if (!model->model_params.no_alloc) { + throw std::runtime_error("Loading a model from source buffers requires no_alloc=true"); + } else if (ggufMetadata->disposed || ggufMetadata->ggufMetadata.get() == nullptr) { + throw std::runtime_error("GGUF metadata is disposed"); + } + + ModelEstimatorTensorAccessState tensorAccessState; + model->model = llama_model_init_from_user( + ggufMetadata->ggufMetadata.get(), + 
markUnexpectedTensorDataAccess, + &tensorAccessState, + model->model_params + ); + + if (tensorAccessState.accessedTensorData) { + if (model->model != nullptr) { + llama_model_free(model->model); + model->model = nullptr; + } - model->modelLoaded = model->model != nullptr && model->model != NULL; + throw std::runtime_error( + "Unexpected tensor data access when loading a model from source buffers with no_alloc=true" + ); + } + } + + if (model->model != nullptr) { + model->vocab = llama_model_get_vocab(model->model); + model->modelLoaded = true; + } else { + model->modelLoaded = false; + } } catch (const std::exception& e) { SetError(e.what()); } catch(...) { @@ -103,7 +156,7 @@ class AddonModelLoadModelWorker : public Napi::AsyncWorker { } } void OnOK() { - if (model->modelLoaded) { + if (model->modelLoaded && !model->model_params.no_alloc) { uint64_t modelSize = llama_model_size(model->model); adjustNapiExternalMemoryAdd(Env(), modelSize); model->loadedModelSize = modelSize; @@ -116,6 +169,9 @@ class AddonModelLoadModelWorker : public Napi::AsyncWorker { } void OnError(const Napi::Error& err) { deferred.Reject(err.Value()); + if (model->onLoadProgressEventCallbackSet) { + model->addonThreadSafeOnLoadProgressEventCallback.Release(); + } } }; @@ -153,8 +209,10 @@ class AddonModelUnloadModelWorker : public Napi::AsyncWorker { } } void OnOK() { - adjustNapiExternalMemorySubtract(Env(), model->loadedModelSize); - model->loadedModelSize = 0; + if (!model->model_params.no_alloc) { + adjustNapiExternalMemorySubtract(Env(), model->loadedModelSize); + model->loadedModelSize = 0; + } deferred.Resolve(Env().Undefined()); } @@ -225,11 +283,11 @@ class AddonModelLoadLoraWorker : public Napi::AsyncWorker { } }; -AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap(info) { +AddonModel::AddonModel(const Napi::CallbackInfo& info) : + Napi::ObjectWrap(info) { data = new AddonModelData(); model_params = llama_model_default_params(); - // Get the model path modelPath = info[0].As().Utf8Value(); if (info.Length() > 1 && info[1].IsObject()) { @@ -264,6 +322,10 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Value(); } + if (options.Has("noAlloc")) { + model_params.no_alloc = options.Get("noAlloc").As().Value(); + } + if (options.Has("onLoadProgress")) { auto onLoadProgressJSCallback = options.Get("onLoadProgress").As(); if (onLoadProgressJSCallback.IsFunction()) { @@ -351,6 +413,11 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrapEnv(), this); + AddonGgufMetadata* ggufMetadata = nullptr; + if (info.Length() > 0 && !info[0].IsUndefined()) { + ggufMetadata = Napi::ObjectWrap::Unwrap(info[0].As()); + if (ggufMetadata == nullptr || ggufMetadata->ggufMetadata.get() == nullptr) { + Napi::TypeError::New(info.Env(), "Invalid GGUF metadata object").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + } + + AddonModelLoadModelWorker* worker = new AddonModelLoadModelWorker(this->Env(), this, ggufMetadata); worker->Queue(); return worker->GetPromise(); } @@ -515,6 +593,48 @@ Napi::Value AddonModel::GetModelDescription(const Napi::CallbackInfo& info) { return Napi::String::New(info.Env(), model_desc, actual_length); } +Napi::Value AddonModel::GetMemoryBreakdown(const Napi::CallbackInfo& info) { + if (disposed) { + Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + if (!modelLoaded || model == nullptr) { + Napi::Error::New(info.Env(), "Model is not 
loaded").ThrowAsJavaScriptException(); + return info.Env().Undefined(); + } + + std::size_t cpuRam = 0; + std::size_t gpuVram = 0; + + for (const auto& [bufferType, size] : model->memory_breakdown()) { + if (size == 0) { + continue; + } + + if (ggml_backend_buft_is_host(bufferType)) { + cpuRam += size; + } else { + ggml_backend_dev_t device = ggml_backend_buft_get_device(bufferType); + if (device != nullptr) { + auto deviceType = ggml_backend_dev_type(device); + if (deviceType == GGML_BACKEND_DEVICE_TYPE_GPU || deviceType == GGML_BACKEND_DEVICE_TYPE_IGPU) { + gpuVram += size; + } else { + cpuRam += size; + } + } else { + cpuRam += size; + } + } + } + + Napi::Object result = Napi::Object::New(info.Env()); + result.Set("cpuRam", Napi::Number::New(info.Env(), cpuRam)); + result.Set("gpuVram", Napi::Number::New(info.Env(), gpuVram)); + return result; +} + Napi::Value AddonModel::TokenBos(const Napi::CallbackInfo& info) { if (disposed) { Napi::Error::New(info.Env(), "Model is disposed").ThrowAsJavaScriptException(); @@ -669,6 +789,7 @@ void AddonModel::init(Napi::Object exports) { InstanceMethod("getTotalSize", &AddonModel::GetTotalSize), InstanceMethod("getTotalParameters", &AddonModel::GetTotalParameters), InstanceMethod("getModelDescription", &AddonModel::GetModelDescription), + InstanceMethod("getMemoryBreakdown", &AddonModel::GetMemoryBreakdown), InstanceMethod("tokenBos", &AddonModel::TokenBos), InstanceMethod("tokenEos", &AddonModel::TokenEos), InstanceMethod("tokenNl", &AddonModel::TokenNl), diff --git a/llama/addon/AddonModel.h b/llama/addon/AddonModel.h index b6661d35..25740489 100644 --- a/llama/addon/AddonModel.h +++ b/llama/addon/AddonModel.h @@ -1,9 +1,16 @@ #pragma once + +#include +#include +#include +#include + #include "llama.h" #include "napi.h" #include "addonGlobals.h" #include "globals/addonProgress.h" + class AddonModel : public Napi::ObjectWrap { public: llama_model_params model_params; @@ -42,6 +49,7 @@ class AddonModel : public Napi::ObjectWrap { Napi::Value GetTotalSize(const Napi::CallbackInfo& info); Napi::Value GetTotalParameters(const Napi::CallbackInfo& info); Napi::Value GetModelDescription(const Napi::CallbackInfo& info); + Napi::Value GetMemoryBreakdown(const Napi::CallbackInfo& info); Napi::Value TokenBos(const Napi::CallbackInfo& info); Napi::Value TokenEos(const Napi::CallbackInfo& info); diff --git a/llama/addon/AddonSampler.cpp b/llama/addon/AddonSampler.cpp index cd65d066..9e706642 100644 --- a/llama/addon/AddonSampler.cpp +++ b/llama/addon/AddonSampler.cpp @@ -122,10 +122,6 @@ void AddonSampler::rebuildChainIfNeeded() { llama_sampler_chain_add(chain, dryRepeatPenaltySampler); } - if (grammarEvaluationState != nullptr) { - llama_sampler_chain_add(chain, grammarEvaluationState->sampler); - } - if (greedySampler != nullptr) { if (xtcSampler != nullptr) { llama_sampler_chain_add(chain, xtcSampler); @@ -174,6 +170,96 @@ void AddonSampler::acceptToken(llama_token token) { } } +void AddonSampler::sample(struct llama_context* llamaContext, int32_t batchLogitIndex, llama_token_data_array& curP, bool forceGrammar) { + setTokenCandidates(llamaContext, batchLogitIndex, curP); + + if (forceGrammar && grammarEvaluationState != nullptr && grammarEvaluationState->sampler != nullptr) { + llama_sampler_apply(grammarEvaluationState->sampler, &curP); + llama_sampler_apply(chain, &curP); + return; + } + + if (grammarEvaluationState == nullptr || grammarEvaluationState->sampler == nullptr) { + llama_sampler_apply(chain, &curP); + return; + } + + // test whether the sampled 
token would be accepted by the grammar, + // and otherwise apply the grammar first and then the rest of the chain + { + llama_sampler_apply(chain, &curP); + if (!(curP.selected >= 0 && curP.selected < (int32_t)curP.size)) { + return; + } + + llama_token_data singleTokenData = { curP.data[curP.selected].id, 1.0f, 0.0f }; + llama_token_data_array singleTokenDataArray = { &singleTokenData, 1, -1, false }; + + llama_sampler_apply(grammarEvaluationState->sampler, &singleTokenDataArray); + + const bool isValid = singleTokenData.logit != -INFINITY; + if (isValid) { + return; + } + + setTokenCandidates(llamaContext, batchLogitIndex, curP); + + llama_sampler_apply(grammarEvaluationState->sampler, &curP); + llama_sampler_apply(chain, &curP); + } +} + +void AddonSampler::setTokenCandidates(struct llama_context* llamaContext, int32_t batchLogitIndex, llama_token_data_array& curP) { + const float* sampledProbs = llama_get_sampled_probs_ith(llamaContext, batchLogitIndex); + const float* sampledLogits = llama_get_sampled_logits_ith(llamaContext, batchLogitIndex); + const llama_token* sampledIds = llama_get_sampled_candidates_ith(llamaContext, batchLogitIndex); + + const llama_model* model = llama_get_model(llamaContext); + const llama_vocab* vocab = llama_model_get_vocab(model); + + if (sampledProbs != nullptr) { + const uint32_t sampledProbsSize = llama_get_sampled_probs_count_ith(llamaContext, batchLogitIndex); + curP.size = sampledProbsSize; + + if (tokenCandidates.size() < sampledProbsSize) { + tokenCandidates.resize(sampledProbsSize); + } + + for (uint32_t i = 0; i < sampledProbsSize; i++) { + tokenCandidates[i] = llama_token_data { sampledIds[i], sampledLogits[i], sampledProbs[i] }; + } + } else if (sampledLogits != nullptr) { + const uint32_t sampledLogitsSize = llama_get_sampled_logits_count_ith(llamaContext, batchLogitIndex); + curP.size = sampledLogitsSize; + if (tokenCandidates.size() < sampledLogitsSize) { + tokenCandidates.resize(sampledLogitsSize); + } + + for (uint32_t i = 0; i < sampledLogitsSize; i++) { + tokenCandidates[i] = llama_token_data { sampledIds[i], sampledLogits[i], 0.0f }; + } + } else { + const auto* logits = llama_get_logits_ith(llamaContext, batchLogitIndex); + if (logits != nullptr) { + const auto vocabLength = llama_vocab_n_tokens(vocab); + curP.size = vocabLength; + if (tokenCandidates.size() < vocabLength) { + tokenCandidates.resize(vocabLength); + } + + for (llama_token tokenId = 0; tokenId < vocabLength; tokenId++) { + tokenCandidates[tokenId] = llama_token_data { tokenId, logits[tokenId], 0.0f }; + } + } else { + curP.size = 0; + } + } + + curP.data = tokenCandidates.data(); + curP.selected = -1; + curP.sorted = false; +} + Napi::Value AddonSampler::Dispose(const Napi::CallbackInfo& info) { dispose(); return info.Env().Undefined(); @@ -584,8 +670,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { Napi::ObjectWrap::Unwrap(config.Get("grammarEvaluationState").As()); if (grammarEvaluationState != configGrammarEvaluationState) { - freeChain(); - if (grammarEvaluationState != nullptr) { grammarEvaluationState->Unref(); grammarEvaluationState = nullptr; @@ -595,7 +679,6 @@ Napi::Value AddonSampler::ApplyConfig(const Napi::CallbackInfo& info) { grammarEvaluationState->Ref(); } } else if (grammarEvaluationState != nullptr) { - freeChain(); grammarEvaluationState->Unref(); grammarEvaluationState = nullptr; } diff --git a/llama/addon/AddonSampler.h b/llama/addon/AddonSampler.h index 3218b11d..77735d04 100644 --- a/llama/addon/AddonSampler.h +++ 
b/llama/addon/AddonSampler.h @@ -64,6 +64,8 @@ class AddonSampler : public Napi::ObjectWrap { void freeChain(); void rebuildChainIfNeeded(); void acceptToken(llama_token token); + void sample(struct llama_context* llamaContext, int32_t batchLogitIndex, llama_token_data_array& curP, bool forceGrammar); + void setTokenCandidates(struct llama_context* llamaContext, int32_t batchLogitIndex, llama_token_data_array& curP); Napi::Value Dispose(const Napi::CallbackInfo& info); Napi::Value ApplyConfig(const Napi::CallbackInfo& info); diff --git a/llama/addon/addon.cpp b/llama/addon/addon.cpp index eb1f35bf..b7642629 100644 --- a/llama/addon/addon.cpp +++ b/llama/addon/addon.cpp @@ -1,4 +1,5 @@ #include "addonGlobals.h" +#include "AddonGgufMetadata.h" #include "AddonModel.h" #include "AddonModelLora.h" #include "AddonGrammar.h" @@ -15,7 +16,7 @@ #include #include -std::mutex backendMutex; + std::mutex backendMutex; bool backendInitialized = false; bool backendDisposed = false; @@ -51,7 +52,7 @@ Napi::Value addonGetSupportsMlock(const Napi::CallbackInfo& info) { } Napi::Value addonGetMathCores(const Napi::CallbackInfo& info) { - return Napi::Number::New(info.Env(), cpu_get_num_math()); + return Napi::Number::New(info.Env(), common_cpu_get_num_math()); } Napi::Value addonGetBlockSizeForGgmlType(const Napi::CallbackInfo& info) { @@ -236,7 +237,7 @@ Napi::Value addonSetNuma(const Napi::CallbackInfo& info) { } Napi::Value markLoaded(const Napi::CallbackInfo& info) { - static std::atomic_bool loaded = false; + static std::atomic_bool loaded(false); return Napi::Boolean::New(info.Env(), loaded.exchange(true)); } @@ -312,6 +313,7 @@ Napi::Object registerCallback(Napi::Env env, Napi::Object exports) { Napi::PropertyDescriptor::Function("init", addonInit), Napi::PropertyDescriptor::Function("dispose", addonDispose), }); + AddonGgufMetadata::init(exports); AddonModel::init(exports); AddonModelLora::init(exports); AddonGrammar::init(exports); diff --git a/llama/addon/addonGlobals.h b/llama/addon/addonGlobals.h index 1a4dd8d1..5edffa37 100644 --- a/llama/addon/addonGlobals.h +++ b/llama/addon/addonGlobals.h @@ -1,6 +1,7 @@ #pragma once #include "napi.h" +class AddonGgufMetadata; class AddonModel; class AddonModelLora; class AddonModelData; diff --git a/llama/addon/globals/getGpuInfo.cpp b/llama/addon/globals/getGpuInfo.cpp index 34c0a94a..a655b867 100644 --- a/llama/addon/globals/getGpuInfo.cpp +++ b/llama/addon/globals/getGpuInfo.cpp @@ -121,6 +121,22 @@ std::pair getGpuDevice() { } Napi::Value getGpuType(const Napi::CallbackInfo& info) { + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + ggml_backend_reg_t backend = ggml_backend_reg_get(i); + const auto backendName = std::string(ggml_backend_reg_name(backend)); + + if (backendName == "MTL" || backendName == "Metal") { + return Napi::String::New(info.Env(), "metal"); + } else if (backendName == "Vulkan") { + return Napi::String::New(info.Env(), "vulkan"); + } + + // else if ( + // backendName == "CUDA" || backendName == "ROCm" || backendName == "MUSA") { + // return Napi::String::New(info.Env(), "cuda"); + // } + } + const auto gpuDeviceRes = getGpuDevice(); const auto device = gpuDeviceRes.first; const auto deviceType = gpuDeviceRes.second; diff --git a/llama/cmake/addVariantSuffix.cmake b/llama/cmake/addVariantSuffix.cmake index 064c5b62..53b5e881 100644 --- a/llama/cmake/addVariantSuffix.cmake +++ b/llama/cmake/addVariantSuffix.cmake @@ -1,5 +1,5 @@ function(addVariantSuffix originalTarget variantSuffix) - if (NOT TARGET ${originalTarget} OR 
${variantSuffix} STREQUAL "") + if (NOT TARGET ${originalTarget} OR "${variantSuffix}" STREQUAL "") return() endif() diff --git a/llama/patches/PR-22341.diff b/llama/patches/PR-22341.diff new file mode 100644 index 00000000..33956297 --- /dev/null +++ b/llama/patches/PR-22341.diff @@ -0,0 +1,271 @@ +diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h +index 02d5f221c03..387f2a07d9c 100644 +--- a/ggml/include/gguf.h ++++ b/ggml/include/gguf.h +@@ -76,10 +76,16 @@ extern "C" { + struct ggml_context ** ctx; + }; + ++ // callback to simulate or wrap a FILE pointer: ++ // - by default, read up to `len` bytes at `offset` into `output` and return the number of bytes read ++ // - if called with `len == 0`, seek/synchronize to `offset` without reading, return 0 on success, non-zero for failure ++ typedef size_t (*gguf_reader_callback_t)(void * userdata, void * output, uint64_t offset, size_t len); ++ + GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); +- //GGML_API struct gguf_context * gguf_init_from_buffer(..); ++ GGML_API struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params); ++ GGML_API struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params); + + GGML_API void gguf_free(struct gguf_context * ctx); + +diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp +index ab3cc974867..be35bde9a4c 100644 +--- a/ggml/src/gguf.cpp ++++ b/ggml/src/gguf.cpp +@@ -228,9 +228,17 @@ struct gguf_context { + }; + + struct gguf_reader { +- gguf_reader(FILE * file) : file(file) { +- // read the remaining bytes once and update on each read +- nbytes_remain = file_remain(file); ++ gguf_reader( ++ gguf_reader_callback_t callback, ++ void * userdata, ++ size_t max_chunk_read, ++ uint64_t data_offset = 0, ++ uint64_t nbytes_remain = 0) ++ : callback(callback), ++ userdata(userdata), ++ max_chunk_read(max_chunk_read), ++ data_offset(data_offset), ++ nbytes_remain(nbytes_remain) { + } + + // helper for remaining bytes in a file +@@ -257,12 +265,10 @@ struct gguf_reader { + template + bool read(T & dst) const { + const size_t size = sizeof(dst); +- if (nbytes_remain < size) { ++ if (size > nbytes_remain) { + return false; + } +- const size_t nread = fread(&dst, 1, size, file); +- nbytes_remain -= nread; +- return nread == size; ++ return read_raw(&dst, size) == size; + } + + template +@@ -344,24 +350,75 @@ struct gguf_reader { + return false; + } + dst.resize(static_cast(size)); +- const size_t nread = fread(dst.data(), 1, size, file); +- nbytes_remain -= nread; +- return nread == size; ++ return read_raw(dst.data(), static_cast(size)) == size; + } + + bool read(void * dst, const size_t size) const { + if (size > nbytes_remain) { + return false; + } +- const size_t nread = fread(dst, 1, size, file); +- nbytes_remain -= nread; +- return nread == size; ++ return read_raw(dst, size) == size; ++ } ++ ++ uint64_t tell() const { ++ return data_offset; ++ } ++ ++ bool seek(uint64_t absolute_offset) const { ++ const uint64_t end_offset = uint64_t(data_offset) + nbytes_remain; ++ if (absolute_offset > end_offset) { ++ return false; ++ } ++ ++ if (absolute_offset != data_offset && callback(userdata, nullptr, absolute_offset, 0) != 0) { ++ return false; 
++ } ++ ++ data_offset = absolute_offset; ++ nbytes_remain = end_offset - absolute_offset; ++ ++ return true; + } + + private: +- FILE * file; ++ size_t read_raw(void * dst, size_t size) const { ++ if (callback == nullptr || size == 0) { ++ return 0; ++ } ++ ++ uint8_t * data = static_cast(dst); ++ size_t total_nread = 0; ++ bool reached_eof = false; + +- mutable uint64_t nbytes_remain; ++ while (total_nread < size) { ++ const size_t chunk = std::min(max_chunk_read, size - total_nread); ++ if (data_offset + total_nread < data_offset) { ++ break; ++ } ++ const size_t nread = callback(userdata, static_cast(data + total_nread), data_offset + total_nread, chunk); ++ total_nread += nread; ++ if (nread != chunk) { ++ reached_eof = true; ++ break; ++ } ++ } ++ ++ data_offset += total_nread; ++ GGML_ASSERT(total_nread <= nbytes_remain); ++ nbytes_remain -= total_nread; ++ ++ if (reached_eof) { ++ nbytes_remain = 0; ++ } ++ ++ return total_nread; ++ } ++ ++ gguf_reader_callback_t callback = nullptr; ++ void * userdata = nullptr; ++ size_t max_chunk_read = 0; ++ mutable uint64_t data_offset = 0; ++ mutable uint64_t nbytes_remain = 0; + }; + + struct gguf_context * gguf_init_empty(void) { +@@ -394,12 +451,7 @@ bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vectorinfo.size()) == n_tensors); + + // we require the data section to be aligned, so take into account any padding +- if (gguf_fseek(file, GGML_PAD(gguf_ftell(file), ctx->alignment), SEEK_SET) != 0) { ++ if (!gr.seek(GGML_PAD(gr.tell(), ctx->alignment))) { + GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__); + gguf_free(ctx); + return nullptr; + } + + // store the current file offset - this is where the data section starts +- ctx->offset = gguf_ftell(file); ++ ctx->offset = gr.tell(); + + // compute the total size of the data section, taking into account the alignment + { +@@ -844,6 +896,93 @@ struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_para + return ctx; + } + ++struct gguf_context * gguf_init_from_callback(gguf_reader_callback_t callback, void * userdata, size_t max_chunk_read, uint64_t max_expected_size, struct gguf_init_params params) { ++ if (callback == nullptr || max_chunk_read == 0) { ++ return nullptr; ++ } ++ ++ const struct gguf_reader gr(callback, userdata, max_chunk_read, 0, max_expected_size); ++ return gguf_init_from_reader(gr, params); ++} ++ ++struct gguf_file_reader { ++ FILE * file; ++ uint64_t offset; ++}; ++ ++static size_t gguf_file_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) { ++ gguf_file_reader & reader = *static_cast(userdata); ++ ++ if (reader.offset != offset) { ++ if (offset > INT64_MAX || gguf_fseek(reader.file, static_cast(offset), SEEK_SET) != 0) { ++ return len == 0 ? 
1 : 0; ++ } ++ ++ reader.offset = offset; ++ } ++ ++ if (len == 0) { ++ return 0; ++ } ++ ++ const size_t nread = fread(static_cast(output), 1, len, reader.file); ++ reader.offset += nread; ++ return nread; ++} ++ ++struct gguf_context * gguf_init_from_file_ptr(FILE * file, struct gguf_init_params params) { ++ if (!file) { ++ return nullptr; ++ } ++ ++ const int64_t cur = gguf_ftell(file); ++ if (cur < 0) { ++ return nullptr; ++ } ++ ++ gguf_file_reader reader = { ++ /*.file = */ file, ++ /*.offset = */ static_cast(cur), ++ }; ++ const struct gguf_reader gr(gguf_file_reader_callback, &reader, SIZE_MAX, reader.offset, gguf_reader::file_remain(file)); ++ return gguf_init_from_reader(gr, params); ++} ++ ++struct gguf_buffer_reader { ++ const uint8_t * data; ++ size_t size; ++}; ++ ++static size_t gguf_buffer_reader_callback(void * userdata, void * output, uint64_t offset, size_t len) { ++ const gguf_buffer_reader & reader = *static_cast(userdata); ++ ++ if (offset > reader.size) { ++ return len == 0 ? 1 : 0; ++ } ++ ++ if (len == 0 || offset + len > reader.size) { ++ return 0; ++ } ++ ++ const size_t data_offset = static_cast(offset); ++ const size_t nread = std::min(len, reader.size - data_offset); ++ memcpy(static_cast(output), reader.data + data_offset, nread); ++ return nread; ++} ++ ++struct gguf_context * gguf_init_from_buffer(const void * data, size_t size, struct gguf_init_params params) { ++ if (data == nullptr || size == 0) { ++ return nullptr; ++ } ++ ++ gguf_buffer_reader reader = { ++ /*.data = */ static_cast(data), ++ /*.size = */ size, ++ }; ++ const struct gguf_reader gr(gguf_buffer_reader_callback, &reader, SIZE_MAX, 0, size); ++ return gguf_init_from_reader(gr, params); ++} ++ + struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { + FILE * file = ggml_fopen(fname, "rb"); + diff --git a/llama/patches/PR-22566.diff b/llama/patches/PR-22566.diff new file mode 100644 index 00000000..af37cf0d --- /dev/null +++ b/llama/patches/PR-22566.diff @@ -0,0 +1,45 @@ +diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp +index 4e65a45a50d..2de7e7bcd1f 100644 +--- a/src/llama-model-loader.cpp ++++ b/src/llama-model-loader.cpp +@@ -697,7 +697,9 @@ llama_model_loader::llama_model_loader( + } + + n_kv = gguf_get_n_kv(metadata); +- n_tensors = weights_map.size(); ++ n_tensors = files.empty() ++ ? 
gguf_get_n_tensors(metadata) ++ : weights_map.size(); + + fver = (enum llama_fver) gguf_get_version(metadata); + +@@ -1218,6 +1220,12 @@ struct ggml_tensor * llama_model_loader::create_tensor( + const int64_t tid = gguf_find_tensor(metadata, tn.str().c_str()); + if (tid != -1) { + type = gguf_get_tensor_type(metadata, tid); ++ } else if (no_alloc) { ++ if (flags & TENSOR_NOT_REQUIRED) { ++ return nullptr; ++ } ++ ++ throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str())); + } + + // for tensors that are not required some of the dimensions can be invalid: +@@ -1243,6 +1251,16 @@ struct ggml_tensor * llama_model_loader::create_tensor( + ggml_backend_buffer_type_t buft = buft_for_tensor(&t_meta); + GGML_ASSERT(buft != nullptr); + ggml_context * ctx = ctx_for_buft(buft); ++ ++ if (flags & TENSOR_DUPLICATED) { ++ ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str()); ++ if (t) { ++ return t; ++ } ++ } else { ++ n_created++; ++ } ++ + ggml_tensor * ret = ggml_dup_tensor(ctx, &t_meta); + ggml_set_name(ret, tn.str().c_str()); + return ret; diff --git a/llama/patches/PR-22742.diff b/llama/patches/PR-22742.diff new file mode 100644 index 00000000..a7220d54 --- /dev/null +++ b/llama/patches/PR-22742.diff @@ -0,0 +1,11 @@ +diff --git a/src/llama-model.cpp b/src/llama-model.cpp +index 9a5802e3242..c14256389d8 100644 +--- a/src/llama-model.cpp ++++ b/src/llama-model.cpp +@@ -287,5 +287,5 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params + default: +- GGML_ABORT("unimplemented model class"); ++ throw std::runtime_error(std::string("unsupported model architecture: ") + llm_arch_name(arch)); + } + + } diff --git a/src/bindings/AddonTypes.ts b/src/bindings/AddonTypes.ts index 294749d5..70f9a22c 100644 --- a/src/bindings/AddonTypes.ts +++ b/src/bindings/AddonTypes.ts @@ -1,40 +1,48 @@ import {Token} from "../types.js"; import {LlamaNuma} from "./types.js"; +export type AddonModelParams = { + gpuLayers?: number, + vocabOnly?: boolean, + noAlloc?: boolean, + useMmap?: boolean, + useDirectIo?: boolean, + useMlock?: boolean, + checkTensors?: boolean, + overridesList?: Array<[key: string, value: number | bigint | boolean | string, type: 0 | 1 | undefined]> +}; + +export type AddonContextParams = { + contextSize?: number, + batchSize?: number, + sequences?: number, + flashAttention?: boolean | "auto", + logitsAll?: boolean, + embeddings?: boolean, + ranking?: boolean, + threads?: number, + performanceTracking?: boolean, + kvCacheKeyType?: number, + kvCacheValueType?: number, + swaFullCache?: boolean +}; export type BindingModule = { + AddonGgufMetadata: { + new (): AddonGgufMetadata + }, AddonModel: { - new (modelPath: string, params: { + new (modelPath: string, params: AddonModelParams & { addonExports?: BindingModule, - gpuLayers?: number, - vocabOnly?: boolean, - useMmap?: boolean, - useDirectIo?: boolean, - useMlock?: boolean, - checkTensors?: boolean, onLoadProgress?(loadPercentage: number): void, - hasLoadAbortSignal?: boolean, - overridesList?: Array<[key: string, value: number | bigint | boolean | string, type: 0 | 1 | undefined]> + hasLoadAbortSignal?: boolean }): AddonModel }, AddonModelLora: { new (model: AddonModel, filePath: string): AddonModelLora }, AddonContext: { - new (model: AddonModel, params: { - contextSize?: number, - batchSize?: number, - sequences?: number, - flashAttention?: boolean, - logitsAll?: boolean, - embeddings?: boolean, - ranking?: boolean, - threads?: number, - performanceTracking?: boolean, - kvCacheKeyType?: number, - 
kvCacheValueType?: number, - swaFullCache?: boolean - }): AddonContext + new (model: AddonModel, params: AddonContextParams): AddonContext }, AddonContextSequenceCheckpoint: { new (): AddonContextSequenceCheckpoint @@ -98,8 +106,13 @@ export type BindingModule = { dispose(): Promise }; +export type AddonGgufMetadata = { + init(source: Array): Promise, + dispose(): Promise +}; + export type AddonModel = { - init(): Promise, + init(source?: AddonGgufMetadata): Promise, loadLora(lora: AddonModelLora): Promise, abortActiveModelLoad(): void, dispose(): Promise, @@ -110,6 +123,10 @@ export type AddonModel = { getTotalSize(): number, getTotalParameters(): number, getModelDescription(): ModelTypeDescription, + getMemoryBreakdown(): { + cpuRam: number, + gpuVram: number + }, tokenBos(): Token, tokenEos(): Token, tokenNl(): Token, @@ -158,6 +175,10 @@ export type AddonContext = { getSequenceKvCacheMaxPosition(sequenceId: number): number, getEmbedding(inputTokensLength: number, maxVectorSize?: number): Float64Array, getStateSize(): number, + getMemoryBreakdown(): { + cpuRam: number, + gpuVram: number + }, getThreads(): number, setThreads(threads: number): void, printTimings(): void, diff --git a/src/bindings/Llama.ts b/src/bindings/Llama.ts index eee3ada6..4382f78f 100644 --- a/src/bindings/Llama.ts +++ b/src/bindings/Llama.ts @@ -41,9 +41,7 @@ export class Llama { /** @internal */ public readonly _memoryLock = {}; /** @internal */ public readonly _consts: ReturnType; /** @internal */ public readonly _vramOrchestrator: MemoryOrchestrator; - /** @internal */ public _vramPadding: MemoryReservation; /** @internal */ public readonly _ramOrchestrator: MemoryOrchestrator; - /** @internal */ public readonly _ramPadding: MemoryReservation; /** @internal */ public readonly _swapOrchestrator: MemoryOrchestrator; /** @internal */ public readonly _debug: boolean; /** @internal */ public readonly _threadsSplitter: ThreadsSplitter; @@ -78,7 +76,7 @@ export class Llama { private constructor({ bindings, bindingPath, extBackendsPath, logLevel, logger, buildType, cmakeOptions, llamaCppRelease, debug, tempDir, numa, buildGpu, - maxThreads, vramOrchestrator, vramPadding, ramOrchestrator, ramPadding, swapOrchestrator, skipLlamaInit + maxThreads, vramOrchestrator, ramOrchestrator, swapOrchestrator, skipLlamaInit }: { bindings: BindingModule, bindingPath: string, @@ -97,9 +95,7 @@ export class Llama { buildGpu: BuildGpu, maxThreads?: number, vramOrchestrator: MemoryOrchestrator, - vramPadding: MemoryReservation, ramOrchestrator: MemoryOrchestrator, - ramPadding: MemoryReservation, swapOrchestrator: MemoryOrchestrator, skipLlamaInit: boolean }) { @@ -148,9 +144,7 @@ export class Llama { this._mathCores = Math.floor(bindings.getMathCores()); this._consts = bindings.getConsts(); this._vramOrchestrator = vramOrchestrator; - this._vramPadding = vramPadding; this._ramOrchestrator = ramOrchestrator; - this._ramPadding = ramPadding; this._swapOrchestrator = swapOrchestrator; this._threadsSplitter = new ThreadsSplitter( maxThreads ?? ( @@ -299,7 +293,17 @@ export class Llama { * See `vramPadding` on `getLlama` for more information. */ public get vramPaddingSize() { - return this._vramPadding.size; + return this._vramOrchestrator.padding; + } + + /** + * RAM padding used for memory size calculations, as these calculations are not always accurate. + * This is set by default to ensure stability, but can be configured when you call `getLlama`. + * + * See `ramPadding` on `getLlama` for more information. 
+ */ + public get ramPaddingSize() { + return this._ramOrchestrator.padding; } /** @@ -359,6 +363,85 @@ export class Llama { }; } + /** + * Get the total memory usage of this Llama instance + */ + public async getLlamaMemoryUsage() { + return { + gpuVram: this._vramOrchestrator.markedMemory, + cpuRam: this._ramOrchestrator.markedMemory + }; + } + + /** + * Cap the amount of VRAM that this Llama instance is allowed to use in bytes. + * This is useful for constraining the resource usage of models and contexts created with the Llama instance. + * + * Capping to a value that's too low may cause model loads and context creations to either fail or not fully offload to VRAM, + * causing inference to be significantly slower. + * + * Setting a cap will only affect future model loads and context creations. + * + * Use with caution. + * Setting to `null` disables the cap. + * + * Defaults to `null`. + */ + public async setVramCap(bytes: number | null) { + this._ensureNotDisposed(); + if (bytes != null && bytes < 0) + throw new RangeError("VRAM cap must be a non-negative number or null"); + else if (bytes != null) + bytes = Math.floor(bytes); + + this._vramOrchestrator.memoryCap = bytes; + } + + /** + * Get the current VRAM cap in bytes. See {@link setVramCap `setVramCap`} for more information. + * + * Defaults to `null`, which means no cap is set. + */ + public getVramCap() { + return this._vramOrchestrator.memoryCap; + } + + /** + * Cap the amount of RAM that this Llama instance is allowed to use in bytes. + * This is useful for constraining the resource usage of models and contexts created with the Llama instance. + * + * Capping to a value that's too low may cause model loads and context creations to fail. + * Capping to any value will exclude swap from the resource calculations, + * so extremely large models may not load at all even if you have enough swap available. + * + * Setting a cap will only affect future model loads and context creations. + * + * Use with caution. + * Setting to `null` disables the cap. + * + * Defaults to `null`. + */ + public async setRamCap(bytes: number | null) { + this._ensureNotDisposed(); + + if (bytes != null && bytes < 0) + throw new RangeError("RAM cap must be a non-negative number or null"); + else if (bytes != null) + bytes = Math.floor(bytes); + + this._ramOrchestrator.memoryCap = bytes; + this._swapOrchestrator.memoryCap = bytes == null ? null : 0; // if RAM is capped, we can't count on swap for calculation + } + + /** + * Get the current RAM cap in bytes. See {@link setRamCap `setRamCap`} for more information. + * + * Defaults to `null`, which means no cap is set. + */ + public getRamCap() { + return this._ramOrchestrator.memoryCap; + } + public async getGpuDeviceNames() { this._ensureNotDisposed(); @@ -419,6 +502,14 @@ export class Llama { this._onAddonLog(LlamaLogLevelToAddonLogLevel.get(level) ?? 
defaultLogLevel, message + "\n"); } + /** + * Check whether a message with the given log level would be logged by the Llama instance + * @internal + */ + public _shouldLog(level: LlamaLogLevel) { + return LlamaLogLevelGreaterThanOrEqual(level, this._logLevel); + } + /** @internal */ public _createTempFilePath() { if (this._tempDir == null) @@ -576,11 +667,10 @@ export class Llama { }; }); - let resolvedRamPadding: MemoryReservation; if (ramPadding instanceof Function) - resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding((await ramOrchestrator.getMemoryState()).total)); + ramOrchestrator.padding = ramPadding((await ramOrchestrator.getMemoryState()).total); else - resolvedRamPadding = ramOrchestrator.reserveMemory(ramPadding); + ramOrchestrator.padding = ramPadding; const resolvedTempDir = tempDir === false ? undefined @@ -608,23 +698,17 @@ export class Llama { buildGpu: buildMetadata.buildOptions.gpu, vramOrchestrator, maxThreads, - vramPadding: vramOrchestrator.reserveMemory(0), ramOrchestrator, - ramPadding: resolvedRamPadding, swapOrchestrator, skipLlamaInit }); if (llama.gpu === false || vramPadding === 0) { - // do nothing since `llama._vramPadding` is already set to 0 + // do nothing since `llama._vramOrchestrator.padding` is already set to 0 } else if (vramPadding instanceof Function) { - const currentVramPadding = llama._vramPadding; - llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding((await vramOrchestrator.getMemoryState()).total)); - currentVramPadding.dispose(); + vramOrchestrator.padding = vramPadding((await vramOrchestrator.getMemoryState()).total); } else { - const currentVramPadding = llama._vramPadding; - llama._vramPadding = vramOrchestrator.reserveMemory(vramPadding); - currentVramPadding.dispose(); + vramOrchestrator.padding = vramPadding; } if (!skipLlamaInit) @@ -726,6 +810,10 @@ function getTransformedLogLevel(level: LlamaLogLevel, message: string, gpu: Buil return LlamaLogLevel.info; else if (level === LlamaLogLevel.warn && message.startsWith("llama_model_loader: direct I/O is not available, using mmap")) return LlamaLogLevel.info; + else if (level === LlamaLogLevel.warn && message.startsWith("str: cannot properly format tensor name ")) + return LlamaLogLevel.info; + else if (level === LlamaLogLevel.warn && message.startsWith("llama_kv_cache: the V embeddings have different sizes across layers and FA is not enabled - padding V cache to")) + return LlamaLogLevel.info; else if (gpu === false && level === LlamaLogLevel.warn && message.startsWith("llama_adapter_lora_init_impl: lora for '") && message.endsWith("' cannot use buft 'CPU_REPACK', fallback to CPU")) return LlamaLogLevel.info; else if (gpu === "metal" && level === LlamaLogLevel.warn && message.startsWith("ggml_metal_device_init: tensor API disabled for")) diff --git a/src/bindings/getLlama.ts b/src/bindings/getLlama.ts index f9c900fe..ae85192c 100644 --- a/src/bindings/getLlama.ts +++ b/src/bindings/getLlama.ts @@ -644,7 +644,7 @@ export async function getLlamaForOptions({ if (isGithubReleaseNeedsResolving(llamaCppInfo.release)) { const [owner, name] = defaultLlamaCppGitHubRepo.split("/"); - llamaCppInfo.release = await resolveGithubRelease(owner!, name!, llamaCppInfo.release); + llamaCppInfo.release = (await resolveGithubRelease(owner!, name!, llamaCppInfo.release)).tag; } } @@ -705,7 +705,7 @@ export async function getLlamaForOptions({ if (isGithubReleaseNeedsResolving(llamaCppInfo.release)) { const [owner, name] = defaultLlamaCppGitHubRepo.split("/"); - llamaCppInfo.release = await 
resolveGithubRelease(owner!, name!, llamaCppInfo.release); + llamaCppInfo.release = (await resolveGithubRelease(owner!, name!, llamaCppInfo.release)).tag; } } diff --git a/src/bindings/utils/MemoryOrchestrator.ts b/src/bindings/utils/MemoryOrchestrator.ts index 992f336e..61753063 100644 --- a/src/bindings/utils/MemoryOrchestrator.ts +++ b/src/bindings/utils/MemoryOrchestrator.ts @@ -3,11 +3,17 @@ import {EventRelay} from "lifecycle-utils"; export class MemoryOrchestrator { /** @internal */ private readonly _getMemoryState: () => {free: number, total: number, unifiedSize: number}; /** @internal */ private _reservedMemory: number = 0; + /** @internal */ public _markedMemory: number = 0; + /** @internal */ private _memoryCap: number | null = null; + /** @internal */ private _padding: number = 0; public readonly onMemoryReservationRelease = new EventRelay(); + public readonly onMemoryMarkingRelease = new EventRelay(); public constructor(getMemoryState: () => {free: number, total: number, unifiedSize: number}) { this._getMemoryState = getMemoryState; + + this._onMarkFinalized = this._onMarkFinalized.bind(this); } public reserveMemory(bytes: number) { @@ -19,8 +25,42 @@ export class MemoryOrchestrator { }); } + public markAllocation(bytes: number) { + this._markedMemory += bytes; + + return MemoryMarking._create(bytes, this); + } + + public set padding(bytes: number) { + this._padding = bytes; + } + + public get padding() { + return this._padding; + } + + public set memoryCap(maxBytes: number | null) { + this._memoryCap = maxBytes ?? null; + } + + public get memoryCap() { + return this._memoryCap; + } + + public get markedMemory() { + return this._markedMemory; + } + public async getMemoryState() { - const {free, total, unifiedSize} = this._getMemoryState(); + let {free, total, unifiedSize} = this._getMemoryState(); + + free = Math.max(0, free - this._padding); + + if (this._memoryCap != null) { + total = Math.min(total, this._memoryCap); + free = Math.max(0, Math.min(free, this._memoryCap, total - this._markedMemory)); + unifiedSize = Math.min(unifiedSize, this._memoryCap); + } return { free: Math.max(0, free - this._reservedMemory), @@ -28,6 +68,12 @@ export class MemoryOrchestrator { unifiedSize }; } + + /** @internal */ + public _onMarkFinalized(bytes: number) { + this._markedMemory -= bytes; + this.onMemoryMarkingRelease.dispatchEvent(); + } } export class MemoryReservation { @@ -62,3 +108,41 @@ export class MemoryReservation { return new MemoryReservation(bytes, dispose); } } + +export class MemoryMarking { + /** @internal */ private readonly _size: number; + /** @internal */ private _orchestrator?: MemoryOrchestrator; + /** @internal */ private _finalizationRegistry: FinalizationRegistry; + + private constructor(size: number, orchestrator: MemoryOrchestrator) { + this._size = size; + this._orchestrator = orchestrator; + this._finalizationRegistry = new FinalizationRegistry(orchestrator._onMarkFinalized); + this._finalizationRegistry.register(this, size); + } + + public get size(): number { + return this._size; + } + + public get disposed(): boolean { + return this._orchestrator == null; + } + + public [Symbol.dispose](): void { + this.dispose(); + } + + public dispose(): void { + if (this._orchestrator != null) { + this._orchestrator._onMarkFinalized(this._size); + this._finalizationRegistry.unregister(this); + } + + this._orchestrator = undefined; + } + + public static _create(bytes: number, orchestrator: MemoryOrchestrator): MemoryMarking { + return new MemoryMarking(bytes, 
orchestrator); + } +} diff --git a/src/bindings/utils/applyLlamaCppRepoPatches.ts b/src/bindings/utils/applyLlamaCppRepoPatches.ts new file mode 100644 index 00000000..7bc9db06 --- /dev/null +++ b/src/bindings/utils/applyLlamaCppRepoPatches.ts @@ -0,0 +1,108 @@ +import path from "path"; +import fs from "fs-extra"; +import {simpleGit} from "simple-git"; +import {GitHubClient} from "../../utils/GitHubClient.js"; +import {llamaCppDirectory, llamaCppPatchesDirectory} from "../../config.js"; +import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; + +type RepoPatch = { + filename: string, + title: string, + canSkip(repoPath: string, lastCommitDate?: Date): Promise +}; + +const patches: RepoPatch[] = [{ + // https://github.com/ggml-org/llama.cpp/pull/22341 + filename: "PR-22341.diff", + title: "ggml: `gguf_init_from_callback` and `gguf_init_from_buffer`", + async canSkip(repoPath) { + const ggufH = await fs.readFile(path.join(repoPath, "ggml", "include", "gguf.h"), "utf8"); + + return !ggufH.includes("//GGML_API struct gguf_context * gguf_init_from_buffer") && + /\n\s*GGML_API struct gguf_context\s*\*\s*gguf_init_from_buffer\s*\(const\s+void\s*\*\s*data/.test(ggufH); + } +}, { + // https://github.com/ggml-org/llama.cpp/pull/22566 + filename: "PR-22566.diff", + title: "fix: consistent memory breakdown for models loaded with `no_alloc`", + async canSkip(repoPath, lastCommitDate) { + if (await fs.pathExists(path.join(repoPath, "tests", "test-model-load-buffer.cpp"))) + return true; + + if (lastCommitDate == null) + return false; + + try { + const githubClient = new GitHubClient(); + const pullRequestStatus = await githubClient.getPullRequestStatus({ + owner: "ggml-org", + repo: "llama.cpp", + id: "22566" + }); + + if (pullRequestStatus.merged && pullRequestStatus.merged_at != null) { + const mergedAt = new Date(pullRequestStatus.merged_at); + if (+mergedAt >= +lastCommitDate) + return true; + } + } catch (err) { + // do nothing + } + + return false; + } +}, { + // https://github.com/ggml-org/llama.cpp/pull/22742 + filename: "PR-22742.diff", + title: "model: don't crash on unsupported architecture", + async canSkip(repoPath) { + const llamaModel = await fs.readFile(path.join(repoPath, "src", "llama-model.cpp"), "utf8"); + return !llamaModel.includes('GGML_ABORT("unimplemented model class");'); + } +}]; + +export function hasLlamaCppRepoPatchesToApply() { + return patches.length > 0; +} + +export async function applyLlamaCppRepoPatches(lastCommitDate?: Date, throwOnError: boolean = false) { + if (!hasLlamaCppRepoPatchesToApply()) + return; + + if (!(await fs.pathExists(llamaCppPatchesDirectory)) || !(await fs.pathExists(llamaCppDirectory))) + return; + + const git = simpleGit({baseDir: llamaCppDirectory}); + for (const patch of patches) { + const patchPath = path.join(path.resolve(llamaCppPatchesDirectory), patch.filename); + + try { + if (!(await fs.pathExists(patchPath))) { + console.warn(`Patch file "${patch.filename}" not found, skipping patch "${patch.title}"`); + continue; + } + + if (await patch.canSkip(llamaCppDirectory, lastCommitDate)) + continue; + } catch (err) { + console.warn( + getConsoleLogPrefix(), + `Failed testing whether patch "${patch.filename}": "${patch.title}" can be skipped:`, + String(err) + ); + } + + try { + await git.applyPatch(patchPath, {"--ignore-whitespace": null}); + } catch (err) { + console.error( + getConsoleLogPrefix(), + `Failed to apply patch "${patch.filename}": "${patch.title}", building llama.cpp may fail.`, + String(err) + ); + + if 
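// A hypothetical RepoPatch entry illustrating the canSkip contract defined above
// (the PR number, probed file, and marker string are made up for illustration):
const examplePatch: RepoPatch = {
    filename: "PR-00000.diff",
    title: "example: some upstream fix",
    async canSkip(repoPath) {
        // skip applying the patch when the change it introduces is already present upstream
        const source = await fs.readFile(path.join(repoPath, "src", "llama-model.cpp"), "utf8");
        return source.includes("marker the patch would have added");
    }
};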
(throwOnError) + throw err; + } + } +} diff --git a/src/bindings/utils/cloneLlamaCppRepo.ts b/src/bindings/utils/cloneLlamaCppRepo.ts index d6f3f370..49cd4d84 100644 --- a/src/bindings/utils/cloneLlamaCppRepo.ts +++ b/src/bindings/utils/cloneLlamaCppRepo.ts @@ -6,7 +6,7 @@ import which from "which"; import { defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, enableRecursiveClone, llamaCppDirectory, llamaCppDirectoryInfoFilePath } from "../../config.js"; -import {getGitBundlePathForRelease} from "../../utils/gitReleaseBundles.js"; +import {getGitBundlePathForRelease, isGitBundleCompatible} from "../../utils/gitReleaseBundles.js"; import {withLockfile} from "../../utils/withLockfile.js"; import {waitForLockfileRelease} from "../../utils/waitForLockfileRelease.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; @@ -15,6 +15,7 @@ import {isGithubReleaseNeedsResolving, resolveGithubRelease} from "../../utils/r import withStatusLogs from "../../utils/withStatusLogs.js"; import {withProgressLog} from "../../utils/withProgressLog.js"; import {logDistroInstallInstruction} from "./logDistroInstallInstruction.js"; +import {applyLlamaCppRepoPatches, hasLlamaCppRepoPatchesToApply} from "./applyLlamaCppRepoPatches.js"; type ClonedLlamaCppRepoTagFile = { tag: string, @@ -184,19 +185,26 @@ export async function ensureLlamaCppRepoIsCloned({progressLogs = true}: {progres console.log(getConsoleLogPrefix() + chalk.blue("Cloning llama.cpp")); let releaseTag = defaultLlamaCppRelease; + let releaseDate: Date | undefined = undefined; - if (isGithubReleaseNeedsResolving(releaseTag)) { + if (isGithubReleaseNeedsResolving(releaseTag) || ( + hasLlamaCppRepoPatchesToApply() && + !(await isGitBundleCompatible(githubOwner!, githubRepo!, releaseTag)) + )) { await withStatusLogs({ loading: chalk.blue("Fetching llama.cpp info"), success: chalk.blue("Fetched llama.cpp info"), fail: chalk.blue("Failed to fetch llama.cpp info"), disableLogs: !progressLogs }, async () => { - releaseTag = await resolveGithubRelease(githubOwner!, githubRepo!, releaseTag); + const release = await resolveGithubRelease(githubOwner!, githubRepo!, releaseTag); + releaseTag = release.tag; + releaseDate = release.date; }); } await cloneLlamaCppRepo(githubOwner!, githubRepo!, releaseTag, true, progressLogs); + await applyLlamaCppRepoPatches(releaseDate); } async function updateClonedLlamaCppRepoTagFile(githubOwner: string, githubRepo: string, tag: string) { diff --git a/src/chatWrappers/Gemma4ChatWrapper.ts b/src/chatWrappers/Gemma4ChatWrapper.ts new file mode 100644 index 00000000..ed1c9a34 --- /dev/null +++ b/src/chatWrappers/Gemma4ChatWrapper.ts @@ -0,0 +1,256 @@ +import {ChatWrapper, ChatWrapperJinjaMatchConfiguration} from "../ChatWrapper.js"; +import { + ChatModelFunctionCall, ChatModelFunctions, ChatModelResponse, ChatWrapperGenerateContextStateOptions, ChatWrapperGeneratedContextState, + ChatWrapperSettings +} from "../types.js"; +import {LlamaText, SpecialToken, SpecialTokensText} from "../utils/LlamaText.js"; +import {jsonDumps} from "./utils/jsonDumps.js"; + +// source: https://ai.google.dev/gemma/docs/core/prompt-formatting-gemma4 +export class Gemma4ChatWrapper extends ChatWrapper { + public readonly wrapperName: string = "Gemma 4"; + + public readonly reasoning: boolean; + public readonly keepOnlyLastThought: boolean; + + public override readonly settings: ChatWrapperSettings = { + supportsSystemMessages: true, + functions: { + call: { + optionalPrefixSpace: false, + prefix: LlamaText(new 
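// Given the call/result affixes defined below, a rendered function call and its result
// would look roughly like this (the function name and JSON payloads are illustrative):
//   <|tool_call>call:getWeather{"city": "London"}
//   response:getWeather{"temperature": 18}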
SpecialTokensText("<|tool_call>call:")), + paramsPrefix: "{", + suffix: LlamaText(new SpecialTokensText("}")), + emptyCallParamsPlaceholder: undefined + }, + result: { + prefix: LlamaText(new SpecialTokensText("response:"), "{{functionName}}", "{"), + suffix: LlamaText(new SpecialTokensText("}")) + } + }, + segments: { + reiterateStackAfterFunctionCalls: true, + thought: { + prefix: LlamaText(new SpecialTokensText("<|channel>thought\n")), + suffix: LlamaText(new SpecialTokensText("")) + } + } + }; + + public constructor(options: { + /** + * Whether to promote the model to perform reasoning. + * + * Defaults to `true`. + */ + reasoning?: boolean, + + /** + * Whether to keep only the chain of thought from the last model response. + * + * Setting this to `false` will keep all the chain of thoughts from the model responses in the context state. + * + * Defaults to `true`. + */ + keepOnlyLastThought?: boolean + } = {}) { + super(); + + const { + reasoning = true, + keepOnlyLastThought = true + } = options; + + this.reasoning = reasoning; + this.keepOnlyLastThought = keepOnlyLastThought; + } + + public override generateContextState({ + chatHistory, availableFunctions, documentFunctionParams + }: ChatWrapperGenerateContextStateOptions): ChatWrapperGeneratedContextState { + const hasFunctions = Object.keys(availableFunctions ?? {}).length > 0; + const modifiedChatHistory = chatHistory.slice(); + + let systemMessage: LlamaText = LlamaText(); + if (modifiedChatHistory[0]?.type === "system") { + systemMessage = LlamaText.fromJSON(modifiedChatHistory[0].text); + modifiedChatHistory.shift(); + } + + if (hasFunctions) + systemMessage = LlamaText([ + systemMessage, + this.generateAvailableFunctionsSystemText(availableFunctions ?? {}, {documentParams: documentFunctionParams}) + ]); + + if (this.reasoning) + systemMessage = LlamaText([ + new SpecialTokensText("<|think|>"), + systemMessage + ]); + + if (systemMessage.values.length > 0) + modifiedChatHistory.unshift({ + type: "system", + text: systemMessage.toJSON() + }); + + const contextContent: LlamaText[] = [ + LlamaText(new SpecialToken("BOS")) + ]; + + for (let i = 0; i < modifiedChatHistory.length; i++) { + const isLastItem = i === modifiedChatHistory.length - 1; + const item = modifiedChatHistory[i]; + + if (item == null) + continue; + + if (item.type === "system") + contextContent.push( + LlamaText([ + new SpecialTokensText("<|turn>system\n"), + LlamaText.fromJSON(item.text), + isLastItem + ? LlamaText([]) + : new SpecialTokensText("\n") + ]) + ); + else if (item.type === "user") + contextContent.push( + LlamaText([ + new SpecialTokensText("<|turn>user\n"), + item.text, + isLastItem + ? 
LlamaText([]) + : new SpecialTokensText("\n") + ]) + ); + else if (item.type === "model") + contextContent.push(this._getModelResponse(item.response, true, isLastItem, this.keepOnlyLastThought)); + else + void (item satisfies never); + } + + return { + contextText: LlamaText(contextContent), + stopGenerationTriggers: [ + LlamaText(new SpecialToken("EOS")), + LlamaText(new SpecialToken("EOT")), + LlamaText(new SpecialTokensText("")), + LlamaText(new SpecialTokensText("\n")), + LlamaText("<|return|>") + ] + }; + } + + public override generateAvailableFunctionsSystemText(availableFunctions: ChatModelFunctions, {documentParams = true}: { + documentParams?: boolean + }): LlamaText { + return LlamaText( + Object.entries(availableFunctions) + .map(([name, definition]) => { + return LlamaText([ + new SpecialTokensText("<|tool>"), + "declaration:", name, "{", + jsonDumps({ + description: definition.description || undefined, + parameters: documentParams + ? (definition.params || {}) + : undefined + }), + "}", new SpecialTokensText("") + ]); + }) + ); + } + + public override generateModelResponseText(modelResponse: ChatModelResponse["response"], useRawValues: boolean = true): LlamaText { + return this._getModelResponse(modelResponse, useRawValues, false, false); + } + + /** @internal */ + private _getModelResponse( + modelResponse: ChatModelResponse["response"], + useRawValues: boolean, + isLastItem: boolean, + keepOnlyLastThought: boolean + ) { + const res: LlamaText[] = [ + LlamaText(new SpecialTokensText("<|turn>model\n")) + ]; + const pendingFunctionCalls: ChatModelFunctionCall[] = []; + + const addPendingFunctions = () => { + if (pendingFunctionCalls.length === 0) + return; + + res.push(this.generateFunctionCallsAndResults(pendingFunctionCalls, useRawValues)); + + pendingFunctionCalls.length = 0; + }; + + for (let index = 0; index < modelResponse.length; index++) { + const isLastResponse = index === modelResponse.length - 1; + const response = modelResponse[index]; + + if (response == null) + continue; + else if (response === "" && (!isLastResponse || !isLastItem)) + continue; + + if (typeof response === "string") { + addPendingFunctions(); + res.push(LlamaText(response)); + } else if (response.type === "segment") { + addPendingFunctions(); + + if (response.ended && response.raw != null && useRawValues) + res.push(LlamaText.fromJSON(response.raw)); + else if (response.segmentType === "thought") { + if (keepOnlyLastThought && !isLastItem) + continue; + + res.push( + LlamaText([ + new SpecialTokensText("<|channel>thought"), + response.text, + (isLastItem && !response.ended) + ? 
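// With generateAvailableFunctionsSystemText above, each function is advertised to the
// model roughly as follows (function name and schema are illustrative):
//   <|tool>declaration:getWeather{"description": "...", "parameters": {...}}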
LlamaText([]) + : new SpecialTokensText("") + ]) + ); + } else if (response.segmentType === "comment") + continue; // unsupported + else + void (response.segmentType satisfies never); + } else if (response.type === "functionCall") { + if (response.startsNewChunk) + addPendingFunctions(); + + pendingFunctionCalls.push(response); + } else + void (response satisfies never); + } + + addPendingFunctions(); + + if (!isLastItem) + res.push(LlamaText(new SpecialTokensText("\n"))); + + return LlamaText(res); + } + + /** @internal */ + public static override _getOptionConfigurationsToTestIfCanSupersedeJinjaTemplate(): ChatWrapperJinjaMatchConfiguration { + return [ + [{}, {}], + [{reasoning: false}, {}], + [ + {}, + {}, + {additionalRenderParameters: {"enable_thinking": true}} + ] + ]; + } +} diff --git a/src/chatWrappers/utils/resolveChatWrapper.ts b/src/chatWrappers/utils/resolveChatWrapper.ts index 8cadf9e8..b1faa330 100644 --- a/src/chatWrappers/utils/resolveChatWrapper.ts +++ b/src/chatWrappers/utils/resolveChatWrapper.ts @@ -7,6 +7,7 @@ import {FalconChatWrapper} from "../FalconChatWrapper.js"; import {FunctionaryChatWrapper} from "../FunctionaryChatWrapper.js"; import {AlpacaChatWrapper} from "../AlpacaChatWrapper.js"; import {GemmaChatWrapper} from "../GemmaChatWrapper.js"; +import {Gemma4ChatWrapper} from "../Gemma4ChatWrapper.js"; import {JinjaTemplateChatWrapper, JinjaTemplateChatWrapperOptions} from "../generic/JinjaTemplateChatWrapper.js"; import {TemplateChatWrapper} from "../generic/TemplateChatWrapper.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; @@ -27,7 +28,7 @@ import type {GgufFileInfo} from "../../gguf/types/GgufFileInfoTypes.js"; export const specializedChatWrapperTypeNames = Object.freeze([ "general", "deepSeek", "qwen", "llama3.2-lightweight", "llama3.1", "llama3", "llama2Chat", "mistral", "alpacaChat", "functionary", - "chatML", "falconChat", "gemma", "harmony", "seed" + "chatML", "falconChat", "gemma4", "gemma", "harmony", "seed" ] as const); export type SpecializedChatWrapperTypeName = (typeof specializedChatWrapperTypeNames)[number]; @@ -56,6 +57,7 @@ export const chatWrappers = Object.freeze({ "functionary": FunctionaryChatWrapper, "chatML": ChatMLChatWrapper, "falconChat": FalconChatWrapper, + "gemma4": Gemma4ChatWrapper, "gemma": GemmaChatWrapper, "harmony": HarmonyChatWrapper, "seed": SeedChatWrapper, @@ -70,7 +72,8 @@ const chatWrapperToConfigType = new Map( ); const specializedChatWrapperRelatedTexts = { - "harmony": ["gpt", "gpt-oss"] + "harmony": ["gpt", "gpt-oss"], + "gemma4": ["gemma 4", "gemma-4"] } satisfies Partial>; export type BuiltInChatWrapperType = InstanceType; @@ -364,6 +367,8 @@ export function resolveChatWrapper( return createSpecializedChatWrapper(Llama3ChatWrapper); else if (includesText(modelNames, ["Mistral", "Mistral Large", "Mistral Large Instruct", "Mistral-Large", "Codestral"])) return createSpecializedChatWrapper(MistralChatWrapper); + else if (includesText(modelNames, ["Gemma 4", "Gemma-4", "gemma-4"])) + return createSpecializedChatWrapper(Gemma4ChatWrapper); else if (includesText(modelNames, ["Gemma", "Gemma 2"])) return createSpecializedChatWrapper(GemmaChatWrapper); else if (includesText(modelNames, ["gpt-oss", "Gpt Oss", "Gpt-Oss", "openai_gpt-oss", "Openai_Gpt Oss", "openai.gpt-oss", "Openai.Gpt Oss"])) @@ -381,6 +386,8 @@ export function resolveChatWrapper( return createSpecializedChatWrapper(SeedChatWrapper); else if (modelJinjaTemplate.includes("<|start|>") && 
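// Detection sketch for the additions below: a model whose Jinja template contains both
// "<|turn>" and "<|tool_call>call:" now resolves to Gemma4ChatWrapper, e.g. (assuming
// the form of resolveChatWrapper that accepts a loaded model):
// const chatWrapper = resolveChatWrapper(model);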
modelJinjaTemplate.includes("<|channel|>")) return createSpecializedChatWrapper(HarmonyChatWrapper); + else if (modelJinjaTemplate.includes("<|turn>") && modelJinjaTemplate.includes("<|tool_call>call:")) + return createSpecializedChatWrapper(Gemma4ChatWrapper); else if (modelJinjaTemplate.includes("<|im_start|>")) return createSpecializedChatWrapper(ChatMLChatWrapper); else if (modelJinjaTemplate.includes("[INST]")) @@ -430,9 +437,12 @@ export function resolveChatWrapper( return createSpecializedChatWrapper(FunctionaryChatWrapper); else if (lowercaseName === "dolphin" && splitLowercaseSubType.includes("mistral")) return createSpecializedChatWrapper(ChatMLChatWrapper); - else if (lowercaseName === "gemma") + else if (lowercaseName === "gemma") { + if (firstSplitLowercaseSubType === "4") + return createSpecializedChatWrapper(Gemma4ChatWrapper); + return createSpecializedChatWrapper(GemmaChatWrapper); - else if (splitLowercaseSubType.includes("chatml")) + } else if (splitLowercaseSubType.includes("chatml")) return createSpecializedChatWrapper(ChatMLChatWrapper); } } @@ -454,6 +464,8 @@ export function resolveChatWrapper( return createSpecializedChatWrapper(FalconChatWrapper); else if (arch === "gemma" || arch === "gemma2") return createSpecializedChatWrapper(GemmaChatWrapper); + else if (arch === "gemma4") + return createSpecializedChatWrapper(Gemma4ChatWrapper); } return null; diff --git a/src/cli/commands/ChatCommand.ts b/src/cli/commands/ChatCommand.ts index 88ec258e..55bd27e6 100644 --- a/src/cli/commands/ChatCommand.ts +++ b/src/cli/commands/ChatCommand.ts @@ -5,6 +5,7 @@ import {CommandModule} from "yargs"; import chalk from "chalk"; import fs from "fs-extra"; import prettyMilliseconds from "pretty-ms"; +import bytes from "bytes"; import {chatCommandHistoryFilePath, defaultChatSystemPrompt, documentationPageUrls} from "../../config.js"; import {getIsInDocumentationMode} from "../../state.js"; import {ReplHistory} from "../../utils/ReplHistory.js"; @@ -50,6 +51,8 @@ type ChatCommand = { kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, + maxRam?: string, + maxVram?: string, noTrimWhitespace: boolean, grammar: "text" | Parameters[1], jsonSchemaGrammarFile?: string, @@ -80,7 +83,7 @@ type ChatCommand = { numa?: LlamaNuma, meter: boolean, timing: boolean, - noMmap: boolean, + mmap?: boolean, useDirectIo: boolean, printTimings: boolean }; @@ -172,8 +175,7 @@ export const ChatCommand: CommandModule = { .option("flashAttention", { alias: "fa", type: "boolean", - default: false, - description: "Enable flash attention" + description: "Force enable flash attention. Flash attention is enabled by default when supported. 
You can force disable flash attention via `--no-fa`" }) .option("kvCacheKeyType", { alias: "kvckt", @@ -201,6 +203,16 @@ export const ChatCommand: CommandModule = { default: false, description: "Disable SWA (Sliding Window Attention) on supported models" }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for all the resources allocated by `node-llama-cpp`" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Maximum VRAM to use for all the resources allocated by `node-llama-cpp`" + }) .option("noTrimWhitespace", { type: "boolean", alias: ["noTrim"], @@ -383,10 +395,9 @@ export const ChatCommand: CommandModule = { default: false, description: "Print how long it took to generate each response" }) - .option("noMmap", { + .option("mmap", { type: "boolean", - default: false, - description: "Disable mmap (memory-mapped file) usage" + description: "Force mmap (memory-mapped file) usage. You can force disable mmap usage with `--no-mmap`. By default, mmap usage is automatically determined by `node-llama-cpp`" }) .option("useDirectIo", { type: "boolean", @@ -403,22 +414,22 @@ export const ChatCommand: CommandModule = { async handler({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, - noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, + maxRam, maxVram, noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, reasoningBudget, noHistory, - environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, + environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, mmap, useDirectIo, printTimings }) { try { await RunChat({ modelPath, header, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, contextSize, - batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, noTrimWhitespace, grammar, jsonSchemaGrammarFile, - threads, temperature, minP, topK, topP, seed, xtc, + batchSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, + noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, useDirectIo, printTimings + debug, numa, meter, timing, mmap, useDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -431,17 +442,20 @@ export const ChatCommand: CommandModule = { async function RunChat({ modelPath: modelArg, header: headerArg, gpu, systemInfo, systemPrompt, systemPromptFile, prompt, promptFile, wrapper, noJinja, - contextSize, batchSize, kvCacheKeyType,
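// Sketch of the maxRam/maxVram flag parsing used in these commands: `bytes.parse`
// turns human-readable sizes into byte counts (binary units), returning null for
// unparsable input, which the resolution below coalesces to undefined:
// bytes.parse("16GB")  === 16 * 1024 ** 3
// bytes.parse("512MB") === 512 * 1024 ** 2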
kvCacheValueType, flashAttention, swaFullCache, noTrimWhitespace, grammar: grammarArg, - jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, + contextSize, batchSize, kvCacheKeyType, kvCacheValueType, flashAttention, swaFullCache, maxRam, maxVram, + noTrimWhitespace, grammar: grammarArg, jsonSchemaGrammarFile: jsonSchemaGrammarFilePath, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, - tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionModelContextSize, debug, numa, meter, timing, mmap, useDirectIo, printTimings }: ChatCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; if (reasoningBudget === -1) reasoningBudget = undefined; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? bytes.parse(maxRam) ?? undefined : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? bytes.parse(maxVram) ?? undefined : undefined; + const headers = resolveHeaderFlag(headerArg); const trimWhitespace = !noTrimWhitespace; @@ -462,7 +476,14 @@ async function RunChat({ numa }); const logBatchSize = batchSize != null; - const useMmap = !noMmap && llama.supportsMmap; + const useMmap = !llama.supportsMmap + ? false + : typeof mmap === "boolean" + ? mmap + : "auto"; + + await llama.setVramCap(resolvedMaxVram ?? null); + await llama.setRamCap(resolvedMaxRam ?? null); const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -673,7 +694,10 @@ async function RunChat({ printBos: true, printEos: true, logBatchSize, - tokenMeterEnabled: meter + tokenMeterEnabled: meter, + resolvedMaxRam, + resolvedMaxVram, + swaFullCache }); printInfoLine({ title: "Chat", diff --git a/src/cli/commands/CompleteCommand.ts b/src/cli/commands/CompleteCommand.ts index f3fca2a9..2f1c1da3 100644 --- a/src/cli/commands/CompleteCommand.ts +++ b/src/cli/commands/CompleteCommand.ts @@ -5,6 +5,7 @@ import {CommandModule} from "yargs"; import chalk from "chalk"; import fs from "fs-extra"; import prettyMilliseconds from "pretty-ms"; +import bytes from "bytes"; import {getLlama} from "../../bindings/getLlama.js"; import { BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption, @@ -38,6 +39,8 @@ type CompleteCommand = { kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, + maxRam?: string, + maxVram?: string, threads?: number, temperature: number, minP: number, @@ -62,7 +65,7 @@ type CompleteCommand = { numa?: LlamaNuma, meter: boolean, timing: boolean, - noMmap: boolean, + mmap?: boolean, useDirectIo: boolean, printTimings: boolean }; @@ -129,8 +132,7 @@ export const CompleteCommand: CommandModule = { .option("flashAttention", { alias: "fa", type: "boolean", - default: false, - description: "Enable flash attention" + description: "Force enable flash attention. Flash attention is enabled by default when supported. 
You can force disable flash attention via `--no-fa`" }) .option("kvCacheKeyType", { alias: "kvckt", @@ -158,6 +160,16 @@ default: false, description: "Disable SWA (Sliding Window Attention) on supported models" }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for all the resources allocated by `node-llama-cpp`" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Maximum VRAM to use for all the resources allocated by `node-llama-cpp`" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -303,10 +315,9 @@ export const CompleteCommand: CommandModule = { default: false, description: "Print how long it took to generate each response" }) - .option("noMmap", { + .option("mmap", { type: "boolean", - default: false, - description: "Disable mmap (memory-mapped file) usage" + description: "Force mmap (memory-mapped file) usage. You can force disable mmap usage with `--no-mmap`. By default, mmap usage is automatically determined by `node-llama-cpp`" }) .option("useDirectIo", { type: "boolean", @@ -322,20 +333,20 @@ }, async handler({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, - flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, useDirectIo, printTimings + debug, numa, meter, timing, mmap, useDirectIo, printTimings }) { try { await RunCompletion({ modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, mmap, useDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -348,15 +359,18 @@ export const CompleteCommand: CommandModule = { async function RunCompletion({ modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, -
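// The tri-state mmap resolution repeated in each of these commands folds the optional
// --mmap / --no-mmap flags into a single loader value:
//   mmap === undefined -> "auto"  (node-llama-cpp decides)
//   mmap === true      -> true    (force-enable)
//   mmap === false     -> false   (force-disable)
// with everything forced to false when the binding reports mmap as unsupported.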
tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, mmap, useDirectIo, printTimings }: CompleteCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? bytes.parse(maxRam) ?? undefined : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? bytes.parse(maxVram) ?? undefined : undefined; + const headers = resolveHeaderFlag(headerArg); if (debug) @@ -376,7 +390,14 @@ async function RunCompletion({ numa }); const logBatchSize = batchSize != null; - const useMmap = !noMmap && llama.supportsMmap; + const useMmap = !llama.supportsMmap + ? false + : typeof mmap === "boolean" + ? mmap + : "auto"; + + await llama.setVramCap(resolvedMaxVram ?? null); + await llama.setRamCap(resolvedMaxRam ?? null); const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -552,7 +573,10 @@ async function RunCompletion({ useDirectIo, minTitleLength: "Complete".length + 1, logBatchSize, - tokenMeterEnabled: meter + tokenMeterEnabled: meter, + resolvedMaxRam, + resolvedMaxVram, + swaFullCache }); printInfoLine({ title: "Complete", diff --git a/src/cli/commands/InfillCommand.ts b/src/cli/commands/InfillCommand.ts index 7bdb7bd6..13ea8e31 100644 --- a/src/cli/commands/InfillCommand.ts +++ b/src/cli/commands/InfillCommand.ts @@ -5,6 +5,7 @@ import {CommandModule} from "yargs"; import chalk from "chalk"; import fs from "fs-extra"; import prettyMilliseconds from "pretty-ms"; +import bytes from "bytes"; import {getLlama} from "../../bindings/getLlama.js"; import { BuildGpu, LlamaLogLevel, LlamaLogLevelGreaterThan, LlamaNuma, llamaNumaOptions, nodeLlamaCppGpuOptions, parseNodeLlamaCppGpuOption, @@ -40,6 +41,8 @@ type InfillCommand = { kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, swaFullCache?: boolean, + maxRam?: string, + maxVram?: string, threads?: number, temperature: number, minP: number, @@ -64,7 +67,7 @@ type InfillCommand = { numa?: LlamaNuma, meter: boolean, timing: boolean, - noMmap: boolean, + mmap?: boolean, useDirectIo: boolean, printTimings: boolean }; @@ -139,8 +142,7 @@ export const InfillCommand: CommandModule = { .option("flashAttention", { alias: "fa", type: "boolean", - default: false, - description: "Enable flash attention" + description: "Force enable flash attention. Flash attention is enabled by default when supported. 
You can force disable flash attention via `--no-fa`" }) .option("kvCacheKeyType", { alias: "kvckt", @@ -168,6 +170,16 @@ default: false, description: "Disable SWA (Sliding Window Attention) on supported models" }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for all the resources allocated by `node-llama-cpp`" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Maximum VRAM to use for all the resources allocated by `node-llama-cpp`" + }) .option("threads", { type: "number", defaultDescription: "Number of cores that are useful for math on the current machine", @@ -313,10 +325,9 @@ export const InfillCommand: CommandModule = { default: false, description: "Print how long it took to generate each response" }) - .option("noMmap", { + .option("mmap", { type: "boolean", - default: false, - description: "Disable mmap (memory-mapped file) usage" + description: "Force mmap (memory-mapped file) usage. You can force disable mmap usage with `--no-mmap`. By default, mmap usage is automatically determined by `node-llama-cpp`" }) .option("useDirectIo", { type: "boolean", @@ -332,20 +343,20 @@ }, async handler({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, - flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize, - debug, numa, meter, timing, noMmap, useDirectIo, printTimings + debug, numa, meter, timing, mmap, useDirectIo, printTimings }) { try { await RunInfill({ modelPath, header, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, - lastTokensRepeatPenalty, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, + threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, maxTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, mmap, useDirectIo, printTimings }); } catch (err) { await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing @@ -358,14 +369,17 @@ async function RunInfill({ modelPath: modelArg, header: headerArg, gpu, systemInfo, prefix, prefixFile, suffix, suffixFile, contextSize, batchSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, threads, temperature, minP, topK, topP, seed, xtc, gpuLayers, lastTokensRepeatPenalty,
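// Cap plumbing sketch, using the method names as they appear in this diff
// (sizes are illustrative):
// const llama = await getLlama();
// await llama.setVramCap(bytes.parse("8GB"));  // cap VRAM accounted to node-llama-cpp
// await llama.setRamCap(null);                 // null lifts a previously set cap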
repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, dryRepeatPenaltyStrength, dryRepeatPenaltyBase, dryRepeatPenaltyAllowedLength, dryRepeatPenaltyLastTokens, - tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, useDirectIo, printTimings + tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, mmap, useDirectIo, printTimings }: InfillCommand) { if (contextSize === -1) contextSize = undefined; if (gpuLayers === -1) gpuLayers = undefined; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? bytes.parse(maxRam) ?? undefined : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? bytes.parse(maxVram) ?? undefined : undefined; + const headers = resolveHeaderFlag(headerArg); if (debug) @@ -385,7 +399,14 @@ async function RunInfill({ numa }); const logBatchSize = batchSize != null; - const useMmap = !noMmap && llama.supportsMmap; + const useMmap = !llama.supportsMmap + ? false + : typeof mmap === "boolean" + ? mmap + : "auto"; + + await llama.setVramCap(resolvedMaxVram ?? null); + await llama.setRamCap(resolvedMaxRam ?? null); const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, { flashAttention, @@ -574,7 +595,10 @@ async function RunInfill({ useMmap, useDirectIo, logBatchSize, - tokenMeterEnabled: meter + tokenMeterEnabled: meter, + resolvedMaxRam, + resolvedMaxVram, + swaFullCache }); printInfoLine({ title: "Infill", diff --git a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts index 5915632f..cd527ef4 100644 --- a/src/cli/commands/inspect/commands/InspectEstimateCommand.ts +++ b/src/cli/commands/inspect/commands/InspectEstimateCommand.ts @@ -2,6 +2,7 @@ import process from "process"; import {CommandModule} from "yargs"; import chalk from "chalk"; import fs from "fs-extra"; +import bytes from "bytes"; import {readGgufFileInfo} from "../../../../gguf/readGgufFileInfo.js"; import {resolveHeaderFlag} from "../../../utils/resolveHeaderFlag.js"; import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js"; @@ -33,10 +34,12 @@ type InspectEstimateCommand = { gpuLayers?: number | "max", contextSize?: number | "train", embedding?: boolean, - noMmap?: boolean, + mmap?: boolean, kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType, - swaFullCache?: boolean + swaFullCache?: boolean, + maxRam?: string, + maxVram?: string }; export const InspectEstimateCommand: CommandModule = { @@ -115,10 +118,9 @@ export const InspectEstimateCommand: CommandModule !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, default: "F16" as const, - description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors" + description: "Experimental. The type of the key for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors", + group: "Optional:" }) .option("kvCacheValueType", { alias: "kvcvt", @@ -138,24 +141,41 @@ export const InspectEstimateCommand: CommandModule !/^\d+$/i.test(key)) as (keyof typeof GgmlType)[] ] as const, default: "F16" as const, - description: "Experimental. The type of the value for the context KV cache tensors. 
Use `currentQuant` to use the same type as the current quantization of the model weights tensors" + description: "Experimental. The type of the value for the context KV cache tensors. Use `currentQuant` to use the same type as the current quantization of the model weights tensors", + group: "Optional:" }) .option("swaFullCache", { alias: "noSwa", type: "boolean", default: false, - description: "Disable SWA (Sliding Window Attention) on supported models" + description: "Disable SWA (Sliding Window Attention) on supported models", + group: "Optional:" + }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for the model and the context. If the estimated RAM usage exceeds this value, the compatibility score will be reduced. This is useful for estimating compatibility with devices that have limited RAM. You can set this to a value like `16GB` or `512MB`.", + group: "Optional:" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Experimental. Maximum VRAM to use for the model and the context. If the estimated VRAM usage exceeds this value, the compatibility score will be reduced. This is useful for estimating compatibility with devices that have limited VRAM. You can set this to a value like `8GB` or `256MB`.", + group: "Optional:" }); }, async handler({ - modelPath: ggufPath, header: headerArg, gpu, gpuLayers, contextSize: contextSizeArg, embedding, noMmap, - kvCacheKeyType, kvCacheValueType, swaFullCache + modelPath: ggufPath, header: headerArg, gpu, gpuLayers, contextSize: contextSizeArg, embedding, mmap, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram }: InspectEstimateCommand) { if (gpuLayers === -1) gpuLayers = undefined; if (gpuLayers === -2) gpuLayers = "max"; if (contextSizeArg === -1) contextSizeArg = undefined; if (contextSizeArg === -2) contextSizeArg = "train"; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? (bytes.parse(maxRam) ?? undefined) : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? (bytes.parse(maxVram) ?? undefined) : undefined; + const headers = resolveHeaderFlag(headerArg); const [resolvedModelDestination, resolvedGgufPath] = isModelUri(ggufPath) @@ -182,7 +202,14 @@ export const InspectEstimateCommand: CommandModule default: false, description: "Disable SWA (Sliding Window Attention) on supported models" }) + .option("maxRam", { + alias: ["ram"], + type: "string", + description: "Maximum RAM to use for all the resources allocated by `node-llama-cpp`" + }) + .option("maxVram", { + alias: ["vram"], + type: "string", + description: "Maximum VRAM to use for all the resources allocated by `node-llama-cpp`" + }) .option("batchSize", { alias: "b", type: "number", @@ -155,10 +168,9 @@ export const InspectMeasureCommand: CommandModule default: "vram" as const, description: "Type of memory to measure" }) - .option("noMmap", { + .option("mmap", { type: "boolean", - default: false, - description: "Disable mmap (memory-mapped file) usage" + description: "Force mmap (memory-mapped file) usage. You can force disable mmap usage with `--no-mmap`. 
By default, mmap usage is automatically determined by `node-llama-cpp`" }) .option("noDirectIo", { type: "boolean", @@ -185,14 +197,17 @@ export const InspectMeasureCommand: CommandModule }, async handler({ modelPath: ggufPath, header: headerArg, gpu, minLayers, maxLayers, minContextSize, maxContextSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, - batchSize, measures = 10, memory: measureMemoryType, noMmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, + kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, + batchSize, measures = 10, memory: measureMemoryType, mmap, noDirectIo, printHeaderBeforeEachLayer = true, evaluateText, repeatEvaluateText }: InspectMeasureCommand) { if (maxLayers === -1) maxLayers = undefined; if (maxContextSize === -1) maxContextSize = undefined; if (minLayers < 1) minLayers = 1; + const resolvedMaxRam = (typeof maxRam === "string" && maxRam !== "") ? (bytes.parse(maxRam) ?? undefined) : undefined; + const resolvedMaxVram = (typeof maxVram === "string" && maxVram !== "") ? (bytes.parse(maxVram) ?? undefined) : undefined; + const exitAfterEachMeasurement = measureMemoryType === "ram" || measureMemoryType === "all"; const headers = resolveHeaderFlag(headerArg); @@ -206,8 +221,15 @@ export const InspectMeasureCommand: CommandModule logLevel: LlamaLogLevel.error }); + await llama.setVramCap(resolvedMaxVram ?? null); + await llama.setRamCap(resolvedMaxRam ?? null); + const platform = getPlatform(); - const useMmap = !noMmap && llama.supportsMmap; + const useMmap = !llama.supportsMmap + ? false + : typeof mmap === "boolean" + ? mmap + : "auto"; const useDirectIo = !noDirectIo; const resolvedGgufPath = await resolveCommandGgufPath(ggufPath, llama, headers, { flashAttention, swaFullCache, useMmap, kvCacheKeyType, kvCacheValueType @@ -218,9 +240,11 @@ export const InspectMeasureCommand: CommandModule console.info(chalk.yellow("mmap:") + " " + ( !llama.supportsMmap ? "unsupported" - : useMmap - ? "enabled" - : "disabled" + : useMmap === "auto" + ? "auto" + : useMmap === true + ? "enabled" + : "disabled" )); if (platform !== "mac") // Direct I/O is not supported on macOS @@ -242,7 +266,18 @@ export const InspectMeasureCommand: CommandModule const totalVram = (await llama.getVramState()).total; const totalRam = os.totalmem(); - let lastGpuLayers = maxLayers ?? ggufInsights.totalLayers; + let lastGpuLayers = maxLayers ?? ( + resolvedMaxVram == null + ? ggufInsights.totalLayers + : (await ggufInsights.configurationResolver.resolveModelGpuLayersV2({ + fitContext: { + contextSize: minAllowedContextSizeInCalculations + } + }, { + useMmap, + defaultContextFlashAttention: flashAttention ?? undefined + })).gpuLayers + ); let previousContextSizeCheck: undefined | number = undefined; const resolvedKvCacheKeyType = kvCacheKeyType === "currentQuant" @@ -286,13 +321,15 @@ export const InspectMeasureCommand: CommandModule kvCacheKeyType: resolvedKvCacheKeyType, kvCacheValueType: resolvedKvCacheValueType, swaFullCache, + maxRam: resolvedMaxRam, + maxVram: resolvedMaxVram, batchSize, tests: measures, evaluateText: evaluateText == null ? undefined : evaluateText.repeat(repeatEvaluateText ?? 
1), exitAfterMeasurement: exitAfterEachMeasurement, - onInfo({gpuLayers, result}) { + async onInfo({gpuLayers, result}) { if (lastGpuLayers !== gpuLayers) { lastGpuLayers = gpuLayers; previousContextSizeCheck = undefined; @@ -336,9 +373,9 @@ export const InspectMeasureCommand: CommandModule previousContextSizeCheck = result.contextSize; hadSuccessInThisProcess = true; - const modelResourceEstimation = ggufInsights.estimateModelResourceRequirements({ + const modelResourceEstimation = await ggufInsights.estimateModelResourceRequirementsV2({ gpuLayers: lastGpuLayers, - useMmap + useMmap: result.useMmap }); const modelVramEstimation = modelResourceEstimation.gpuVram; const modelVramEstimationDiffBytes = (modelVramEstimation < result.modelVramUsage ? "-" : "") + @@ -354,7 +391,7 @@ export const InspectMeasureCommand: CommandModule const contextResourceEstimation = previousContextSizeCheck == null ? undefined - : ggufInsights.estimateContextResourceRequirements({ + : await ggufInsights.estimateContextResourceRequirementsV2({ contextSize: previousContextSizeCheck, modelGpuLayers: lastGpuLayers, flashAttention, @@ -399,7 +436,11 @@ export const InspectMeasureCommand: CommandModule type: previousContextSizeCheck == null ? "Model" : "Context", - gpuLayers: String(lastGpuLayers), + gpuLayers: String(lastGpuLayers).padEnd("Layers".length - 1, " ") + ( + result.useMmap + ? chalk.gray("M") + : " " + ), contextSize: previousContextSizeCheck != null ? String(previousContextSizeCheck) : undefined, @@ -569,10 +610,11 @@ const expectedFileName = "InspectMeasureCommand"; async function measureModel({ modelPath, useMmap, useDirectIo, gpu, tests, initialMaxContextSize, maxContextSize, minContextSize, maxGpuLayers, minGpuLayers, - flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false, onInfo + flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, maxRam, maxVram, batchSize, evaluateText, exitAfterMeasurement = false, + onInfo }: { modelPath: string, - useMmap?: boolean, + useMmap?: "auto" | boolean, useDirectIo?: boolean, gpu?: BuildGpu | "auto", tests: number, @@ -585,6 +627,8 @@ async function measureModel({ kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, + maxRam?: number, + maxVram?: number, batchSize?: number, evaluateText?: string, exitAfterMeasurement?: boolean, @@ -602,13 +646,14 @@ async function measureModel({ modelVramUsage: number, modelRamUsage: number, contextSize?: number, + useMmap: boolean, contextVramUsage?: number, contextRamUsage?: number, contextStateSize?: number, totalVramUsage: number, totalRamUsage: number } - }): void + }): void | Promise }) { if (!detectedFileName.startsWith(expectedFileName)) { console.warn( @@ -679,7 +724,7 @@ async function measureModel({ cleanup(); } - subProcess.on("message", (message: ChildToParentMessage) => { + subProcess.on("message", async (message: ChildToParentMessage) => { if (message.type === "ready") { forkSucceeded = true; subProcess.send({ @@ -697,6 +742,8 @@ async function measureModel({ kvCacheKeyType, kvCacheValueType, swaFullCache, + maxRam, + maxVram, batchSize, evaluateText, exitAfterMeasurement @@ -716,7 +763,7 @@ async function measureModel({ } else if (message.type === "error") { lastGpuLayers = message.gpuLayers; - onInfo({ + await onInfo({ gpuLayers: lastGpuLayers, result: { type: "error", @@ -727,13 +774,14 @@ async function measureModel({ } else if (message.type === "stats") { lastGpuLayers = message.gpuLayers; - onInfo({ + await 
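// The V2 estimators used above return promises (their V1 counterparts were synchronous),
// which is why onInfo becomes async here. A rough sketch:
// const estimation = await ggufInsights.estimateModelResourceRequirementsV2({
//     gpuLayers: lastGpuLayers,
//     useMmap: true
// });
// console.log(estimation.gpuVram); // estimated VRAM usage in bytes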
onInfo({ gpuLayers: message.gpuLayers, result: { type: "success", modelVramUsage: message.modelVramUsage, modelRamUsage: message.modelRamUsage, contextSize: message.contextSize, + useMmap: message.useMmap, contextVramUsage: message.contextVramUsage, contextRamUsage: message.contextRamUsage, contextStateSize: message.contextStateSize, @@ -746,7 +794,7 @@ async function measureModel({ subProcess.on("exit", (code) => { if (code !== 0 || !isPlannedExit) - onInfo({ + void onInfo({ gpuLayers: lastGpuLayers, result: { type: "crash", @@ -759,7 +807,7 @@ async function measureModel({ if (subProcess.killed || subProcess.exitCode != null) { if (subProcess.exitCode !== 0 || !isPlannedExit) - onInfo({ + void onInfo({ gpuLayers: lastGpuLayers, result: { type: "crash", @@ -857,6 +905,7 @@ async function runTestWorkerLogic() { modelVramUsage, modelRamUsage, contextSize: context.contextSize, + useMmap: model.useMmap, contextVramUsage: postContextVramUsage - preContextVramUsage, contextRamUsage: postContextRamUsage - preContextRamUsage, contextStateSize: context.stateSize, @@ -895,7 +944,7 @@ async function runTestWorkerLogic() { modelPath, useMmap, useDirectIo, gpuLayers, tests, startContextSize, maxContextSize, minContextSize, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, batchSize, evaluateText, exitAfterMeasurement = false }: { - modelPath: string, useMmap?: boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number, + modelPath: string, useMmap?: "auto" | boolean, useDirectIo?: boolean, gpuLayers: number, tests: number, startContextSize?: number, maxContextSize?: number, minContextSize?: number, flashAttention?: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, batchSize?: number, evaluateText?: string, exitAfterMeasurement?: boolean @@ -920,6 +969,7 @@ async function runTestWorkerLogic() { sendInfoBack({ type: "stats", gpuLayers: model.gpuLayers, + useMmap: model.useMmap, modelVramUsage: postModelVramUsage - preModelVramUsage, modelRamUsage: postModelRamUsage - preModelRamUsage, totalVramUsage: postModelVramUsage, @@ -975,6 +1025,9 @@ async function runTestWorkerLogic() { continue; } + await llama.setVramCap(message.maxVram ?? null); + await llama.setRamCap(message.maxRam ?? 
null); + const measurementsDone = await testWithGpuLayers({ modelPath: message.modelPath, useMmap: message.useMmap, @@ -1077,7 +1130,7 @@ function getNextItemInCheckContextSizesPlan(plan: number[], currentSize: number) type ParentToChildMessage = { type: "start", modelPath: string, - useMmap?: boolean, + useMmap?: "auto" | boolean, useDirectIo?: boolean, tests: number, maxGpuLayers: number, @@ -1086,6 +1139,8 @@ type ParentToChildMessage = { kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, + maxRam?: number, + maxVram?: number, batchSize?: number, initialMaxContextSize?: number, maxContextSize?: number, @@ -1104,6 +1159,7 @@ type ChildToParentMessage = { modelVramUsage: number, modelRamUsage: number, contextSize?: number, + useMmap: boolean, contextVramUsage?: number, contextRamUsage?: number, contextStateSize?: number, diff --git a/src/cli/commands/source/commands/DownloadCommand.ts b/src/cli/commands/source/commands/DownloadCommand.ts index d4e42bab..9333aebf 100644 --- a/src/cli/commands/source/commands/DownloadCommand.ts +++ b/src/cli/commands/source/commands/DownloadCommand.ts @@ -13,7 +13,7 @@ import {setBinariesGithubRelease} from "../../../../bindings/utils/binariesGithu import {downloadCmakeIfNeeded} from "../../../../utils/cmake.js"; import withStatusLogs from "../../../../utils/withStatusLogs.js"; import {getIsInDocumentationMode} from "../../../../state.js"; -import {getGitBundlePathForRelease, unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle} from "../../../../utils/gitReleaseBundles.js"; +import {getGitBundlePathForRelease, isGitBundleCompatible, unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle} from "../../../../utils/gitReleaseBundles.js"; import {cloneLlamaCppRepo} from "../../../../bindings/utils/cloneLlamaCppRepo.js"; import {getPlatform} from "../../../../bindings/utils/getPlatform.js"; import {resolveCustomCmakeOptions} from "../../../../bindings/utils/resolveCustomCmakeOptions.js"; @@ -26,6 +26,7 @@ import {getConsoleLogPrefix} from "../../../../utils/getConsoleLogPrefix.js"; import {getPrettyBuildGpuName} from "../../../../bindings/consts.js"; import {getPlatformInfo} from "../../../../bindings/utils/getPlatformInfo.js"; import {withCliCommandDescriptionDocsUrl} from "../../../utils/withCliCommandDescriptionDocsUrl.js"; +import {applyLlamaCppRepoPatches, hasLlamaCppRepoPatchesToApply} from "../../../../bindings/utils/applyLlamaCppRepoPatches.js"; type DownloadCommandArgs = { repo?: string, @@ -147,14 +148,20 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) { let githubReleaseTag: string | null = (useBundle && (await getGitBundlePathForRelease(githubOwner, githubRepo, release)) != null) ? 
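// Measurement IPC sketch, based on the message types above: the parent forks this
// same file, sends one "start" message (now carrying maxRam/maxVram), and the worker
// replies with "stats" messages that include the resolved useMmap flag, e.g.:
// subProcess.send({type: "start", modelPath, tests: 10, maxGpuLayers, minGpuLayers,
//     useMmap: "auto", maxVram: resolvedMaxVram});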
release : null; + let githubReleaseDate: Date | undefined = undefined; - if (githubReleaseTag == null) + if (githubReleaseTag == null || ( + hasLlamaCppRepoPatchesToApply() && + !(await isGitBundleCompatible(githubOwner, githubRepo, githubReleaseTag)) + )) await withOra({ loading: chalk.blue("Fetching llama.cpp info"), success: chalk.blue("Fetched llama.cpp info"), fail: chalk.blue("Failed to fetch llama.cpp info") }, async () => { - githubReleaseTag = await resolveGithubRelease(githubOwner, githubRepo, release); + const githubRelease = await resolveGithubRelease(githubOwner, githubRepo, release); + githubReleaseTag = githubRelease.tag; + githubReleaseDate = githubRelease.date; }); await clearTempFolder(); @@ -170,6 +177,14 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) { await cloneLlamaCppRepo(githubOwner, githubRepo, githubReleaseTag!, useBundle); + if (isCI && updateBinariesReleaseMetadataAndSaveGitBundle) { + await setBinariesGithubRelease(githubReleaseTag!); + await unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle(); + + await applyLlamaCppRepoPatches(githubReleaseDate, true); + } else + await applyLlamaCppRepoPatches(githubReleaseDate, false); + if (!skipBuild) { for (let i = 0; i < buildGpusToTry.length; i++) { const gpuToTry = buildGpusToTry[i]; @@ -263,11 +278,6 @@ export async function DownloadLlamaCppCommand(args: DownloadCommandArgs) { logBinaryUsageExampleToConsole(buildOptions, gpu !== "auto", true); } - if (isCI && updateBinariesReleaseMetadataAndSaveGitBundle) { - await setBinariesGithubRelease(githubReleaseTag!); - await unshallowAndSquashCurrentRepoAndSaveItAsReleaseBundle(); - } - console.log(); console.log(); console.log(`${chalk.yellow("Repo:")} ${repo}`); diff --git a/src/cli/utils/interactivelyAskForModel.ts b/src/cli/utils/interactivelyAskForModel.ts index bd1cfb71..2a118ed1 100644 --- a/src/cli/utils/interactivelyAskForModel.ts +++ b/src/cli/utils/interactivelyAskForModel.ts @@ -60,7 +60,7 @@ export async function interactivelyAskForModel({ modelsDirectory, allowLocalModels = true, downloadIntent = true, - flashAttention = false, + flashAttention = "auto", swaFullCache = false, useMmap, kvCacheKeyType, @@ -70,9 +70,9 @@ export async function interactivelyAskForModel({ modelsDirectory?: string, allowLocalModels?: boolean, downloadIntent?: boolean, - flashAttention?: boolean, + flashAttention?: "auto" | boolean, swaFullCache?: boolean, - useMmap?: boolean, + useMmap?: "auto" | boolean, kvCacheKeyType?: "currentQuant" | GgmlType, kvCacheValueType?: "currentQuant" | GgmlType }): Promise { @@ -126,7 +126,7 @@ export async function interactivelyAskForModel({ progressUpdater.setProgress(readItems / ggufFileNames.length, renderProgress()); const compatibilityScore = await ggufInsights?.configurationResolver.scoreModelConfigurationCompatibility({ - flashAttention: flashAttention && ggufInsights?.flashAttentionSupported, + flashAttention, swaFullCache, useMmap, kvCacheKeyType: kvCacheKeyType === "currentQuant" @@ -233,7 +233,7 @@ export async function interactivelyAskForModel({ try { while (true) { - const minWidth = Math.min(80 + (flashAttention ? 26 : 0), process.stdout.columns - 1); + const minWidth = Math.min(80 + (flashAttention !== false ? 
26 : 0), process.stdout.columns - 1); const selectedItem = await basicChooseFromListConsoleInteraction({ title(item, rerender) { const title = chalk.bold("Select a model:") + " "; @@ -258,13 +258,19 @@ export async function interactivelyAskForModel({ chalk.dim("(" + toBytes(vramState.used) + "/" + toBytes(vramState.total) + ")") + " " ) + ( - !flashAttention + flashAttention === false ? "" : ( " " + chalk.bgGray( " " + - chalk.yellow("Flash attention:") + " " + "enabled" + + chalk.yellow("Flash attention:") + " " + ( + flashAttention === "auto" + ? "auto" + : flashAttention === true + ? "enabled" + : "disabled" + ) + " " ) ) @@ -424,8 +430,8 @@ async function askForModelUriOrPath(allowLocalModels: boolean): Promise void, abortSignal: AbortSignal, llama: Llama, flashAttention: boolean, - swaFullCache: boolean, useMmap: boolean | undefined, + item: ModelOption, focused: boolean, rerender: () => void, abortSignal: AbortSignal, llama: Llama, flashAttention: "auto" | boolean, + swaFullCache: boolean, useMmap: "auto" | boolean | undefined, kvCacheKeyType?: "currentQuant" | GgmlType, kvCacheValueType?: "currentQuant" | GgmlType ) { if (item.type === "localModel") { @@ -584,9 +590,9 @@ async function selectFileForModelRecommendation({ llama: Llama, abortSignal: AbortSignal, rerenderOption(): void, - flashAttention: boolean, + flashAttention: "auto" | boolean, swaFullCache: boolean, - useMmap?: boolean, + useMmap?: "auto" | boolean, kvCacheKeyType?: "currentQuant" | GgmlType, kvCacheValueType?: "currentQuant" | GgmlType }) { diff --git a/src/cli/utils/printCommonInfoLines.ts b/src/cli/utils/printCommonInfoLines.ts index 47a81c42..42e4eb5d 100644 --- a/src/cli/utils/printCommonInfoLines.ts +++ b/src/cli/utils/printCommonInfoLines.ts @@ -15,17 +15,23 @@ export async function printCommonInfoLines({ logBatchSize = false, tokenMeterEnabled = false, printBos = false, - printEos = false + printEos = false, + resolvedMaxRam, + resolvedMaxVram, + swaFullCache }: { context: LlamaContext, draftContext?: LlamaContext, minTitleLength?: number, - useMmap?: boolean, + useMmap?: "auto" | boolean, useDirectIo?: boolean, logBatchSize?: boolean, tokenMeterEnabled?: boolean, printBos?: boolean, - printEos?: boolean + printEos?: boolean, + resolvedMaxRam?: number, + resolvedMaxVram?: number, + swaFullCache?: boolean }) { const platform = getPlatform(); const llama = context._llama; @@ -62,6 +68,28 @@ export async function printCommonInfoLines({ }] }); } + if (resolvedMaxRam != null || resolvedMaxVram != null || swaFullCache === true) + printInfoLine({ + title: "Options", + padTitle: padTitle, + info: [{ + show: resolvedMaxRam != null, + title: "Max RAM", + value: toBytes(resolvedMaxRam ?? 0) + }, { + show: resolvedMaxVram != null, + title: "Max VRAM", + value: toBytes(resolvedMaxVram ?? 0) + }, { + show: swaFullCache === true, + title: "SWA", + value: model.fileInsights.swaSize == null + ? "unsupported" + : swaFullCache === true + ? "disabled" + : "enabled" + }] + }); printInfoLine({ title: "Model", padTitle: padTitle, @@ -81,9 +109,13 @@ export async function printCommonInfoLines({ title: "mmap", value: !model._llama.supportsMmap ? "unsupported" - : (useMmap || useMmap == null) + : useMmap === true ? "enabled" - : "disabled" + : (useMmap === "auto" || useMmap == null) + ? model.useMmap + ? 
"auto (enabled)" + : "auto (disabled)" + : "disabled" }, { title: "Direct I/O", show: platform !== "mac", // Direct IO is not supported on macOS @@ -119,9 +151,12 @@ export async function printCommonInfoLines({ title: "Batch size", value: context.batchSize.toLocaleString("en-US") }, { - show: context.flashAttention, title: "Flash attention", - value: "enabled" + value: context.flashAttention === "auto" + ? "auto" + : context.flashAttention === true + ? "enabled" + : "disabled" }, { show: tokenMeterEnabled, title: "Token meter", @@ -178,9 +213,12 @@ export async function printCommonInfoLines({ title: "Batch size", value: draftContext.batchSize.toLocaleString("en-US") }, { - show: draftContext.flashAttention, title: "Flash attention", - value: "enabled" + value: draftContext.flashAttention === "auto" + ? "auto" + : draftContext.flashAttention === true + ? "enabled" + : "disabled" }, { show: tokenMeterEnabled, title: "Token meter", diff --git a/src/cli/utils/resolveCommandGgufPath.ts b/src/cli/utils/resolveCommandGgufPath.ts index a11cfa96..5af53393 100644 --- a/src/cli/utils/resolveCommandGgufPath.ts +++ b/src/cli/utils/resolveCommandGgufPath.ts @@ -14,10 +14,10 @@ import {getReadablePath} from "./getReadablePath.js"; import {interactivelyAskForModel} from "./interactivelyAskForModel.js"; export async function resolveCommandGgufPath(ggufPath: string | undefined, llama: Llama, fetchHeaders?: Record, { - targetDirectory = cliModelsDirectory, flashAttention = false, swaFullCache = false, useMmap, consoleTitle = "File", + targetDirectory = cliModelsDirectory, flashAttention = "auto", swaFullCache = false, useMmap, consoleTitle = "File", kvCacheKeyType, kvCacheValueType }: { - targetDirectory?: string, flashAttention?: boolean, swaFullCache?: boolean, useMmap?: boolean, consoleTitle?: string, + targetDirectory?: string, flashAttention?: "auto" | boolean, swaFullCache?: boolean, useMmap?: "auto" | boolean, consoleTitle?: string, kvCacheKeyType?: "currentQuant" | keyof typeof GgmlType, kvCacheValueType?: "currentQuant" | keyof typeof GgmlType } = {}) { diff --git a/src/config.ts b/src/config.ts index 5337d012..617a76af 100644 --- a/src/config.ts +++ b/src/config.ts @@ -24,6 +24,7 @@ export const projectTemplatesDirectory = path.join(__dirname, "..", "templates") export const localTempDirectory = path.join(__dirname, "..", ".temp"); export const packedProjectTemplatesDirectory = path.join(projectTemplatesDirectory, "packed"); export const llamaCppDirectory = path.join(llamaDirectory, "llama.cpp"); +export const llamaCppPatchesDirectory = path.join(llamaDirectory, "patches"); export const llamaCppGrammarsDirectory = path.join(llamaDirectory, "llama.cpp", "grammars"); export const tempDownloadDirectory = path.join(os.tmpdir(), "node-llama-cpp", nanoid()); export const cliHomedirDirectory = path.join(os.homedir(), ".node-llama-cpp"); diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 248c763e..cf85694e 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -16,6 +16,7 @@ import {safeEventCallback} from "../../utils/safeEventCallback.js"; import {GgufArchitectureType} from "../../gguf/types/GgufMetadataTypes.js"; import {LlamaLogLevel} from "../../bindings/types.js"; import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; +import {MemoryMarking} from "../../bindings/utils/MemoryOrchestrator.js"; import { BatchingOptions, BatchItem, ContextShiftOptions, 
ContextTokensDeleteRange, ControlledEvaluateIndexOutput, ControlledEvaluateInputItem, EvaluationPriority, LlamaContextOptions, LlamaContextSequenceDryRepeatPenalty, LlamaContextSequenceRepeatPenalty, PrioritizedBatchItem, @@ -71,7 +72,7 @@ export class LlamaContext { /** @internal */ private readonly _model: LlamaModel; /** @internal */ private readonly _contextSize: number; /** @internal */ private readonly _batchSize: number; - /** @internal */ private readonly _flashAttention: boolean; + /** @internal */ private readonly _flashAttention: "auto" | boolean; /** @internal */ private readonly _idealThreads: number; /** @internal */ private readonly _minThreads: number; /** @internal */ private readonly _performanceTracking: boolean; @@ -86,6 +87,8 @@ export class LlamaContext { /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); /** @internal */ private readonly _modelPreventDisposalHandle: DisposalPreventionHandle; /** @internal */ private readonly _loraAdapters = new Set(); + /** @internal */ public _vramConsumptionMarking?: MemoryMarking; + /** @internal */ public _ramConsumptionMarking?: MemoryMarking; /** @internal */ private _nextGeneratedSequenceId = 0; /** @internal */ private _dispatchDecodeScheduled = false; /** @internal */ private _batchDispatchPending = false; @@ -121,7 +124,7 @@ export class LlamaContext { sequences: number, contextSize: number, batchSize: number, - flashAttention: boolean, + flashAttention: "auto" | boolean, experimentalKvCacheKeyType: GgmlType, experimentalKvCacheValueType: GgmlType }) { @@ -163,7 +166,9 @@ export class LlamaContext { : 0 ), sequences: this._totalSequences, - flashAttention: this._flashAttention, + flashAttention: this._flashAttention === "auto" + ? "auto" + : Boolean(this._flashAttention), threads: this._idealThreads, embeddings: _embeddings, ranking: _ranking, @@ -194,6 +199,8 @@ export class LlamaContext { this._disposeAggregator.add(async () => { await this._backendContextDisposeGuard.acquireDisposeLock(); await this._ctx.dispose(); + this._vramConsumptionMarking?.dispose(); + this._ramConsumptionMarking?.dispose(); this._modelPreventDisposalHandle.dispose(); }); } @@ -228,7 +235,7 @@ export class LlamaContext { return this._batchSize; } - public get flashAttention(): boolean { + public get flashAttention(): "auto" | boolean { return this._flashAttention; } @@ -888,9 +895,10 @@ export class LlamaContext { const kvUnified = false; const sequences = Math.max(1, Math.floor(options.sequences ?? getDefaultContextSequences())); - const flashAttention = _model.flashAttentionSupported - ? Boolean(options.flashAttention ?? _model.defaultContextFlashAttention) - : false; + const flashAttentionOption = options.flashAttention ?? _model.defaultContextFlashAttention; + const flashAttention = flashAttentionOption === "auto" + ? "auto" + : Boolean(flashAttentionOption); const kvCacheKeyType = options.experimentalKvCacheKeyType === "currentQuant" ? _model.fileInsights.dominantTensorType ?? _model.defaultContextKvCacheKeyType : resolveGgmlTypeOption(options.experimentalKvCacheKeyType) ?? _model.defaultContextKvCacheKeyType; @@ -940,7 +948,7 @@ export class LlamaContext { async function createContext(contextSize: number) { const batchSize = options.batchSize ?? 
getDefaultContextBatchSize({contextSize, sequences}); - const resourceRequirementsEstimation = _model.fileInsights.estimateContextResourceRequirements({ + const resourceRequirementsEstimation = await _model.fileInsights.estimateContextResourceRequirementsV2({ contextSize, sequences, isEmbeddingContext: options._embeddings, @@ -983,6 +991,9 @@ export class LlamaContext { } else if (!contextLoaded) throw new Error("Failed to create context"); + const memoryBreakdown = context._ctx.getMemoryBreakdown(); + context._vramConsumptionMarking = _model._llama._vramOrchestrator.markAllocation(memoryBreakdown.gpuVram); + context._ramConsumptionMarking = _model._llama._ramOrchestrator.markAllocation(memoryBreakdown.cpuRam); contextCreationVramReservation?.dispose?.(); contextCreationRamReservation?.dispose?.(); diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index a0a64f02..23480cf5 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -50,18 +50,14 @@ export type LlamaContextOptions = { /** * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. - * - * The support for flash attention is currently experimental and may not always work as expected. - * Use with caution. + * + * When set to `"auto"`, flash attention will automatically be used when supported by the model and hardware capabilities. * * This option will be ignored if flash attention is not supported by the model. * - * Defaults to `false` (inherited from the model option `defaultContextFlashAttention`). - * - * Upon flash attention exiting the experimental status, the default value will become `true` - * (the inherited value from the model option `defaultContextFlashAttention` will become `true`). + * Defaults to `"auto"` (inherited from the model option `defaultContextFlashAttention`). */ - flashAttention?: boolean, + flashAttention?: "auto" | boolean, /** * number of threads to use to evaluate tokens. 
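For illustration, a minimal sketch of how the new `"auto"` value documented above is meant to be passed when creating a context; the model path is a placeholder, and `getLlama`, `loadModel`, and `createContext` are the package's existing public API:

import {getLlama} from "node-llama-cpp";

const llama = await getLlama();
const model = await llama.loadModel({modelPath: "path/to/model.gguf"}); // placeholder path
const context = await model.createContext({
    flashAttention: "auto" // resolved based on model and hardware support; pass true or false to force it
});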
diff --git a/src/evaluator/LlamaModel/LlamaModel.ts b/src/evaluator/LlamaModel/LlamaModel.ts index 84dfa65e..9c458a7d 100644 --- a/src/evaluator/LlamaModel/LlamaModel.ts +++ b/src/evaluator/LlamaModel/LlamaModel.ts @@ -8,7 +8,7 @@ import {DisposalPreventionHandle, DisposeGuard} from "../../utils/DisposeGuard.j import {LlamaLocks, LlamaLogLevel, LlamaVocabularyType, LlamaVocabularyTypeValues} from "../../bindings/types.js"; import {GgufFileInfo} from "../../gguf/types/GgufFileInfoTypes.js"; import {readGgufFileInfo} from "../../gguf/readGgufFileInfo.js"; -import {GgufInsights} from "../../gguf/insights/GgufInsights.js"; +import {GgufInsights, GgufInsightsResourceRequirements} from "../../gguf/insights/GgufInsights.js"; import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; import {Writable} from "../../utils/utilTypes.js"; import {getReadablePath} from "../../cli/utils/getReadablePath.js"; @@ -20,6 +20,7 @@ import {OverridesObject} from "../../utils/OverridesObject.js"; import {maxRecentDetokenizerTokens} from "../../consts.js"; import {LlamaRankingContext, LlamaRankingContextOptions} from "../LlamaRankingContext.js"; import {GgmlType, resolveGgmlTypeOption} from "../../gguf/types/GgufTensorInfoTypes.js"; +import {MemoryMarking} from "../../bindings/utils/MemoryOrchestrator.js"; import {TokenAttribute, TokenAttributes} from "./utils/TokenAttributes.js"; import type {Llama} from "../../bindings/Llama.js"; import type {BuiltinSpecialTokenValue} from "../../utils/LlamaText.js"; @@ -72,10 +73,12 @@ export type LlamaModelOptions = { * * When using mmap, you might notice a delay the first time you actually use the model, * which is caused by the OS itself loading the model into memory. + * + * When this option is set to `"auto"`, mmap may be disabled in scenarios where doing so allows more layers to be offloaded to the GPU. * - * Defaults to `true` if the current system supports it. + * Defaults to `"auto"` if the current system supports it. */ - useMmap?: boolean, + useMmap?: "auto" | boolean, /** * Direct I/O is a method of reading and writing data to and from the storage device directly to the application memory, @@ -113,20 +116,15 @@ export type LlamaModelOptions = { * * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. * - * The support for flash attention is currently experimental and may not always work as expected. - * Use with caution. - * * This option will be ignored if flash attention is not supported by the model. * * Enabling this affects the calculations of default values for the model and contexts created with it * as flash attention reduces the amount of memory required, * which allows for more layers to be offloaded to the GPU and for context sizes to be bigger. * - * Defaults to `false`. - * - * Upon flash attention exiting the experimental status, the default value will become `true`. + * Defaults to `"auto"`. */ - defaultContextFlashAttention?: boolean, + defaultContextFlashAttention?: "auto" | boolean, /** * The default type of the key for the KV cache tensors used for contexts created with this model. 
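A similar sketch for the model-level options documented in the hunk above, reusing the `llama` instance from the previous example; the path is a placeholder:

const model = await llama.loadModel({
    modelPath: "path/to/model.gguf", // placeholder
    useMmap: "auto", // may resolve to disabled when doing so lets more layers be offloaded to the GPU
    defaultContextFlashAttention: "auto" // inherited by contexts created from this model
});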
@@ -198,9 +196,9 @@ export type LlamaModelOptions = { metadataOverrides?: OverridesObject }; -const defaultUseMmap = true; +const defaultUseMmap = "auto" as const satisfies NonNullable; const defaultUseDirectIo = false; -const defaultContextFlashAttentionEnabled = false; +const defaultContextFlashAttentionOptionDefault = "auto" as const satisfies NonNullable; const defaultContextSwaFullCache = false; export class LlamaModel { @@ -212,18 +210,21 @@ export class LlamaModel { /** @internal */ private readonly _fileInfo: GgufFileInfo; /** @internal */ private readonly _fileInsights: GgufInsights; /** @internal */ private readonly _gpuLayers: number; + /** @internal */ public readonly _useMmap: boolean; /** @internal */ private readonly _vocabOnly: boolean; /** @internal */ private readonly _filename?: string; /** @internal */ private readonly _disposedState: DisposedState = {disposed: false}; /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator(); /** @internal */ private readonly _llamaPreventDisposalHandle: DisposalPreventionHandle; - /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: boolean; - /** @internal */ private readonly _defaultContextFlashAttention: boolean; + /** @internal */ private readonly _defaultContextFlashAttentionOptionEnabled: "auto" | boolean; + /** @internal */ private readonly _defaultContextFlashAttention: "auto" | boolean; /** @internal */ private readonly _defaultContextSwaFullCache: boolean; /** @internal */ private readonly _defaultContextKvCacheKeyType: GgmlType; /** @internal */ private readonly _defaultContextKvCacheValueType: GgmlType; /** @internal */ private readonly _flashAttentionSupported: boolean; /** @internal */ private readonly _loraAdapters = new Map(); + /** @internal */ public _vramConsumptionMarking?: MemoryMarking; + /** @internal */ public _ramConsumptionMarking?: MemoryMarking; /** @internal */ private _typeDescription?: ModelTypeDescription; /** @internal */ private _trainContextSize?: number; /** @internal */ private _embeddingVectorSize?: number; @@ -235,7 +236,8 @@ export class LlamaModel { private constructor({ modelPath, gpuLayers, vocabOnly = false, useMmap, useDirectIo, useMlock, checkTensors, onLoadProgress, loadSignal, metadataOverrides }: LlamaModelOptions & { - gpuLayers: number + gpuLayers: number, + useMmap: boolean }, { _llama, _fileInfo, @@ -250,8 +252,8 @@ export class LlamaModel { _llama: Llama, _fileInfo: GgufFileInfo, _fileInsights: GgufInsights, - _defaultContextFlashAttentionOptionEnabled: boolean, - _defaultContextFlashAttention: boolean, + _defaultContextFlashAttentionOptionEnabled: "auto" | boolean, + _defaultContextFlashAttention: "auto" | boolean, _defaultContextSwaFullCache: boolean, _defaultContextKvCacheKeyType: GgmlType, _defaultContextKvCacheValueType: GgmlType, @@ -262,6 +264,7 @@ export class LlamaModel { this._modelPath = path.resolve(process.cwd(), modelPath); this._fileInsights = _fileInsights; this._gpuLayers = gpuLayers; + this._useMmap = useMmap ?? false; this._vocabOnly = vocabOnly ?? 
false; this._backendModelDisposeGuard = new DisposeGuard([this._llama._backendDisposeGuard]); this._llamaPreventDisposalHandle = this._llama._backendDisposeGuard.createPreventDisposalHandle(); @@ -313,6 +316,8 @@ export class LlamaModel { this._disposeAggregator.add(async () => { await this._backendModelDisposeGuard.acquireDisposeLock(); await this._model.dispose(); + this._vramConsumptionMarking?.dispose(); + this._ramConsumptionMarking?.dispose(); this._llamaPreventDisposalHandle.dispose(); }); @@ -375,6 +380,16 @@ export class LlamaModel { return this._gpuLayers; } + /** + * Whether the model is loaded using mmap (memory-mapped file) or not. + * + * When Direct I/O is used (by setting the `useDirectIo` option to `true`), it overrides mmap, so this value may not reflect + * whether mmap was actually used to load this model instance. + */ + public get useMmap(): boolean { + return this._useMmap; + } + /** * Total model size in memory in bytes. * @@ -658,7 +673,7 @@ export class LlamaModel { } try { - if (this._defaultContextFlashAttentionOptionEnabled && !this._flashAttentionSupported) { + if (this._defaultContextFlashAttentionOptionEnabled === true && !this._flashAttentionSupported) { if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.grok) warnings.push("Flash attention is incompatible with Grok and thus was turned off"); else if (this.fileInfo.metadata?.general?.architecture === GgufArchitectureType.gemma2) @@ -762,7 +777,11 @@ export class LlamaModel { experimentalDefaultContextKvCacheKeyType, experimentalDefaultContextKvCacheValueType } = modelOptions; - const useMmap = _llama.supportsMmap && (modelOptions.useMmap ?? defaultUseMmap); + const useMmap = !_llama.supportsMmap ? false : typeof modelOptions.useMmap === "boolean" ? modelOptions.useMmap : defaultUseMmap; const useDirectIo = modelOptions.useDirectIo ?? defaultUseDirectIo; const fileInfo = await readGgufFileInfo(modelOptions.modelPath, { @@ -771,9 +790,12 @@ export class LlamaModel { }); applyGgufMetadataOverrides(fileInfo, modelOptions.metadataOverrides); const ggufInsights = await GgufInsights.from(fileInfo, _llama); + ggufInsights._defaultUseMmap = useMmap === "auto" + ? true + : useMmap; const flashAttentionSupported = ggufInsights.flashAttentionSupported; const resolvedDefaultContextFlashAttention = flashAttentionSupported - ? (defaultContextFlashAttention ?? defaultContextFlashAttentionEnabled) + ? defaultContextFlashAttention ?? defaultContextFlashAttentionOptionDefault : false; const resolvedDefaultContextSwaFullCache = modelOptions.defaultContextSwaFullCache ?? defaultContextSwaFullCache; const resolvedDefaultContextKvCacheKeyType = experimentalDefaultContextKvCacheKeyType === "currentQuant" ? ggufInsights.dominantTensorType ?? GgmlType.F16 : resolveGgmlTypeOption(experimentalDefaultContextKvCacheKeyType) ?? GgmlType.F16; const resolvedDefaultContextKvCacheValueType = experimentalDefaultContextKvCacheValueType === "currentQuant" ? ggufInsights.dominantTensorType ?? GgmlType.F16 : resolveGgmlTypeOption(experimentalDefaultContextKvCacheValueType) ??
GgmlType.F16; - const gpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(modelOptions.gpuLayers, { - ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, - defaultContextFlashAttention: resolvedDefaultContextFlashAttention, - defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, - defaultContextKvCacheKeyType: resolvedDefaultContextKvCacheKeyType, - defaultContextKvCacheValueType: resolvedDefaultContextKvCacheValueType, - useMmap - }); - const resourceRequirementsEstimation = ggufInsights.estimateModelResourceRequirements({ - gpuLayers: gpuLayers, - useMmap - }); + + let gpuLayers: number; + let resolvedUseMmap: boolean; + let resourceRequirementsEstimation: GgufInsightsResourceRequirements; + const simulatorSession = ggufInsights._createSimulatorSession(); + try { + const layersResolution = await ggufInsights.configurationResolver.resolveModelGpuLayersV2(modelOptions.gpuLayers, { + ignoreMemorySafetyChecks: modelOptions.ignoreMemorySafetyChecks, + defaultContextFlashAttention: resolvedDefaultContextFlashAttention, + defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, + defaultContextKvCacheKeyType: resolvedDefaultContextKvCacheKeyType, + defaultContextKvCacheValueType: resolvedDefaultContextKvCacheValueType, + useMmap, + + _simulatorSession: simulatorSession + }); + gpuLayers = layersResolution.gpuLayers; + resolvedUseMmap = layersResolution.useMmap; + resourceRequirementsEstimation = await ggufInsights.estimateModelResourceRequirementsV2({ + gpuLayers, + useMmap: resolvedUseMmap, + + _simulatorSession: simulatorSession + }); + } finally { + simulatorSession.dispose(); + } - const model = new LlamaModel({...modelOptions, gpuLayers, useMmap, useDirectIo}, { + const model = new LlamaModel({...modelOptions, gpuLayers, useMmap: resolvedUseMmap, useDirectIo}, { _fileInfo: fileInfo, _fileInsights: ggufInsights, _llama, - _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? false, + _defaultContextFlashAttentionOptionEnabled: defaultContextFlashAttention ?? 
defaultContextFlashAttentionOptionDefault, _flashAttentionSupported: flashAttentionSupported, _defaultContextFlashAttention: resolvedDefaultContextFlashAttention, _defaultContextSwaFullCache: resolvedDefaultContextSwaFullCache, @@ -853,6 +890,12 @@ export class LlamaModel { logWarnings(model.getWarnings()); + const memoryBreakdown = model._model.getMemoryBreakdown(); + model._vramConsumptionMarking = _llama._vramOrchestrator.markAllocation(memoryBreakdown.gpuVram); + model._ramConsumptionMarking = _llama._ramOrchestrator.markAllocation(memoryBreakdown.cpuRam); + modelCreationVramReservation?.dispose?.(); + modelCreationRamReservation?.dispose?.(); + return model; } finally { loadSignal?.removeEventListener("abort", onAbort); diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts index ed364c35..e36cf67f 100644 --- a/src/gguf/insights/GgufInsights.ts +++ b/src/gguf/insights/GgufInsights.ts @@ -1,4 +1,7 @@ +import {acquireLock, withLock} from "lifecycle-utils"; +import bytes from "bytes"; import {Llama} from "../../bindings/Llama.js"; +import {LlamaLogLevel} from "../../bindings/types.js"; import {getLlamaWithoutBackend} from "../../bindings/utils/getLlamaWithoutBackend.js"; import {getDefaultContextBatchSize, getDefaultContextSequences} from "../../evaluator/LlamaContext/LlamaContext.js"; import {GgufFileInfo} from "../types/GgufFileInfoTypes.js"; @@ -6,8 +9,13 @@ import {GgmlType, GgufTensorInfo} from "../types/GgufTensorInfoTypes.js"; import {GgufArchitectureType} from "../types/GgufMetadataTypes.js"; import {getReadablePath} from "../../cli/utils/getReadablePath.js"; import {padSafeContextSize} from "../../evaluator/LlamaContext/utils/padSafeContextSize.js"; +import {removeNullFields, removeUndefinedFields} from "../../utils/removeNullFields.js"; +import {LruCache} from "../../utils/LruCache.js"; import {GgufInsightsConfigurationResolver} from "./GgufInsightsConfigurationResolver.js"; import {GgufInsightsTokens} from "./GgufInsightsTokens.js"; +import type {Promisable} from "../../utils/transformPromisable.js"; +import type {LlamaContextOptions} from "../../evaluator/LlamaContext/types.js"; +import type {AddonContextParams, AddonGgufMetadata, AddonModel, AddonModelParams} from "../../bindings/AddonTypes.js"; export type GgufInsightsResourceRequirements = { cpuRam: number, @@ -20,9 +28,15 @@ export class GgufInsights { /** @internal */ private _totalFileLayers: number | null = null; /** @internal */ private _supportsRanking?: boolean; /** @internal */ private _dominantTensorType?: GgmlType; + /** @internal */ private _addonMetadata?: AddonGgufMetadata; + /** @internal */ public _defaultUseMmap?: boolean; /** @internal */ public readonly _ggufFileInfo: GgufFileInfo; /** @internal */ private readonly _configurationResolver: GgufInsightsConfigurationResolver; /** @internal */ private readonly _tokens: GgufInsightsTokens; + /** @internal */ private readonly _exactModelResourceRequirementsCache = new LruCache(40); + /** @internal */ private readonly _exactContextResourceRequirementsCache = new LruCache(40); + /** @internal */ private readonly _simulationSession: GgufInsightsSimulatorSession; + /** @internal */ private readonly _locks = {}; private constructor(ggufFileInfo: GgufFileInfo, llama: Llama) { this._llama = llama; @@ -31,6 +45,7 @@ export class GgufInsights { this._modelSize = calculateTensorsSize(ggufFileInfo.fullTensorInfo ?? 
[], llama, true, true); this._configurationResolver = GgufInsightsConfigurationResolver._create(this); this._tokens = GgufInsightsTokens._create(this); + this._simulationSession = this._createSimulatorSession(); } /** @@ -40,16 +55,23 @@ export class GgufInsights { */ public getWarnings(modelFilePath?: string) { const warnings: string[] = []; - const modelFilePathText = (modelFilePath != null && modelFilePath !== "") - ? ` ("${getReadablePath(modelFilePath)}")` - : ""; + const resolvedModelFilePath = modelFilePath || ( + this._ggufFileInfo.source?.type === "path" + ? this._ggufFileInfo.source.path + : undefined + ); + const modelFileSourceText = (resolvedModelFilePath != null && resolvedModelFilePath !== "") + ? ` ("${getReadablePath(resolvedModelFilePath)}")` + : this._ggufFileInfo.source?.type === "uri" + ? ` ("${getReadablePath(this._ggufFileInfo.source.uri)}")` + : ""; if (this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model === "gpt2" && this._ggufFileInfo?.metadata?.tokenizer?.ggml?.model == null ) { // equivalent to the warning in `llama.cpp` under `llm_load_vocab`: "missing pre-tokenizer type, using: 'default'" warnings.push( - `This model file${modelFilePathText} is missing a pre-tokenizer configuration. ` + + `This model file${modelFileSourceText} is missing a pre-tokenizer configuration. ` + "This may cause incorrect tokenization and thus degrade the generation quality. " + "Consider using a newer model or regenerating this GGUF model file" ); @@ -214,8 +236,37 @@ export class GgufInsights { return slidingWindow; } + /** @internal */ + public _getAddonMetadata(): Promisable { + if (this._addonMetadata != null || this._ggufFileInfo.sourceData.length === 0) + return this._addonMetadata; + + return withLock([this._locks, "addonMetadata"], async () => { + if (this._addonMetadata != null) + return this._addonMetadata; + + const initInput: Array = []; + for (const data of this._ggufFileInfo.sourceData) { + if (data.type === "buffer") + initInput.push(data.buffer); + else if (data.type === "path") + initInput.push(data.path); + else + void (data satisfies never); + } + + const addonMetadata = new this._llama._bindings.AddonGgufMetadata(); + await addonMetadata.init(initInput); + this._addonMetadata = addonMetadata; + return addonMetadata; + }); + } + + /** + * @deprecated Use `estimateModelResourceRequirementsV2` instead + */ public estimateModelResourceRequirements({ - gpuLayers, useMmap = this._llama.supportsMmap, gpuSupportsMmap = this._llama.gpuSupportsMmap + gpuLayers, useMmap = this._getUseMmap(), gpuSupportsMmap = this._llama.gpuSupportsMmap }: { gpuLayers: number, useMmap?: boolean, gpuSupportsMmap?: boolean }): GgufInsightsResourceRequirements { @@ -227,10 +278,100 @@ export class GgufInsights { }; } + public async estimateModelResourceRequirementsV2(options: { + gpuLayers: number, useMmap?: boolean, gpuSupportsMmap?: boolean, + + /** @internal */ + _simulatorSession?: GgufInsightsSimulatorSession + }): Promise { + const { + gpuLayers, useMmap = this._getUseMmap(), gpuSupportsMmap = this._llama.gpuSupportsMmap, + + _simulatorSession + } = options; + + try { + const simulationResult = await this._simulateModelResourceUsage({ + gpuLayers, + useMmap, + simulatorSession: _simulatorSession + }); + + if (simulationResult != null) { + if (!useMmap || gpuLayers >= this.totalLayers) + return simulationResult; + + // adjust for the missing mmap simulation implementation + const standardEstimation = this.estimateModelResourceRequirements({ + gpuLayers, + useMmap, + gpuSupportsMmap + }); 
+ + return { + gpuVram: Math.max(simulationResult.gpuVram, standardEstimation.gpuVram), + cpuRam: Math.min(simulationResult.cpuRam, standardEstimation.cpuRam) + }; + } + } catch (error: any) { + this._llama._log(LlamaLogLevel.warn, error?.message ?? String(error)); + } + + return this.estimateModelResourceRequirements({ + gpuLayers, + useMmap, + gpuSupportsMmap + }); + } + + /** @internal */ + public async _simulateModelResourceUsage({ + gpuLayers, + useMmap = this._getUseMmap(), + simulatorSession = this._simulationSession + }: { + gpuLayers: number, + useMmap?: boolean, + simulatorSession?: GgufInsightsSimulatorSession + }): Promise { + const cacheKey = [gpuLayers, Number(useMmap)].join(":"); + const cachedValue = this._exactModelResourceRequirementsCache.get(cacheKey); + if (cachedValue != null) + return {...cachedValue}; + + const lock = await acquireLock([this._locks, "_simulateModelResourceUsage", cacheKey]); + try { + const cachedValue = this._exactModelResourceRequirementsCache.get(cacheKey); + if (cachedValue != null) + return {...cachedValue}; + + const simulatorSource = await this._resolveSimulatorSource(); + if (simulatorSource == null) + return null; + + let resourceRequirements: GgufInsightsResourceRequirements; + try { + resourceRequirements = await simulatorSession.estimateModelResources({ + modelSource: simulatorSource, + gpuLayers, + useMmap + }); + } catch (error: any) { + throw new Error("Failed simulating model resource usage. Falling back to estimation heuristic. Error: " + (error?.message ?? String(error))); + } + + this._exactModelResourceRequirementsCache.set(cacheKey, resourceRequirements); + return {...resourceRequirements}; + } finally { + lock.dispose(); + } + } + /** * Estimates the memory required to create a context of the given parameters based on the implementation details of `llama.cpp`. * The calculation doesn't include a precise estimation of the graph overhead memory, so it uses a rough estimate for that. * The estimation for the graph overhead memory will be improved in the future to be more precise, but it's good enough for now. + * @deprecated Use `estimateContextResourceRequirementsV2` instead */ public estimateContextResourceRequirements({ contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, includeGraphOverhead = true, flashAttention = false, @@ -248,29 +389,48 @@ export class GgufInsights { const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; const slidingWindow = this.swaSize ?? 0; const kvUnified = false; - const usingSWA = !swaFullCache && slidingWindow > 0 && slidingWindow < contextSize && + const totalFileLayers = this._getTotalFileLayers(); + const hasSwaAttention = slidingWindow > 0; + const usingReducedSWA = hasSwaAttention && !swaFullCache && slidingWindow < contextSize && (this.trainContextSize == null || slidingWindow < this.trainContextSize); - const swaPattern = getSwaPatternForArchitecture( - this._ggufFileInfo.metadata?.general?.architecture, - this._ggufFileInfo.architectureMetadata?.attention?.sliding_window_pattern - ); - const nonSwaPercent = swaPattern <= 1 - ? 1 - : (1 / (swaPattern + (flashAttention ? 
-0.5 : -1))); + let graphRelevantTensorCount = 0; + let graphRelevantTensorElements = 0; + let totalTensorElements = 0; + + for (const singleTensorInfo of tensorInfo) { + let tensorElements = 0; + for (const dim of singleTensorInfo.dimensions) + tensorElements += Number(dim); + + totalTensorElements += tensorElements; - // source: `llama_kv_cache_unified::get_padding` in `llama-kv-cache.cpp` - const kvCachePadding = 1; + if (!isGraphRelevantTensor(singleTensorInfo.name)) + continue; + + graphRelevantTensorCount++; + graphRelevantTensorElements += tensorElements; + } + + const effectiveGraphTensorCount = graphRelevantTensorCount > 0 + ? graphRelevantTensorCount + : tensorInfo.length; + const effectiveGraphTensorElements = graphRelevantTensorCount > 0 + ? graphRelevantTensorElements + : totalTensorElements; + + const paddedContextSize = padSafeContextSize(contextSize, "up"); const actualContextSize = kvUnified ? padSafeContextSize(sequences * contextSize, "up") - : sequences * padSafeContextSize(contextSize, "up"); - const kvSize = usingSWA - ? ( - (1 - nonSwaPercent) * Math.min(actualContextSize, ggmlPad(sequences * slidingWindow + batchSize, kvCachePadding)) + - nonSwaPercent * actualContextSize - ) - : actualContextSize; - - const totalFileLayers = this._getTotalFileLayers(); + : sequences * paddedContextSize; + const fullAttentionKvSize = actualContextSize; + const swaBatchSize = hasSwaAttention && !swaFullCache + ? batchSize + 1 + : batchSize; + const swaKvSize = !hasSwaAttention + ? actualContextSize + : !usingReducedSWA + ? actualContextSize + : Math.min(actualContextSize, ggmlPad((sequences * slidingWindow) + swaBatchSize, 256)); const totalLayersIncludingOutput = totalFileLayers + 1; const finalModelGpuLayers = Math.max( 0, @@ -284,13 +444,17 @@ export class GgufInsights { gpuKVCacheSize, cpuKVCacheSize, gpuRecurrentStateSize, - cpuRecurrentStateSize + cpuRecurrentStateSize, + maxAttentionLayerKvSize, + maxAttentionLayerHeadCountKv } = this._estimateContextCacheMemorySplitInBytes({ - kvSize, + fullAttentionKvSize, + swaKvSize, sequences, totalFileLayers, finalModelGpuLayers, usingGpu, + flashAttention, kvCacheKeyType, kvCacheValueType }); @@ -324,37 +488,82 @@ export class GgufInsights { const estimateGraphOverheadMemory = (): number => { const s1MB = Math.pow(1024, 2); - const tensorInfo = this._ggufFileInfo.fullTensorInfo ?? []; const expertCount = llmData?.expert_count ?? 0; const headCount = llmData?.attention?.head_count ?? 0; const embeddingLength = llmData?.embedding_length ?? 0; + const activeGraphTokens = roundUpToMultiple( + Math.max(1, Math.min(paddedContextSize, batchSize)), + Math.max(1, sequences) + ); + const graphContextSize = resolveGraphContextSizeForOverheadEstimation({ + fullAttentionKvSize, + trainContextSize: this.trainContextSize, + flashAttention, + headCount, + batchSize, + paddedContextSize, + sequences + }); let defaultCalculationAdjustment = 0; + const totalElements = effectiveGraphTensorCount === 0 + ? this.totalLayers * ( + ( + (llmData.embedding_length ?? 0) + + (llmData.feed_forward_length ?? 
0) + ) / 2 + ) + : effectiveGraphTensorElements; + const tensorBasedGraphOverhead = (tensorElementMultiplier: number) => ( + (totalElements * tensorElementMultiplier * (graphContextSize / 4096)) + defaultCalculationAdjustment + ); + const batchLocalTensorBasedGraphOverhead = (tensorElementMultiplier: number) => ( + (totalElements * tensorElementMultiplier * (activeGraphTokens / 4096)) + defaultCalculationAdjustment + ); if (batchSize == null) return 0; + const genericNonFlashAttentionWorkspaceEstimate = !flashAttention + ? estimateNonFlashAttentionWorkspace({ + trainContextSize: this.trainContextSize, + fullAttentionKvSize, + swaKvSize, + hasSwaAttention, + maxAttentionLayerKvSize, + maxAttentionLayerHeadCountKv, + activeGraphTokens, + headCount + }) + : 0; + if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.llama) { if (expertCount > 0) { const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2; - return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount)); + return Math.max( + int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (graphContextSize * headCount)), + genericNonFlashAttentionWorkspaceEstimate + ); } - return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount)); + return Math.max( + int32TBytes * batchSize * (embeddingLength + (graphContextSize * headCount)), + genericNonFlashAttentionWorkspaceEstimate + ); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen2) { if (modelGpuLayers === this.totalLayers) { defaultCalculationAdjustment -= (s1MB * 340) * ( this.trainContextSize == null ? 1 - : kvSize / this.trainContextSize + : graphContextSize / this.trainContextSize ); } else { defaultCalculationAdjustment -= (s1MB * 250) + ( (s1MB * 50) * ( this.trainContextSize == null ? 1 - : kvSize / this.trainContextSize + : graphContextSize / this.trainContextSize ) ); } @@ -367,7 +576,7 @@ export class GgufInsights { (s1MB * 270) * ( this.trainContextSize == null ? 1 - : kvSize / this.trainContextSize + : graphContextSize / this.trainContextSize ) ); } else { @@ -375,14 +584,25 @@ export class GgufInsights { (s1MB * 150) * ( this.trainContextSize == null ? 1 - : Math.max(0, (1 - (kvSize / this.trainContextSize))) + : Math.max(0, (1 - (graphContextSize / this.trainContextSize))) ) ); } + } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma3) { + const trainContextSize = Math.max(1, this.trainContextSize ?? graphContextSize); + const contextRatio = Math.min(1, Math.max(0, graphContextSize / trainContextSize)); + + return Math.max( + int32TBytes * batchSize * graphContextSize * headCount * (0.08 + Math.pow(contextRatio, 2)), + genericNonFlashAttentionWorkspaceEstimate + ); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.stablelm) { const headCount = this._ggufFileInfo.architectureMetadata.attention?.head_count ?? 
0; - return (int32TBytes * batchSize * kvSize * headCount) - (50 * s1MB); + return Math.max( + (int32TBytes * batchSize * graphContextSize * headCount) - (50 * s1MB), + genericNonFlashAttentionWorkspaceEstimate + ); // if (modelGpuLayers === this.totalLayers) { // defaultCalculationAdjustment += -(s1MB * 20) + ( @@ -402,34 +622,54 @@ export class GgufInsights { // ); // } } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.qwen3) { - return int32TBytes * batchSize * (embeddingLength + (kvSize * headCount)); + return Math.max( + int32TBytes * batchSize * (embeddingLength + (graphContextSize * headCount)), + genericNonFlashAttentionWorkspaceEstimate + ); + } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.gemma4) { + const trainContextSize = Math.max(1, this.trainContextSize ?? graphContextSize); + const contextRatio = Math.min(1, Math.max(0, graphContextSize / trainContextSize)); + const gemma4DenseShortContextScale = 0.4; + const gemma4DenseContextScaleExponent = 3; + const gemma4DenseEstimate = int32TBytes * batchSize * graphContextSize * headCount * + (gemma4DenseShortContextScale + Math.pow(contextRatio, gemma4DenseContextScaleExponent)); + + if (expertCount > 0) { + const tensorBasedEstimate = tensorBasedGraphOverhead(77.655); + const moeBlendWeight = Math.sqrt(contextRatio); + + return Math.max( + gemma4DenseEstimate, + (gemma4DenseEstimate + ((tensorBasedEstimate - gemma4DenseEstimate) * moeBlendWeight)) * 1.01, + genericNonFlashAttentionWorkspaceEstimate + ); + } + + return Math.max(gemma4DenseEstimate, genericNonFlashAttentionWorkspaceEstimate); } else if (expertCount > 0) { const expertsUsedCount = this._ggufFileInfo.architectureMetadata.expert_used_count ?? 2; - return int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (kvSize * headCount)); + return Math.max( + int32TBytes * batchSize * (((expertsUsedCount + 1) * embeddingLength) + (graphContextSize * headCount)), + genericNonFlashAttentionWorkspaceEstimate + ); } - const totalElements = tensorInfo.length === 0 - ? this.totalLayers * ( - ( - (llmData.embedding_length ?? 0) + - (llmData.feed_forward_length ?? 0) - ) / 2 - ) - : tensorInfo.reduce((res, tensor) => { - return res + tensor.dimensions.reduce((res: number, dim) => res + Number(dim), 0); - }, 0); - if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.phi3) { // magic numbers for estimation. will be improved in the future - return (totalElements * 123 * (kvSize / 4096)) + defaultCalculationAdjustment; + return Math.max(tensorBasedGraphOverhead(123), genericNonFlashAttentionWorkspaceEstimate); } else if (this._ggufFileInfo.metadata.general?.architecture === GgufArchitectureType.cohere2) { // magic numbers for estimation. will be improved in the future - return (totalElements * 148 * (kvSize / 4096)) + defaultCalculationAdjustment; + return Math.max(tensorBasedGraphOverhead(148), genericNonFlashAttentionWorkspaceEstimate); } // magic numbers for estimation. will be improved in the future - return (totalElements * 77.655 * (kvSize / 4096)) + defaultCalculationAdjustment; + return Math.max( + !flashAttention + ? 
batchLocalTensorBasedGraphOverhead(77.655) + : tensorBasedGraphOverhead(77.655), + genericNonFlashAttentionWorkspaceEstimate + ); }; // source: `llama_context::graph_max_nodes` in `llama-context.cpp` @@ -449,10 +689,10 @@ export class GgufInsights { this._ggufFileInfo.metadata?.general?.architecture, Math.min(actualContextSize, batchSize) ); - const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * tensorInfo.length); + const maxNodes = Math.max(maxNodesMultiplier.min, maxNodesMultiplier.multiplier * effectiveGraphTensorCount); const cpuNodes = totalFileLayers === 0 ? 0 - : maxNodesMultiplier.multiplier * (tensorInfo.length * (finalCpuLayers / totalFileLayers)); + : maxNodesMultiplier.multiplier * (effectiveGraphTensorCount * (finalCpuLayers / totalFileLayers)); const gpuNodes = maxNodes - cpuNodes; const gpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * gpuNodes) + @@ -460,7 +700,7 @@ export class GgufInsights { const cpuComputeBufferSize = (this._llama._consts.ggmlTensorOverhead * cpuNodes) + this._llama._bindings.getGgmlGraphOverheadCustom(cpuNodes, false); - const graphOverheadMemory = (flashAttention || !includeGraphOverhead) + const graphOverheadMemory = !includeGraphOverhead ? 0 : estimateGraphOverheadMemory(); const graphOverheadGpuSize = (usingGpu && totalFileLayers > 0) @@ -481,6 +721,136 @@ export class GgufInsights { }; } + public async estimateContextResourceRequirementsV2(options: { + contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], swaFullCache?: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, + useMmap?: boolean, + + /** @internal */ + _simulatorSession?: GgufInsightsSimulatorSession + }): Promise { + const { + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, flashAttention = "auto", + swaFullCache = false, + kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16, + useMmap, + + _simulatorSession + } = options; + + try { + const simulationResult = await this._simulateContextResourceUsage({ + contextSize, + modelGpuLayers, + batchSize, + sequences, + isEmbeddingContext, + flashAttention, + swaFullCache, + useMmap, + simulatorSession: _simulatorSession, + kvCacheKeyType, + kvCacheValueType + }); + if (simulationResult != null) + return simulationResult; + } catch (error: any) { + this._llama._log(LlamaLogLevel.warn, error?.message ?? String(error)); + } + + return this.estimateContextResourceRequirements({ + contextSize, + modelGpuLayers, + batchSize, + sequences, + isEmbeddingContext, + flashAttention: flashAttention === true, + swaFullCache, + kvCacheKeyType, + kvCacheValueType + }); + } + + /** @internal */ + public _getUseMmap(useMmapOption?: boolean) { + return useMmapOption ?? this._defaultUseMmap ?? 
this._llama.supportsMmap; + } + + /** @internal */ + public async _simulateContextResourceUsage({ + contextSize, modelGpuLayers, batchSize, sequences, isEmbeddingContext = false, flashAttention = "auto", + swaFullCache = false, useMmap = this._getUseMmap(), + kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16, + simulatorSession = this._simulationSession + }: { + contextSize: number, modelGpuLayers: number, batchSize?: number, sequences?: number, isEmbeddingContext?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], swaFullCache?: boolean, useMmap?: boolean, + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, + simulatorSession?: GgufInsightsSimulatorSession + }): Promise { + if (sequences == null) sequences = getDefaultContextSequences(); + if (batchSize == null) batchSize = getDefaultContextBatchSize({contextSize, sequences}); + + const cacheKey = [ + contextSize, + modelGpuLayers, + batchSize, + sequences, + Number(isEmbeddingContext), + flashAttention === "auto" + ? "auto" + : String(flashAttention), + Number(swaFullCache), + Number(useMmap), + kvCacheKeyType, + kvCacheValueType + ].join(":"); + const cachedValue = this._exactContextResourceRequirementsCache.get(cacheKey); + if (cachedValue != null) + return {...cachedValue}; + + const lock = await acquireLock([this._locks, "_simulateContextResourceUsage", cacheKey]); + try { + const cachedValue = this._exactContextResourceRequirementsCache.get(cacheKey); + if (cachedValue != null) + return {...cachedValue}; + + const simulatorSource = await this._resolveSimulatorSource(); + if (simulatorSource == null) + return null; + + let contextResources: GgufInsightsResourceRequirements; + try { + contextResources = await simulatorSession.estimateContextResources({ + modelSource: simulatorSource, + gpuLayers: modelGpuLayers, + contextSize, + batchSize, + sequences, + isEmbeddingContext, + flashAttention, + swaFullCache, + useMmap, + kvCacheKeyType, + kvCacheValueType + }); + } catch (error: any) { + throw new Error("Failed simulating context resource usage. Falling back to estimation heuristic. Error: " + (error?.message ?? 
String(error))); + } + + const resourceRequirements = { + cpuRam: contextResources.cpuRam, + gpuVram: contextResources.gpuVram + } satisfies GgufInsightsResourceRequirements; + + this._exactContextResourceRequirementsCache.set(cacheKey, resourceRequirements); + return {...resourceRequirements}; + } finally { + lock.dispose(); + } + } + /** * Get the split tensor resources for CPU and GPU based on the number of GPU layers * @internal @@ -500,7 +870,7 @@ export class GgufInsights { } const fileLayers = this._getFileLayers(); - const startGpuLayer = Math.max(0, fileLayers - gpuLayers); + const startGpuLayer = Math.max(0, fileLayers - gpuLayers + 1); const gpuTensors: GgufTensorInfo[] = []; const cpuTensors: GgufTensorInfo[] = []; @@ -523,7 +893,7 @@ export class GgufInsights { // in the implementation of `llm_load_tensors`, layers with `LLM_TENSOR_LAYER_OUTPUT` are always // loaded with `model.dev_output`, which is set to the GPU only if all the layers are on the GPU } else if (isOutputLayer(singleTensorInfo.name)) { - if (gpuLayers === this.totalLayers) { + if (gpuLayers > 0) { gpuTensors.push(singleTensorInfo); continue; } else { @@ -580,19 +950,23 @@ export class GgufInsights { } private _estimateContextCacheMemorySplitInBytes({ - kvSize, + fullAttentionKvSize, + swaKvSize, sequences, totalFileLayers, finalModelGpuLayers, usingGpu, + flashAttention, kvCacheKeyType = GgmlType.F16, kvCacheValueType = GgmlType.F16 }: { - kvSize: number, + fullAttentionKvSize: number, + swaKvSize: number, sequences: number, totalFileLayers: number, finalModelGpuLayers: number, usingGpu: boolean, + flashAttention: boolean, kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType }) { @@ -603,8 +977,24 @@ export class GgufInsights { const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead)); const nHeadKv: number | number[] = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead; const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead); + const nEmbdHeadKSwa = this._ggufFileInfo.architectureMetadata.attention?.key_length_swa; + const nEmbdHeadVSwa = this._ggufFileInfo.architectureMetadata.attention?.value_length_swa; + const sharedKvLayers = this._ggufFileInfo.architectureMetadata.attention?.shared_kv_layers; + const slidingWindowPattern = this._ggufFileInfo.architectureMetadata.attention?.sliding_window_pattern; const keyTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheKeyType) ?? this._llama._consts.ggmlTypeF16Size; const valueTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheValueType) ?? 
this._llama._consts.ggmlTypeF16Size; + const nHeadKvValues = nHeadKv as unknown; + let maxLayerValueEmbedding = 0; + + if (!flashAttention && nHeadKvValues instanceof Array) { + for (let i = 0; i < totalFileLayers; i++) { + const layerHeadCountKv = resolveLayerHeadCountKv(nHeadKvValues, i, nHead); + const isSwaLayer = isSwaLayerAtIndex(architecture, slidingWindowPattern, i); + const layerValueEmbedding = resolveLayerHeadDimension(nEmbdHeadV, nEmbdHeadVSwa, isSwaLayer) * layerHeadCountKv; + + maxLayerValueEmbedding = Math.max(maxLayerValueEmbedding, layerValueEmbedding); + } + } // source: `llama_model::load_tensors` in `llama-model.cpp` // repeating layers are assigned to GPU from `i_gpu_start = n_layer + 1 - n_gpu_layers` @@ -618,6 +1008,8 @@ export class GgufInsights { let cpuKvElementsV = 0; let gpuRecurrentLayers = 0; let cpuRecurrentLayers = 0; + let maxAttentionLayerKvSize = 0; + let maxAttentionLayerHeadCountKv = 0; for (let i = 0; i < totalFileLayers; i++) { const isGpuLayer = i >= gpuRepeatingLayerStart; @@ -629,9 +1021,22 @@ export class GgufInsights { else cpuRecurrentLayers++; } else { + if (!doesLayerOwnKvCache(totalFileLayers, i, sharedKvLayers)) + continue; + const nHeadKvLayer = resolveLayerHeadCountKv(nHeadKv, i, nHead); - const layerElementsK = nEmbdHeadK * nHeadKvLayer * kvSize; - const layerElementsV = nEmbdHeadV * nHeadKvLayer * kvSize; + const isSwaLayer = isSwaLayerAtIndex(architecture, slidingWindowPattern, i); + const layerKvSize = isSwaLayer + ? swaKvSize + : fullAttentionKvSize; + maxAttentionLayerKvSize = Math.max(maxAttentionLayerKvSize, layerKvSize); + maxAttentionLayerHeadCountKv = Math.max(maxAttentionLayerHeadCountKv, nHeadKvLayer); + const layerElementsK = resolveLayerHeadDimension(nEmbdHeadK, nEmbdHeadKSwa, isSwaLayer) * nHeadKvLayer * layerKvSize; + const layerElementsV = layerKvSize * ( + maxLayerValueEmbedding > 0 + ? maxLayerValueEmbedding + : (resolveLayerHeadDimension(nEmbdHeadV, nEmbdHeadVSwa, isSwaLayer) * nHeadKvLayer) + ); if (isGpuLayer) { gpuKvElementsK += layerElementsK; @@ -658,7 +1063,9 @@ export class GgufInsights { gpuKVCacheSize, cpuKVCacheSize, gpuRecurrentStateSize, - cpuRecurrentStateSize + cpuRecurrentStateSize, + maxAttentionLayerKvSize, + maxAttentionLayerHeadCountKv }; } @@ -726,6 +1133,23 @@ export class GgufInsights { return this._totalFileLayers; } + /** @internal */ + private async _resolveSimulatorSource(): Promise { + const addonMetadata = await this._getAddonMetadata(); + if (addonMetadata != null) + return addonMetadata; + + if (this._ggufFileInfo.source?.type === "path") + return this._ggufFileInfo.source.path; + + return null; + } + + /** @internal */ + public _createSimulatorSession(lruCacheSize: number = 10) { + return new GgufInsightsSimulatorSession(this._llama, lruCacheSize); + } + /** * @param ggufFileInfo * @param llama - If you already have a `Llama` instance, pass it to reuse it for the `GgufInsights` instance. 
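For reference, a hedged usage sketch of the V2 estimators added above, assuming `GgufInsights`, `readGgufFileInfo`, and `getLlama` are all exported from the package entry point; the model path and context size are placeholders:

import {getLlama, readGgufFileInfo, GgufInsights} from "node-llama-cpp";

const llama = await getLlama();
const fileInfo = await readGgufFileInfo("path/to/model.gguf"); // placeholder path
const insights = await GgufInsights.from(fileInfo, llama);

// prefers the exact simulator-based measurement and falls back to the heuristic estimate on failure
const modelResources = await insights.estimateModelResourceRequirementsV2({gpuLayers: insights.totalLayers});
const contextResources = await insights.estimateContextResourceRequirementsV2({
    contextSize: 8192, // placeholder
    modelGpuLayers: insights.totalLayers,
    flashAttention: "auto"
});
console.log(modelResources.gpuVram, contextResources.cpuRam);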
@@ -742,6 +1166,181 @@ export class GgufInsights { } } +export class GgufInsightsSimulatorSession { + private readonly _llama: Llama; + private readonly _modelPromises: LruCache>; + private _disposed = false; + + public constructor(llama: Llama, lruCacheSize: number = 10) { + this._llama = llama; + this._modelPromises = new LruCache(lruCacheSize); + } + + public async estimateModelResources({ + modelSource, + gpuLayers, + useMmap = false + }: { + modelSource: string | AddonGgufMetadata, + gpuLayers: number, + useMmap?: boolean + }): Promise { + const model = await this._getModel({source: modelSource, gpuLayers, useMmap}); + const memoryBreakdown = model.getMemoryBreakdown(); + if (this._llama._shouldLog(LlamaLogLevel.debug)) + this._llama._log(LlamaLogLevel.debug, "Simulating model resource usage. " + [ + `gpuLayers=${gpuLayers}`, + `useMmap=${useMmap}`, + `memoryBreakdownCpuRam=${bytes(memoryBreakdown.cpuRam)}`, + `memoryBreakdownGpuVram=${bytes(memoryBreakdown.gpuVram)}` + ].join(" ")); + return memoryBreakdown; + } + + public async estimateContextResources({ + modelSource, + gpuLayers, + contextSize, + batchSize, + sequences, + isEmbeddingContext = false, + flashAttention = "auto", + swaFullCache = false, + useMmap = false, + kvCacheKeyType = GgmlType.F16, + kvCacheValueType = GgmlType.F16 + }: { + modelSource: string | AddonGgufMetadata, + gpuLayers: number, + contextSize: number, + batchSize: number, + sequences: number, + isEmbeddingContext?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], + swaFullCache?: boolean, + useMmap?: boolean, + kvCacheKeyType?: GgmlType, + kvCacheValueType?: GgmlType + }): Promise { + const model = await this._getModel({source: modelSource, gpuLayers, useMmap}); + const context = new this._llama._bindings.AddonContext(model, removeUndefinedFields({ + contextSize, + batchSize, + sequences, + embeddings: isEmbeddingContext, + flashAttention: flashAttention === "auto" + ? "auto" + : flashAttention, + kvCacheKeyType, + kvCacheValueType, + swaFullCache + } satisfies AddonContextParams)); + + try { + const contextLoaded = await context.init(); + if (!contextLoaded) + throw new Error("Failed to create context"); + + const memoryBreakdown = context.getMemoryBreakdown(); + if (this._llama._shouldLog(LlamaLogLevel.debug)) + this._llama._log(LlamaLogLevel.debug, "Simulating context resource usage. 
" + [ + `gpuLayers=${gpuLayers}`, + `contextSize=${contextSize.toLocaleString("en-US", {notation: "compact"})}`, + `batchSize=${batchSize}`, + `sequences=${sequences}`, + `isEmbeddingContext=${isEmbeddingContext}`, + `flashAttention=${flashAttention}`, + `swaFullCache=${swaFullCache}`, + `kvCacheKeyType=${kvCacheKeyType}`, + `kvCacheValueType=${kvCacheValueType}`, + `useMmap=${useMmap}`, + `memoryBreakdownCpuRam=${bytes(memoryBreakdown.cpuRam)}`, + `memoryBreakdownGpuVram=${bytes(memoryBreakdown.gpuVram)}` + ].join(" ")); + return memoryBreakdown; + } finally { + await context.dispose(); + } + } + + public [Symbol.asyncDispose]() { + return this.dispose(); + } + + public async dispose() { + if (this._disposed) + return; + + this._disposed = true; + + const modelPromises = [...this._modelPromises.values()].map((modelPromise) => modelPromise.catch(() => void 0)); + this._modelPromises.clear(); + const loadedModels = (await Promise.all(modelPromises)).filter((model) => model != null); + + await Promise.all(loadedModels.map((model) => model.dispose().catch(() => void 0))); + } + + private async _getModel({ + source, + gpuLayers, + useMmap = this._llama.supportsMmap + }: { + source: string | AddonGgufMetadata, + gpuLayers: number, + useMmap?: boolean + }) { + if (this._disposed) + throw new Error("simulator session is disposed"); + + const cacheKey = String(gpuLayers) + ":" + String(useMmap); + const existingModelPromise = this._modelPromises.get(cacheKey); + if (existingModelPromise != null) + return await existingModelPromise; + + if (this._llama._shouldLog(LlamaLogLevel.debug)) + this._llama._log(LlamaLogLevel.debug, `Loading model for simulator session. gpuLayers=${gpuLayers} useMmap=${useMmap}`); + const modelPromise = this._loadModel({ + source, + gpuLayers, + useMmap + }); + this._modelPromises.set(cacheKey, modelPromise); + + try { + return await modelPromise; + } catch (error) { + this._modelPromises.delete(cacheKey); + throw error; + } + } + + private async _loadModel({ + source, gpuLayers, useMmap = false + }: { + source: string | AddonGgufMetadata, gpuLayers: number, useMmap?: boolean + }) { + const model = new this._llama._bindings.AddonModel( + typeof source === "string" + ? source + : "", + removeNullFields({ + gpuLayers, + noAlloc: true, + useMmap, + useMlock: false + } satisfies AddonModelParams) + ); + + const modelLoaded = typeof source === "string" + ? 
await model.init() + : await model.init(source); + if (!modelLoaded) + throw new Error("Failed to load model"); + + return model; + } +} + function parseTensorName(tensorName?: string): { layerNumber: number | undefined } { @@ -941,49 +1540,212 @@ function isTokenEmbedLayer(layerName: string) { return firstPart === "token_embd"; } +function isGraphRelevantTensor(tensorName: string): boolean { + return isInputLayer(tensorName) || + isOutputLayer(tensorName) || + tensorName.startsWith("blk.") || + tensorName.startsWith("enc.blk.") || + tensorName.startsWith("dec.blk."); +} + function ggmlPad(value: number, padding: number): number { return ((value + padding - 1) & ~(padding - 1)); } -function getSwaPatternForArchitecture(architecture?: GgufArchitectureType, slidingWindowPattern?: number | number[]): number { - if (typeof slidingWindowPattern === "number") - return slidingWindowPattern; +function roundUpToMultiple(value: number, multiple: number): number { + if (multiple <= 1) + return value; + + return Math.ceil(value / multiple) * multiple; +} + +function resolveGraphContextSizeForOverheadEstimation({ + fullAttentionKvSize, + trainContextSize, + flashAttention, + headCount, + batchSize, + paddedContextSize, + sequences +}: { + fullAttentionKvSize: number, + trainContextSize: number | undefined, + flashAttention: boolean, + headCount: number, + batchSize: number, + paddedContextSize: number, + sequences: number +}) { + // heuristic coefficients fit to estimate llama.cpp graph-reserve behavior + const flashAttentionMinContextMultiplier = 0.5; + const flashAttentionMaxContextMultiplier = 0.78; + const flashAttentionMinHeadCountForScaling = 4; + const flashAttentionContextRatioLog2Cap = 2; + const flashAttentionContextRatioLog2Scale = 0.05; + const longContextOverflowStartRatio = 1.25; + const longContextOverflowGrowthScale = 0.1; + const longContextMaxMultiplierIncrease = 0.4; + + const normalizedTrainContextSize = trainContextSize == null || trainContextSize <= 0 + ? 
Math.max(1, fullAttentionKvSize) + : trainContextSize; + const contextRatio = Math.max(1, fullAttentionKvSize / normalizedTrainContextSize); + + if (flashAttention) { + const activeGraphTokens = roundUpToMultiple( + Math.max(1, Math.min(paddedContextSize, batchSize)), + Math.max(1, sequences) + ); + const flashContextMultiplierBase = + flashAttentionMinContextMultiplier + (1 / Math.max(flashAttentionMinHeadCountForScaling, headCount)); + const flashContextMultiplierLongContextAdjustment = + Math.min(flashAttentionContextRatioLog2Cap, Math.log2(contextRatio)) * flashAttentionContextRatioLog2Scale; + const flashContextMultiplier = Math.max( + flashAttentionMinContextMultiplier, + Math.min( + flashAttentionMaxContextMultiplier, + flashContextMultiplierBase + flashContextMultiplierLongContextAdjustment + ) + ); + + return activeGraphTokens * flashContextMultiplier; + } + + const contextOverflow = Math.max(0, contextRatio - longContextOverflowStartRatio); + const longContextMultiplier = 1 + Math.min( + longContextMaxMultiplierIncrease, + longContextOverflowGrowthScale * contextOverflow * contextOverflow + ); + + return fullAttentionKvSize * longContextMultiplier; +} + +function estimateNonFlashAttentionWorkspace({ + trainContextSize, + fullAttentionKvSize, + swaKvSize, + hasSwaAttention, + maxAttentionLayerKvSize, + maxAttentionLayerHeadCountKv, + activeGraphTokens, + headCount +}: { + trainContextSize: number | undefined, + fullAttentionKvSize: number, + swaKvSize: number, + hasSwaAttention: boolean, + maxAttentionLayerKvSize: number, + maxAttentionLayerHeadCountKv: number, + activeGraphTokens: number, + headCount: number +}) { + const floatBytes = 4; // sizeof(float) + const strongGqaMaxKvToQHeadRatio = 0.5; + const minAttentionScoreWorkspaceScale = 0.4; + const additionalAttentionScoreWorkspaceScale = 0.6; + + if (maxAttentionLayerKvSize <= 0 || activeGraphTokens <= 0 || headCount <= 0) + return 0; + + const attentionScoresWorkspace = floatBytes * activeGraphTokens * maxAttentionLayerKvSize * headCount; + const attentionMaskWorkspace = floatBytes * activeGraphTokens * ( + hasSwaAttention + ? fullAttentionKvSize + swaKvSize + : maxAttentionLayerKvSize + ); + + if (!hasSwaAttention) + // source: non-FA reserve path in `llm_graph_context::build_attn_mha` + `build_attn_inp_kq_mask` in `llama-graph.cpp` + // reserves the full KQ tensor and the matching F32 attention mask for the ubatch-local graph + return attentionScoresWorkspace + attentionMaskWorkspace; + + // the explicit KQ workspace floor matches the non-FA reserve path well for MHA-like layouts, + // but it becomes too aggressive for strong GQA / MQA hybrid models where KV heads are much fewer than Q heads + if (maxAttentionLayerHeadCountKv / headCount < strongGqaMaxKvToQHeadRatio) + return attentionMaskWorkspace; + + const normalizedTrainContextSize = Math.max(1, trainContextSize ?? 
maxAttentionLayerKvSize); + const contextRatio = Math.min(1, Math.max(0, maxAttentionLayerKvSize / normalizedTrainContextSize)); + const attentionScoreWorkspaceScale = + minAttentionScoreWorkspaceScale + (additionalAttentionScoreWorkspaceScale * contextRatio); + + return (attentionScoresWorkspace * attentionScoreWorkspaceScale) + attentionMaskWorkspace; +} + +function isSwaLayerAtIndex( + architecture: GgufArchitectureType | undefined, + slidingWindowPattern: number | number[] | undefined, + layerIndex: number +): boolean { + if (layerIndex < 0) + return false; + + if (slidingWindowPattern instanceof Array) + return Boolean(slidingWindowPattern[layerIndex]); + const [defaultPattern, denseFirst] = getSwaPatternForArchitecture(architecture); + const pattern = typeof slidingWindowPattern === "number" + ? Math.max(0, Math.floor(slidingWindowPattern)) + : defaultPattern; + + if (pattern === 0) + return true; + + return denseFirst + ? (layerIndex % pattern !== 0) + : (layerIndex % pattern < (pattern - 1)); +} + +function getSwaPatternForArchitecture(architecture?: GgufArchitectureType): [pattern: number, denseFirst: boolean] { // source: `llama_model::load_hparams` in `llama-model.cpp` - calls to `hparams.set_swa_pattern` switch (architecture) { case GgufArchitectureType.llama4: - return 4; + return [4, false]; case GgufArchitectureType.afmoe: - return 4; + return [4, false]; case GgufArchitectureType.modernBert: - return 3; + return [3, true]; case GgufArchitectureType.phi3: - return 1; + return [1, false]; case GgufArchitectureType.plamo3: - return 8; + return [8, false]; case GgufArchitectureType.gemma2: - return 2; + return [2, false]; case GgufArchitectureType.gemma3: - return 6; + return [6, false]; case GgufArchitectureType.gemma3n: - return 5; + return [5, false]; case GgufArchitectureType.gemmaEmbedding: - return 6; + return [6, false]; case GgufArchitectureType.cohere2: - return 4; + return [4, false]; case GgufArchitectureType.olmo2: - return 4; + return [4, false]; case GgufArchitectureType.exaone4: - return 4; + return [4, false]; case GgufArchitectureType.exaoneMoe: - return 4; + return [4, false]; case GgufArchitectureType.gptOss: - return 2; + return [2, false]; case GgufArchitectureType.smallthinker: - return 4; + return [4, true]; } - return 1; + return [1, false]; +} + +function resolveLayerHeadDimension(defaultValue: number, swaValue: number | undefined, isSwaLayer: boolean): number { + if (isSwaLayer && swaValue != null) + return swaValue; + + return defaultValue; +} + +function doesLayerOwnKvCache(totalLayers: number, layerIndex: number, sharedKvLayers: number | undefined): boolean { + if (sharedKvLayers == null || sharedKvLayers <= 0) + return true; + + return layerIndex < Math.max(0, totalLayers - sharedKvLayers); } function resolveLayerHeadCountKv(nHeadKv: number | number[], layerIndex: number, nHead: number): number { @@ -1007,8 +1769,9 @@ function getRecurrentLayersPattern( architectureMetadata: GgufFileInfo["architectureMetadata"] ): RecurrentLayersPattern { const nHeadKv = architectureMetadata?.attention?.head_count_kv; + const nHeadKvValues: number | number[] | undefined = nHeadKv; const feedForwardLength = architectureMetadata?.feed_forward_length as number | number[] | undefined; - const hasRecurrentHeadCountKvEntry = Array.isArray(nHeadKv) && nHeadKv.some((value) => value === 0); + const hasRecurrentHeadCountKvEntry = nHeadKvValues instanceof Array && nHeadKvValues.some((value: number) => value === 0); if (architecture === GgufArchitectureType.falconH1) // 
source: `llama_model::load_hparams` in `llama-model.cpp`: @@ -1019,10 +1782,10 @@ function getRecurrentLayersPattern( // source: `llama_model::load_hparams` in `llama-model.cpp`: // `case LLM_ARCH_NEMOTRON_H / LLM_ARCH_NEMOTRON_H_MOE`: // `recurrent_layer_arr[i] = (n_head_kv(i) == 0 && n_ff(i) == 0)` - if (Array.isArray(nHeadKv)) + if (nHeadKvValues instanceof Array) return { type: "headCountKvAndFeedForward", - headCountKvValues: nHeadKv, + headCountKvValues: nHeadKvValues, feedForwardLength }; @@ -1055,10 +1818,10 @@ function getRecurrentLayersPattern( interval: Math.max(1, Math.floor(architectureMetadata?.full_attention_interval)) }; - if (hasRecurrentHeadCountKvEntry) + if (nHeadKvValues instanceof Array && hasRecurrentHeadCountKvEntry) return { type: "headCountKvArray", - values: nHeadKv + values: nHeadKvValues }; return "none"; @@ -1081,7 +1844,7 @@ function isLayerRecurrent(pattern: RecurrentLayersPattern, layerIndex: number): function resolveLayerFeedForwardLength(feedForwardLength: number | number[] | undefined, layerIndex: number): number { if (typeof feedForwardLength === "number") return feedForwardLength; - else if (Array.isArray(feedForwardLength)) + else if (feedForwardLength instanceof Array) return feedForwardLength[layerIndex] ?? 0; return 0; diff --git a/src/gguf/insights/GgufInsightsConfigurationResolver.ts b/src/gguf/insights/GgufInsightsConfigurationResolver.ts index ea41dcfa..6f10dec8 100644 --- a/src/gguf/insights/GgufInsightsConfigurationResolver.ts +++ b/src/gguf/insights/GgufInsightsConfigurationResolver.ts @@ -8,7 +8,7 @@ import {resolveModelGpuLayersOption} from "./utils/resolveModelGpuLayersOption.j import {resolveContextContextSizeOption} from "./utils/resolveContextContextSizeOption.js"; import {scoreLevels} from "./utils/scoreLevels.js"; import {getRamUsageFromUnifiedVram} from "./utils/getRamUsageFromUnifiedVram.js"; -import type {GgufInsights} from "./GgufInsights.js"; +import type {GgufInsights, GgufInsightsSimulatorSession} from "./GgufInsights.js"; export const defaultTrainContextSizeForEstimationPurposes = 4096; const defaultContextSizeForUnfitContextSizeConfiguration = 2048; @@ -39,20 +39,20 @@ export class GgufInsightsConfigurationResolver { targetGpuLayers, targetContextSize, embeddingContext = false, - flashAttention = false, + flashAttention = "auto", kvCacheKeyType, kvCacheValueType, swaFullCache = false, - useMmap = this._ggufInsights._llama.supportsMmap + useMmap = "auto" }: { targetGpuLayers?: number | "max", targetContextSize?: number, embeddingContext?: boolean, - flashAttention?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, - useMmap?: boolean + useMmap?: "auto" | boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), @@ -114,7 +114,7 @@ export class GgufInsightsConfigurationResolver { public async scoreModelConfigurationCompatibility({ contextSize = Math.min(4096, this._ggufInsights.trainContextSize ?? 
4096), embeddingContext = false, - flashAttention = false, + flashAttention = "auto", kvCacheKeyType, kvCacheValueType, swaFullCache = false, @@ -122,11 +122,11 @@ export class GgufInsightsConfigurationResolver { maximumUnfitConfigurationResourceMultiplier = 100, forceStrictContextSize = false, forceGpuLayers, - useMmap = this._ggufInsights._llama.supportsMmap + useMmap = "auto" }: { contextSize?: number, embeddingContext?: boolean, - flashAttention?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, @@ -141,7 +141,7 @@ export class GgufInsightsConfigurationResolver { forceStrictContextSize?: boolean, forceGpuLayers?: number | "max", - useMmap?: boolean + useMmap?: "auto" | boolean } = {}, { getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), @@ -179,6 +179,7 @@ export class GgufInsightsConfigurationResolver { resolvedValues: { gpuLayers: number, contextSize: number, + useMmap: boolean, modelRamUsage: number, contextRamUsage: number, @@ -201,10 +202,14 @@ export class GgufInsightsConfigurationResolver { let resolvedGpuLayers = (forceGpuLayers == null || forceGpuLayers == "max") ? this.ggufInsights.totalLayers : forceGpuLayers; + let resolvedUseMmap = useMmap === "auto" + ? this._ggufInsights._getUseMmap() + : useMmap; let gpuLayersFitMemory = false; + const simulatorSession = this._ggufInsights._createSimulatorSession(); try { - resolvedGpuLayers = await this.resolveModelGpuLayers( + const layersResolution = await this.resolveModelGpuLayersV2( forceGpuLayers != null ? forceGpuLayers : embeddingContext @@ -229,9 +234,13 @@ export class GgufInsightsConfigurationResolver { defaultContextKvCacheKeyType: kvCacheKeyType, defaultContextKvCacheValueType: kvCacheValueType, ignoreMemorySafetyChecks: forceGpuLayers != null, - useMmap + useMmap, + + _simulatorSession: simulatorSession } ); + resolvedGpuLayers = layersResolution.gpuLayers; + resolvedUseMmap = layersResolution.useMmap; gpuLayersFitMemory = true; } catch (err) { if (!(err instanceof InsufficientMemoryError)) @@ -239,9 +248,11 @@ export class GgufInsightsConfigurationResolver { } const canUseGpu = llamaSupportsGpuOffloading && llamaGpu !== false; - const estimatedModelResourceUsage = this._ggufInsights.estimateModelResourceRequirements({ + const estimatedModelResourceUsage = await this._ggufInsights.estimateModelResourceRequirementsV2({ gpuLayers: resolvedGpuLayers, - useMmap + useMmap: resolvedUseMmap, + + _simulatorSession: simulatorSession }); let resolvedContextSize = forceStrictContextSize @@ -287,7 +298,10 @@ export class GgufInsightsConfigurationResolver { flashAttention, kvCacheKeyType, kvCacheValueType, - swaFullCache + swaFullCache, + useMmap: resolvedUseMmap, + + _simulatorSession: simulatorSession }); contextFitsMemory = true; @@ -302,14 +316,17 @@ export class GgufInsightsConfigurationResolver { throw err; } - const estimatedContextResourceUsage = this._ggufInsights.estimateContextResourceRequirements({ + const estimatedContextResourceUsage = await this._ggufInsights.estimateContextResourceRequirementsV2({ contextSize: resolvedContextSize, isEmbeddingContext: embeddingContext, modelGpuLayers: resolvedGpuLayers, flashAttention, swaFullCache, kvCacheKeyType, - kvCacheValueType + kvCacheValueType, + + _simulatorSession: simulatorSession, + useMmap: resolvedUseMmap }); const rankPoints = { @@ -387,6 +404,7 @@ 
export class GgufInsightsConfigurationResolver { resolvedValues: { gpuLayers: resolvedGpuLayers, contextSize: resolvedContextSize, + useMmap: resolvedUseMmap, modelRamUsage: estimatedModelResourceUsage.cpuRam, contextRamUsage: estimatedContextResourceUsage.cpuRam, @@ -399,22 +417,63 @@ export class GgufInsightsConfigurationResolver { }; } - public async resolveModelGpuLayers(gpuLayers?: LlamaModelOptions["gpuLayers"], { - ignoreMemorySafetyChecks = false, - getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, - llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, - defaultContextFlashAttention = false, - defaultContextKvCacheKeyType, - defaultContextKvCacheValueType, - defaultContextSwaFullCache = false, - useMmap = this._ggufInsights._llama.supportsMmap - }: { + /** + * @deprecated use `resolveModelGpuLayersV2` instead + */ + public async resolveModelGpuLayers(gpuLayers?: LlamaModelOptions["gpuLayers"], options: { ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, - llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: boolean, + llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: LlamaContextOptions["flashAttention"], defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache?: boolean, useMmap?: boolean } = {}) { + const { + ignoreMemorySafetyChecks = false, + getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), + llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, + llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, + defaultContextFlashAttention = "auto", + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache = false, + useMmap = this._ggufInsights._defaultUseMmap ?? 
this._ggufInsights._llama.supportsMmap + } = options; + + return (await this.resolveModelGpuLayersV2(gpuLayers, { + ignoreMemorySafetyChecks, + getVramState, + llamaVramPaddingSize, llamaGpu, + llamaSupportsGpuOffloading, + defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache, + useMmap + })).gpuLayers; + } + + public async resolveModelGpuLayersV2(gpuLayers?: LlamaModelOptions["gpuLayers"], options: { + ignoreMemorySafetyChecks?: boolean, getVramState?(): Promise<{total: number, free: number}>, + llamaVramPaddingSize?: number, llamaGpu?: BuildGpu, llamaSupportsGpuOffloading?: boolean, defaultContextFlashAttention?: LlamaContextOptions["flashAttention"], + defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache?: boolean, + useMmap?: "auto" | boolean, + + /** @internal */ + _simulatorSession?: GgufInsightsSimulatorSession + } = {}) { + const { + ignoreMemorySafetyChecks = false, + getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), + llamaVramPaddingSize = this._ggufInsights._llama.vramPaddingSize, llamaGpu = this._ggufInsights._llama.gpu, + llamaSupportsGpuOffloading = this._ggufInsights._llama.supportsGpuOffloading, + defaultContextFlashAttention = "auto", + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache = false, + useMmap = "auto", + + _simulatorSession + } = options; + return resolveModelGpuLayersOption(gpuLayers, { ggufInsights: this._ggufInsights, ignoreMemorySafetyChecks, @@ -426,7 +485,9 @@ export class GgufInsightsConfigurationResolver { defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache, - useMmap + useMmap, + simulatorSession: _simulatorSession, + vramCapIsSet: this._ggufInsights._llama.getVramCap() != null }); } @@ -435,28 +496,14 @@ export class GgufInsightsConfigurationResolver { * * If there's no context size that can fit the available resources, an `InsufficientMemoryError` is thrown. 
*/ - public async resolveContextContextSize(contextSize: LlamaContextOptions["contextSize"], { - modelGpuLayers, - batchSize, - modelTrainContextSize, - flashAttention = false, - kvCacheKeyType, - kvCacheValueType, - swaFullCache = false, - getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), - getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), - getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), - llamaGpu = this._ggufInsights._llama.gpu, - ignoreMemorySafetyChecks = false, - isEmbeddingContext = false, - sequences = getDefaultContextSequences() - }: { + public async resolveContextContextSize(contextSize: LlamaContextOptions["contextSize"], options: { modelGpuLayers: number, modelTrainContextSize: number, - flashAttention?: boolean, + flashAttention?: LlamaContextOptions["flashAttention"], kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache?: boolean, + useMmap?: boolean, batchSize?: LlamaContextOptions["batchSize"], sequences?: number, getVramState?(): Promise<{total: number, free: number, unifiedSize: number}>, @@ -464,8 +511,31 @@ export class GgufInsightsConfigurationResolver { getSwapState?(): Promise<{total: number, free: number}>, llamaGpu?: BuildGpu, ignoreMemorySafetyChecks?: boolean, - isEmbeddingContext?: boolean + isEmbeddingContext?: boolean, + + /** @internal */ + _simulatorSession?: GgufInsightsSimulatorSession }) { + const { + modelGpuLayers, + batchSize, + modelTrainContextSize, + flashAttention = "auto", + kvCacheKeyType, + kvCacheValueType, + swaFullCache = false, + useMmap = this._ggufInsights._defaultUseMmap ?? this._ggufInsights._llama.supportsMmap, + getVramState = (() => this._ggufInsights._llama._vramOrchestrator.getMemoryState()), + getRamState = (async () => this._ggufInsights._llama._ramOrchestrator.getMemoryState()), + getSwapState = (() => this._ggufInsights._llama._swapOrchestrator.getMemoryState()), + llamaGpu = this._ggufInsights._llama.gpu, + ignoreMemorySafetyChecks = false, + isEmbeddingContext = false, + sequences = getDefaultContextSequences(), + + _simulatorSession + } = options; + return await resolveContextContextSizeOption({ contextSize, batchSize, @@ -477,12 +547,17 @@ export class GgufInsightsConfigurationResolver { kvCacheKeyType, kvCacheValueType, swaFullCache, + useMmap, getVramState, getRamState, getSwapState, llamaGpu, ignoreMemorySafetyChecks, - isEmbeddingContext + isEmbeddingContext, + + simulatorSession: _simulatorSession, + ramCapIsSet: this._ggufInsights._llama.getRamCap() != null, + vramCapIsSet: this._ggufInsights._llama.getVramCap() != null }); } diff --git a/src/gguf/insights/utils/resolveContextContextSizeOption.ts b/src/gguf/insights/utils/resolveContextContextSizeOption.ts index ba0e4ae7..f1ccf5e1 100644 --- a/src/gguf/insights/utils/resolveContextContextSizeOption.ts +++ b/src/gguf/insights/utils/resolveContextContextSizeOption.ts @@ -1,5 +1,5 @@ import {LlamaContextOptions} from "../../../evaluator/LlamaContext/types.js"; -import {GgufInsights} from "../GgufInsights.js"; +import {GgufInsights, GgufInsightsSimulatorSession} from "../GgufInsights.js"; import {BuildGpu} from "../../../bindings/types.js"; import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; @@ -9,173 +9,214 @@ import type {GgmlType} from "../../types/GgufTensorInfoTypes.js"; const 
defaultMaxContextSizeSwapUse = 2048; -export async function resolveContextContextSizeOption({ - contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, - kvCacheKeyType, kvCacheValueType, swaFullCache, - getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, - maxContextSizeSwapUse = defaultMaxContextSizeSwapUse -}: { +export async function resolveContextContextSizeOption(options: { contextSize?: LlamaContextOptions["contextSize"], batchSize?: LlamaContextOptions["batchSize"], sequences: number, modelFileInsights: GgufInsights, modelGpuLayers: number, modelTrainContextSize: number, - flashAttention: boolean, + flashAttention: LlamaContextOptions["flashAttention"], kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache: boolean, + useMmap?: boolean, getVramState(): Promise<{total: number, free: number, unifiedSize: number}>, getRamState(): Promise<{total: number, free: number}>, getSwapState(): Promise<{total: number, free: number}>, llamaGpu: BuildGpu, ignoreMemorySafetyChecks?: boolean, isEmbeddingContext?: boolean, - maxContextSizeSwapUse?: number + maxContextSizeSwapUse?: number, + simulatorSession?: GgufInsightsSimulatorSession, + ramCapIsSet?: boolean, + vramCapIsSet?: boolean }): Promise { - if (contextSize == null) - contextSize = "auto"; + const { + contextSize: _contextSize, batchSize, sequences, modelFileInsights, modelGpuLayers, modelTrainContextSize, flashAttention, + kvCacheKeyType, kvCacheValueType, swaFullCache, useMmap = modelFileInsights._llama.supportsMmap, + getVramState, getRamState, getSwapState, ignoreMemorySafetyChecks = false, isEmbeddingContext = false, + maxContextSizeSwapUse = defaultMaxContextSizeSwapUse, + + simulatorSession: _simulatorSession, + ramCapIsSet = false, + vramCapIsSet = false + } = options; + let contextSize = _contextSize; + + const simulatorSession = _simulatorSession ?? modelFileInsights._createSimulatorSession(); + + try { + if (contextSize == null) + contextSize = "auto"; + + if (typeof contextSize === "number") { + const resolvedContextSize = Math.max(1, Math.floor(contextSize)); + + if (ignoreMemorySafetyChecks) + return resolvedContextSize; + + const [ + vramState, + ramState, + swapState + ] = await Promise.all([ + getVramState(), + getRamState(), + getSwapState() + ]); + const contextResourceRequirements = await modelFileInsights.estimateContextResourceRequirementsV2({ + contextSize: resolvedContextSize, + batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}), + modelGpuLayers: modelGpuLayers, + sequences, + flashAttention, + kvCacheKeyType, + kvCacheValueType, + swaFullCache, + isEmbeddingContext, + + _simulatorSession: simulatorSession, + useMmap + }); - if (typeof contextSize === "number") { - const resolvedContextSize = Math.max(1, Math.floor(contextSize)); + if (contextResourceRequirements.gpuVram > vramState.free) + throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else if (contextResourceRequirements.cpuRam > ( + ramState.free + swapState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + )) + throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? 
` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); - if (ignoreMemorySafetyChecks) return resolvedContextSize; + } else if (contextSize === "auto" || typeof contextSize === "object") { + const [ + vramState, + ramState, + swapState + ] = await Promise.all([ + getVramState(), + getRamState(), + getSwapState() + ]); + + const maxContextSize = contextSize === "auto" + ? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) + : Math.min( + contextSize.max ?? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}), + getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) + ); + + const minContextSize = contextSize === "auto" + ? minAllowedContextSizeInCalculations + : Math.max( + contextSize.min ?? minAllowedContextSizeInCalculations, + minAllowedContextSizeInCalculations + ); + + let highestCompatibleContextSize: number | null = null; + let step = -Math.max(1, Math.floor((maxContextSize - minContextSize) / 4)); + for (let testContextSize = maxContextSize; testContextSize >= minContextSize && testContextSize <= maxContextSize;) { + const contextResourceRequirements = await modelFileInsights.estimateContextResourceRequirementsV2({ + contextSize: testContextSize, + batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: testContextSize, sequences}), + modelGpuLayers: modelGpuLayers, + sequences, + flashAttention, + kvCacheKeyType, + kvCacheValueType, + swaFullCache, + isEmbeddingContext, + + _simulatorSession: simulatorSession, + useMmap + }); + + if (contextResourceRequirements.gpuVram <= vramState.free && + contextResourceRequirements.cpuRam <= ( + ramState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + ( + testContextSize <= maxContextSizeSwapUse + ? swapState.free + : 0 + ) + ) + ) { + if (highestCompatibleContextSize == null || testContextSize >= highestCompatibleContextSize) { + highestCompatibleContextSize = testContextSize; + + if (step === -1 || testContextSize === maxContextSize) + break; + else if (step < 0) + step = Math.max(1, Math.floor(-step / 2)); + } else if (testContextSize < highestCompatibleContextSize) { + testContextSize = highestCompatibleContextSize; + step = Math.max(1, Math.floor(Math.abs(step) / 2)); + } + } else if (step > 0) + step = -Math.max(1, Math.floor(step / 2)); + + if (testContextSize == minContextSize && step === -1) + break; + + testContextSize += step; + if (testContextSize < minContextSize) { + testContextSize = minContextSize; + step = Math.max(1, Math.floor(Math.abs(step) / 2)); + } else if (testContextSize > maxContextSize) { + testContextSize = maxContextSize; + step = -Math.max(1, Math.floor(Math.abs(step) / 2)); + } + } + + if (highestCompatibleContextSize != null) + return highestCompatibleContextSize; + + if (ignoreMemorySafetyChecks) + return minContextSize; - const [ - vramState, - ramState, - swapState - ] = await Promise.all([ - getVramState(), - getRamState(), - getSwapState() - ]); - const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ - contextSize: resolvedContextSize, - batchSize: batchSize ?? 
getDefaultContextBatchSize({contextSize: resolvedContextSize, sequences}), - modelGpuLayers: modelGpuLayers, - sequences, - flashAttention, - kvCacheKeyType, - kvCacheValueType, - swaFullCache, - isEmbeddingContext - }); - - if (contextResourceRequirements.gpuVram > vramState.free) - throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`); - else if (contextResourceRequirements.cpuRam > ( - ramState.free + swapState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) - )) - throw new InsufficientMemoryError(`A context size of ${resolvedContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`); - - return resolvedContextSize; - } else if (contextSize === "auto" || typeof contextSize === "object") { - const [ - vramState, - ramState, - swapState - ] = await Promise.all([ - getVramState(), - getRamState(), - getSwapState() - ]); - - const maxContextSize = contextSize === "auto" - ? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) - : Math.min( - contextSize.max ?? getDefaultModelContextSize({trainContextSize: modelTrainContextSize}), - getDefaultModelContextSize({trainContextSize: modelTrainContextSize}) - ); - - const minContextSize = contextSize === "auto" - ? minAllowedContextSizeInCalculations - : Math.max( - contextSize.min ?? minAllowedContextSizeInCalculations, - minAllowedContextSizeInCalculations - ); - - let highestCompatibleContextSize: number | null = null; - let step = -Math.max(1, Math.floor((maxContextSize - minContextSize) / 4)); - for (let testContextSize = maxContextSize; testContextSize >= minContextSize && testContextSize <= maxContextSize;) { - const contextResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ - contextSize: testContextSize, - batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: testContextSize, sequences}), + const minContextSizeResourceRequirements = await modelFileInsights.estimateContextResourceRequirementsV2({ + contextSize: minContextSize, + batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: minContextSize, sequences}), modelGpuLayers: modelGpuLayers, sequences, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, - isEmbeddingContext + isEmbeddingContext, + + _simulatorSession: simulatorSession, + useMmap }); - if (contextResourceRequirements.gpuVram <= vramState.free && - contextResourceRequirements.cpuRam <= ( - ramState.free - getRamUsageFromUnifiedVram(contextResourceRequirements.gpuVram, vramState) + ( - testContextSize <= maxContextSizeSwapUse - ? 
swapState.free - : 0 - ) - ) - ) { - if (highestCompatibleContextSize == null || testContextSize >= highestCompatibleContextSize) { - highestCompatibleContextSize = testContextSize; - - if (step === -1) - break; - else if (step < 0) - step = Math.max(1, Math.floor(-step / 2)); - } - } else if (step > 0) - step = -Math.max(1, Math.floor(step / 2)); - - if (testContextSize == minContextSize && step === -1) - break; - - testContextSize += step; - if (testContextSize < minContextSize) { - testContextSize = minContextSize; - step = Math.max(1, Math.floor(Math.abs(step) / 2)); - } else if (testContextSize > maxContextSize) { - testContextSize = maxContextSize; - step = -Math.max(1, Math.floor(Math.abs(step) / 2)); - } + const unifiedRamUsage = getRamUsageFromUnifiedVram(minContextSizeResourceRequirements.gpuVram, vramState); + if (minContextSizeResourceRequirements.gpuVram > vramState.free && + minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage + ) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM and RAM${swapState.total > 0 ? " (including swap)" : ""}` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else if (minContextSizeResourceRequirements.gpuVram > vramState.free) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else if (minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else if (minContextSizeResourceRequirements.cpuRam > ramState.free - unifiedRamUsage) + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); + else + throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available resources` + getCapErrorMessage(ramCapIsSet, vramCapIsSet)); } - if (highestCompatibleContextSize != null) - return highestCompatibleContextSize; - - if (ignoreMemorySafetyChecks) - return minContextSize; - - const minContextSizeResourceRequirements = modelFileInsights.estimateContextResourceRequirements({ - contextSize: minContextSize, - batchSize: batchSize ?? getDefaultContextBatchSize({contextSize: minContextSize, sequences}), - modelGpuLayers: modelGpuLayers, - sequences, - flashAttention, - kvCacheKeyType, - kvCacheValueType, - swaFullCache, - isEmbeddingContext - }); - - const unifiedRamUsage = getRamUsageFromUnifiedVram(minContextSizeResourceRequirements.gpuVram, vramState); - if (minContextSizeResourceRequirements.gpuVram > vramState.free && - minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage - ) - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM and RAM${swapState.total > 0 ? 
" (including swap)" : ""}`); - else if (minContextSizeResourceRequirements.gpuVram > vramState.free) - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available VRAM`); - else if (minContextSizeResourceRequirements.cpuRam > ramState.free + swapState.free - unifiedRamUsage) - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM${swapState.total > 0 ? " (including swap)" : ""}`); - else if (minContextSizeResourceRequirements.cpuRam > ramState.free - unifiedRamUsage) - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available RAM`); - else - throw new InsufficientMemoryError(`A context size of ${minContextSize}${sequences > 1 ? ` with ${sequences} sequences` : ""} is too large for the available resources`); + throw new Error(`Invalid context size: "${contextSize}"`); + } finally { + if (_simulatorSession == null) + await simulatorSession.dispose(); } +} + +function getCapErrorMessage(ramCapIsSet: boolean, vramCapIsSet: boolean) { + if (ramCapIsSet && vramCapIsSet) + return " (RAM and VRAM caps are set, consider increasing or removing the caps to allow more memory to be used)"; + else if (vramCapIsSet) + return " (VRAM cap is set, consider increasing or removing the cap to allow more VRAM to be used)"; + else if (ramCapIsSet) + return " (RAM cap is set, consider increasing or removing the cap to allow more RAM to be used)"; - throw new Error(`Invalid context size: "${contextSize}"`); + return ""; } diff --git a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts index 5c544744..fb74b749 100644 --- a/src/gguf/insights/utils/resolveModelGpuLayersOption.ts +++ b/src/gguf/insights/utils/resolveModelGpuLayersOption.ts @@ -1,104 +1,236 @@ import {LlamaModelOptions} from "../../../evaluator/LlamaModel/LlamaModel.js"; import {BuildGpu} from "../../../bindings/types.js"; import {InsufficientMemoryError} from "../../../utils/InsufficientMemoryError.js"; -import {findBestOption} from "../../../utils/findBestOption.js"; +import {findFirstNonNullBestOptionAsync} from "../../../utils/findBestOption.js"; import {getDefaultContextBatchSize, getDefaultModelContextSize} from "../../../evaluator/LlamaContext/LlamaContext.js"; import {minAllowedContextSizeInCalculations} from "../../../config.js"; import {scoreLevels} from "./scoreLevels.js"; +import type {LlamaContextOptions} from "../../../evaluator/LlamaContext/types.js"; import type {GgmlType} from "../../types/GgufTensorInfoTypes.js"; -import type {GgufInsights} from "../GgufInsights.js"; +import type {GgufInsights, GgufInsightsSimulatorSession} from "../GgufInsights.js"; const fitContextExtraMemoryPaddingPercentage = 0.5; +const vramWastePercentageToPreferDisablingMmap = 0.2; +const contextSizeMissPercentageToPreferDisablingMmap = 0.2; -export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], { - ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, - llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, - defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap -}: { +export async function resolveModelGpuLayersOption(gpuLayers: LlamaModelOptions["gpuLayers"], options: { 
ggufInsights: GgufInsights, ignoreMemorySafetyChecks?: boolean, getVramState(): Promise<{total: number, free: number}>, llamaVramPaddingSize: number, llamaGpu: BuildGpu, - llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: boolean, + llamaSupportsGpuOffloading: boolean, defaultContextFlashAttention: LlamaContextOptions["flashAttention"], defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache: boolean, - useMmap?: boolean -}): Promise { - if (gpuLayers == null) - gpuLayers = "auto"; - - if (!llamaSupportsGpuOffloading) - return 0; - - if (gpuLayers === "max" || typeof gpuLayers === "number") { - const resolvedGpuLayers = typeof gpuLayers === "number" - ? Math.max(0, Math.min(ggufInsights.totalLayers, gpuLayers)) - : ggufInsights.totalLayers; - - if (ignoreMemorySafetyChecks) - return resolvedGpuLayers; - - const vramState = await getVramState(); - const maxLayersRequirements = getVramRequiredForGpuLayers({ - gpuLayers: resolvedGpuLayers, - ggufInsights, - currentVram: vramState.free, - defaultContextFlashAttention, - defaultContextKvCacheKeyType, - defaultContextKvCacheValueType, - defaultContextSwaFullCache, - useMmap - }); - - if (maxLayersRequirements == null) - throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings"); - - return resolvedGpuLayers; - } else if (gpuLayers === "auto" || typeof gpuLayers === "object") { - if (llamaGpu === false) - return 0; - - const vramState = await getVramState(); - if (vramState.total === 0) - return 0; - - let freeVram = vramState.free; - if (typeof gpuLayers === "object" && gpuLayers.fitContext?.contextSize != null) { - freeVram -= llamaVramPaddingSize * fitContextExtraMemoryPaddingPercentage; + useMmap?: "auto" | boolean, simulatorSession?: GgufInsightsSimulatorSession, vramCapIsSet?: boolean +}): Promise<{gpuLayers: number, useMmap: boolean}> { + const { + ggufInsights, ignoreMemorySafetyChecks = false, getVramState, llamaVramPaddingSize, + llamaGpu, llamaSupportsGpuOffloading, defaultContextFlashAttention, + defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache, useMmap = "auto", + simulatorSession: _simulatorSession, vramCapIsSet = false + } = options; + + const simulatorSession = _simulatorSession ?? ggufInsights._createSimulatorSession(); + + try { + if (gpuLayers == null) + gpuLayers = "auto"; + + if (!llamaSupportsGpuOffloading) + return {gpuLayers: 0, useMmap: useMmap === "auto" ? ggufInsights._getUseMmap() : useMmap}; + + if (gpuLayers === "max" || typeof gpuLayers === "number") { + const resolvedGpuLayers = typeof gpuLayers === "number" + ? 
Math.max(0, Math.min(ggufInsights.totalLayers, gpuLayers)) + : ggufInsights.totalLayers; + const vramState = await getVramState(); + + const getVramNeeds = (useMmap: boolean) => getVramRequiredForGpuLayers({ + gpuLayers: resolvedGpuLayers, + ggufInsights, + currentVram: vramState.free, + defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache, + useMmap, + simulatorSession + }); + const getPreferredResolvedLayers = async () => { + if (useMmap !== "auto") + return await getVramNeeds(useMmap); + + const [ + withMmap, + withoutMmap + ] = await Promise.all([ + getVramNeeds(true), + getVramNeeds(false) + ]); + + if (withoutMmap != null && withMmap == null) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + typeof gpuLayers === "number" && + withoutMmap.totalVram <= withMmap.totalVram * (1 - vramWastePercentageToPreferDisablingMmap) + ) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + withoutMmap.gpuLayers > withMmap.gpuLayers + ) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + withoutMmap.contextSize >= withMmap.contextSize * (1 + contextSizeMissPercentageToPreferDisablingMmap) + ) + return withoutMmap; + + return withMmap ?? withoutMmap; + }; + + if (ignoreMemorySafetyChecks) + return { + gpuLayers: resolvedGpuLayers, + useMmap: useMmap === "auto" + ? gpuLayers === "max" + ? true + : (await getPreferredResolvedLayers())?.useMmap ?? false + : useMmap + }; + + const maxLayersRequirements = (useMmap !== "auto" || gpuLayers === "max") + ? await getVramNeeds( + useMmap === "auto" + ? ggufInsights._getUseMmap() + : useMmap + ) + : await getPreferredResolvedLayers(); + + if (maxLayersRequirements == null) + throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings" + getCapErrorMessage(vramCapIsSet)); + + return { + gpuLayers: resolvedGpuLayers, + useMmap: maxLayersRequirements.useMmap + }; + } else if (gpuLayers === "auto" || typeof gpuLayers === "object") { + if (llamaGpu === false) + return {gpuLayers: 0, useMmap: useMmap === "auto" ? ggufInsights._getUseMmap() : useMmap}; + + const vramState = await getVramState(); + if (vramState.total === 0) + return {gpuLayers: 0, useMmap: useMmap === "auto" ? ggufInsights._getUseMmap() : useMmap}; + + let freeVram = vramState.free; + if (typeof gpuLayers === "object" && gpuLayers.fitContext?.contextSize != null) { + freeVram -= llamaVramPaddingSize * fitContextExtraMemoryPaddingPercentage; + + if (freeVram < 0) + freeVram = 0; + } - if (freeVram < 0) - freeVram = 0; + const getGpuLayersForMmapOptions = (useMmap: boolean) => getBestGpuLayersForFreeVram({ + ggufInsights, + freeVram, + fitContext: typeof gpuLayers === "object" + ? gpuLayers.fitContext + : undefined, + minGpuLayers: typeof gpuLayers === "object" + ? gpuLayers.min + : undefined, + maxGpuLayers: typeof gpuLayers === "object" + ? gpuLayers.max + : undefined, + defaultContextFlashAttention, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + defaultContextSwaFullCache, + useMmap, + simulatorSession + }); + const getGpuLayersForMmapOptionsWithResourceRequirements = async (useMmap: boolean) => { + const resolvedLayers = await getGpuLayersForMmapOptions(useMmap); + if (resolvedLayers == null) + return null; + + return getVramRequiredForGpuLayers({ + gpuLayers: resolvedLayers, + ggufInsights, + currentVram: freeVram, + fitContext: typeof gpuLayers === "object" + ? 
gpuLayers.fitContext + : undefined, + defaultContextFlashAttention, + defaultContextSwaFullCache, + defaultContextKvCacheKeyType, + defaultContextKvCacheValueType, + useMmap, + simulatorSession + }); + }; + const getPreferredResolvedLayers = async () => { + if (useMmap !== "auto") + return { + gpuLayers: await getGpuLayersForMmapOptions(useMmap), + useMmap + }; + + const [ + withMmap, + withoutMmap + ] = await Promise.all([ + getGpuLayersForMmapOptionsWithResourceRequirements(true), + getGpuLayersForMmapOptionsWithResourceRequirements(false) + ]); + + if (withoutMmap != null && withMmap == null) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + typeof gpuLayers === "number" && + withoutMmap.totalVram <= withMmap.totalVram * (1 - vramWastePercentageToPreferDisablingMmap) + ) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + withoutMmap.gpuLayers > withMmap.gpuLayers + ) + return withoutMmap; + else if (withoutMmap != null && withMmap != null && + withoutMmap.contextSize >= withMmap.contextSize * (1 + contextSizeMissPercentageToPreferDisablingMmap) + ) + return withoutMmap; + + return withMmap ?? withoutMmap; + }; + + const bestGpuLayersOption = await getPreferredResolvedLayers(); + + const hasGpuLayersRequirements = typeof gpuLayers === "object" && + (gpuLayers.min != null || gpuLayers.max != null || gpuLayers.fitContext?.contextSize != null); + + if (!ignoreMemorySafetyChecks && bestGpuLayersOption == null && hasGpuLayersRequirements) + throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings" + getCapErrorMessage(vramCapIsSet)); + + return { + gpuLayers: bestGpuLayersOption?.gpuLayers ?? 0, + useMmap: bestGpuLayersOption?.useMmap ?? ( + useMmap === "auto" + ? ggufInsights._getUseMmap() + : useMmap + ) + }; } - - const bestGpuLayersOption = getBestGpuLayersForFreeVram({ - ggufInsights, - freeVram, - fitContext: typeof gpuLayers === "object" - ? gpuLayers.fitContext - : undefined, - minGpuLayers: typeof gpuLayers === "object" - ? gpuLayers.min - : undefined, - maxGpuLayers: typeof gpuLayers === "object" - ? gpuLayers.max - : undefined, - defaultContextFlashAttention, - defaultContextKvCacheKeyType, - defaultContextKvCacheValueType, - defaultContextSwaFullCache, - useMmap - }); - - const hasGpuLayersRequirements = typeof gpuLayers === "object" && - (gpuLayers.min != null || gpuLayers.max != null || gpuLayers.fitContext?.contextSize != null); - - if (!ignoreMemorySafetyChecks && bestGpuLayersOption == null && hasGpuLayersRequirements) - throw new InsufficientMemoryError("Not enough VRAM to fit the model with the specified settings"); - - return bestGpuLayersOption ?? 
0; + + throw new Error(`Invalid gpuLayers value: ${gpuLayers}`); + } finally { + if (_simulatorSession == null) + await simulatorSession.dispose(); } +} + +function getCapErrorMessage(vramCapIsSet: boolean) { + if (vramCapIsSet) + return " (VRAM cap is set, consider increasing or removing the cap to fit more layers)"; - throw new Error(`Invalid gpuLayers value: ${gpuLayers}`); + return ""; } -function getBestGpuLayersForFreeVram({ +async function getBestGpuLayersForFreeVram({ ggufInsights, freeVram, fitContext, @@ -108,32 +240,35 @@ function getBestGpuLayersForFreeVram({ defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache, - useMmap + useMmap, + simulatorSession }: { ggufInsights: GgufInsights, freeVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, minGpuLayers?: number, maxGpuLayers?: number, - defaultContextFlashAttention: boolean, + defaultContextFlashAttention: LlamaContextOptions["flashAttention"], defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, defaultContextSwaFullCache: boolean, - useMmap?: boolean + useMmap?: boolean, + simulatorSession?: GgufInsightsSimulatorSession }) { - return findBestOption({ - *generator() { - const minLayers = Math.floor(Math.max(0, minGpuLayers ?? 0)); - const maxLayers = Math.floor(Math.min(ggufInsights.totalLayers, maxGpuLayers ?? ggufInsights.totalLayers)); + const minLayers = Math.floor(Math.max(0, minGpuLayers ?? 0)); + const maxLayers = Math.floor(Math.min(ggufInsights.totalLayers, maxGpuLayers ?? ggufInsights.totalLayers)); + return (await findFirstNonNullBestOptionAsync({ + prefill: Math.max(1, Math.min(100, Math.ceil((maxLayers - minLayers) / 3))), + *generator() { for (let layers = maxLayers; layers >= minLayers; layers--) { yield { gpuLayers: layers }; } }, - score(option) { - const layersRequirements = getVramRequiredForGpuLayers({ + async score(option) { + const layersRequirements = await getVramRequiredForGpuLayers({ gpuLayers: option.gpuLayers, ggufInsights, currentVram: freeVram, @@ -142,7 +277,8 @@ function getBestGpuLayersForFreeVram({ defaultContextSwaFullCache, defaultContextKvCacheKeyType, defaultContextKvCacheValueType, - useMmap + useMmap, + simulatorSession }); if (layersRequirements == null) @@ -153,7 +289,7 @@ function getBestGpuLayersForFreeVram({ trainContextSize: getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}) }); } - })?.gpuLayers ?? null; + }))?.gpuLayers ?? null; } function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayers: number, contextSize: number}, { @@ -180,10 +316,13 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer return scoreLevels(contextSize, [{ start: 0, - points: 2 + points: 8 + }, { + start: 512, + points: 8 }, { start: 1024, - points: 4 + points: 8 }, { start: 2048, points: gpuLayersPercentage < 0.1 ? 
1 : 8 @@ -200,72 +339,87 @@ function scoreGpuLayersAndContextCombination({gpuLayers, contextSize}: {gpuLayer return scoreGpuLayers() + scoreContextSize(); } -function getVramRequiredForGpuLayers({ - gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = false, - defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache = false, useMmap +async function getVramRequiredForGpuLayers({ + gpuLayers, ggufInsights, currentVram, fitContext, defaultContextFlashAttention = "auto", + defaultContextKvCacheKeyType, defaultContextKvCacheValueType, defaultContextSwaFullCache = false, useMmap = ggufInsights._getUseMmap(), + simulatorSession }: { gpuLayers: number, ggufInsights: GgufInsights, currentVram: number, fitContext?: {contextSize?: number, embeddingContext?: boolean}, - defaultContextFlashAttention: boolean, defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, - defaultContextSwaFullCache: boolean, useMmap?: boolean + defaultContextFlashAttention: LlamaContextOptions["flashAttention"], defaultContextKvCacheKeyType?: GgmlType, defaultContextKvCacheValueType?: GgmlType, + defaultContextSwaFullCache: boolean, useMmap?: boolean, + simulatorSession?: GgufInsightsSimulatorSession }) { - const modelVram = ggufInsights.estimateModelResourceRequirements({ + const heuristicFlashAttention = defaultContextFlashAttention === true; + const modelVram = (await ggufInsights.estimateModelResourceRequirementsV2({ gpuLayers, - useMmap - }).gpuVram; + useMmap, + _simulatorSession: simulatorSession + })).gpuVram; if (modelVram > currentVram) return null; if (fitContext != null && fitContext.contextSize != null) { - const contextVram = ggufInsights.estimateContextResourceRequirements({ + const contextVram = (await ggufInsights.estimateContextResourceRequirementsV2({ contextSize: fitContext.contextSize, batchSize: getDefaultContextBatchSize({contextSize: fitContext.contextSize, sequences: 1}), modelGpuLayers: gpuLayers, sequences: 1, isEmbeddingContext: fitContext.embeddingContext ?? false, - flashAttention: defaultContextFlashAttention, + flashAttention: heuristicFlashAttention, kvCacheKeyType: defaultContextKvCacheKeyType, kvCacheValueType: defaultContextKvCacheValueType, - swaFullCache: defaultContextSwaFullCache - }).gpuVram; + swaFullCache: defaultContextSwaFullCache, + + _simulatorSession: simulatorSession, + useMmap + })).gpuVram; const totalVram = modelVram + contextVram; if (totalVram > currentVram) return null; return { + gpuLayers, contextSize: fitContext.contextSize, contextVram, - totalVram + totalVram, + useMmap }; } - const maxContext = findMaxPossibleContextSizeForVram({ + const maxContext = await findMaxPossibleContextSizeForVram({ gpuLayers, ggufInsights, vram: currentVram - modelVram, isEmbeddingContext: fitContext?.embeddingContext ?? 
false, - flashAttention: defaultContextFlashAttention, + flashAttention: heuristicFlashAttention, kvCacheKeyType: defaultContextKvCacheKeyType, kvCacheValueType: defaultContextKvCacheValueType, - swaFullCache: defaultContextSwaFullCache + swaFullCache: defaultContextSwaFullCache, + useMmap, + simulatorSession }); if (maxContext == null || modelVram + maxContext.vram > currentVram) return null; return { + gpuLayers, contextSize: maxContext.contextSize, contextVram: maxContext.vram, - totalVram: modelVram + maxContext.vram + totalVram: modelVram + maxContext.vram, + useMmap }; } -function findMaxPossibleContextSizeForVram({ - gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache +async function findMaxPossibleContextSizeForVram({ + gpuLayers, ggufInsights, vram, isEmbeddingContext, flashAttention, kvCacheKeyType, kvCacheValueType, swaFullCache, + useMmap, simulatorSession }: { gpuLayers: number, ggufInsights: GgufInsights, vram: number, isEmbeddingContext: boolean, flashAttention: boolean, - kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache: boolean + kvCacheKeyType?: GgmlType, kvCacheValueType?: GgmlType, swaFullCache: boolean, useMmap?: boolean, + simulatorSession?: GgufInsightsSimulatorSession }) { const maxContextSize = getDefaultModelContextSize({trainContextSize: ggufInsights.trainContextSize}); @@ -273,8 +427,8 @@ function findMaxPossibleContextSizeForVram({ maxValue: maxContextSize, minValue: minAllowedContextSizeInCalculations, minStep: 1, - test(contextSize) { - const contextVram = ggufInsights.estimateContextResourceRequirements({ + async test(contextSize) { + const contextVram = (await ggufInsights.estimateContextResourceRequirementsV2({ contextSize, batchSize: getDefaultContextBatchSize({contextSize, sequences: 1}), modelGpuLayers: gpuLayers, @@ -283,8 +437,11 @@ function findMaxPossibleContextSizeForVram({ flashAttention, kvCacheKeyType, kvCacheValueType, - swaFullCache - }).gpuVram; + swaFullCache, + + _simulatorSession: simulatorSession, + useMmap: useMmap + })).gpuVram; if (contextVram <= vram) return { @@ -297,7 +454,7 @@ function findMaxPossibleContextSizeForVram({ }); } -function findMaxValidValue({ +async function findMaxValidValue({ maxValue, minValue, minStep = 1, @@ -306,29 +463,26 @@ function findMaxValidValue({ maxValue: number, minValue: number, minStep?: number, - test(value: number): T | null -}): T | null { + test(value: number): Promise +}): Promise { let step = -Math.max(minStep, Math.floor((maxValue - minValue) / 4)); let bestValue: null | {value: number, result: T} = null; for (let value = maxValue; value >= minValue;) { const result: T | null = (bestValue != null && value === bestValue.value) ? 
bestValue.result - : test(value); + : await test(value); - if (result != null) { - if (bestValue == null || value >= bestValue.value) { - bestValue = {value: value, result: result}; + if (result != null && (bestValue == null || value >= bestValue.value)) { + bestValue = {value: value, result: result}; - if (step === -minStep) - break; - else if (step < 0) - step = Math.max(minStep, Math.floor(-step / 2)); - } + if (step === -minStep || value === maxValue) + break; + else if (step < 0) + step = Math.max(minStep, Math.floor(-step / 2)); } else if (bestValue != null && value < bestValue.value) { value = bestValue.value; step = Math.max(minStep, Math.floor(Math.abs(step) / 2)); - continue; } else if (step > 0) step = -Math.max(minStep, Math.floor(step / 2)); @@ -337,7 +491,9 @@ function findMaxValidValue({ value += step; if (value < minValue) { - value = minValue; + value = bestValue != null + ? Math.max(bestValue.value, minValue) + : minValue; step = Math.max(minStep, Math.floor(Math.abs(step) / 2)); } else if (value > maxValue) { value = maxValue; diff --git a/src/gguf/parser/GgufV2Parser.ts b/src/gguf/parser/GgufV2Parser.ts index cdd66bc4..69b7b824 100644 --- a/src/gguf/parser/GgufV2Parser.ts +++ b/src/gguf/parser/GgufV2Parser.ts @@ -62,7 +62,7 @@ export class GgufV2Parser { tensorInfo: tensorReadResult?.tensorInfo, metadataSize: headerReadResult.headerSize + initialOffset, tensorInfoSize: tensorReadResult?.tensorInfoSize, - tensorDataOffset: tensorReadResult?.tensorDataOffset + infoEndOffset: tensorReadResult?.infoEndOffset }; } @@ -201,7 +201,7 @@ export class GgufV2Parser { return { tensorInfo, tensorInfoSize: readOffset.offset - initialOffset, - tensorDataOffset: fileTensorDataOffset + infoEndOffset: fileTensorDataOffset }; } }); diff --git a/src/gguf/parser/parseGguf.ts b/src/gguf/parser/parseGguf.ts index 8f7a9919..ef4d2ead 100644 --- a/src/gguf/parser/parseGguf.ts +++ b/src/gguf/parser/parseGguf.ts @@ -3,8 +3,10 @@ import {getConsoleLogPrefix} from "../../utils/getConsoleLogPrefix.js"; import {UnsupportedError} from "../../utils/UnsupportedError.js"; import {GgufReadOffset} from "../utils/GgufReadOffset.js"; import {GgufFileReader} from "../fileReaders/GgufFileReader.js"; -import {GgufFileInfo, GgufVersionParserOptions, GgufVersionParserResult} from "../types/GgufFileInfoTypes.js"; +import {GgufFileInfo, GgufFileInfoSourceData, GgufVersionParserOptions, GgufVersionParserResult} from "../types/GgufFileInfoTypes.js"; import {getGgufMetadataArchitectureData} from "../utils/getGgufMetadataArchitectureData.js"; +import {GgufFsFileReader} from "../fileReaders/GgufFsFileReader.js"; +import {Promisable, transformPromisable} from "../../utils/transformPromisable.js"; import {GgufV2Parser} from "./GgufV2Parser.js"; import {GgufV3Parser} from "./GgufV3Parser.js"; @@ -33,11 +35,21 @@ export async function parseGguf({ logWarnings }); const architectureMetadata = getGgufMetadataArchitectureData(ggufInfo.metadata); + const sourceData: Promisable | undefined = ggufInfo.infoEndOffset == null + ? undefined + : (fileReader instanceof GgufFsFileReader) + ? 
{ + type: "path", + path: fileReader.filePath, + length: ggufInfo.infoEndOffset + } + : transformPromisable(fileReader.readByteRange(0, ggufInfo.infoEndOffset), createGgufFileInfoSourceDataFromBuffer); return { version: magicAndVersion.version, tensorCount: ggufInfo.tensorCount, metadata: ggufInfo.metadata, + infoEndOffset: ggufInfo.infoEndOffset, architectureMetadata: architectureMetadata, tensorInfo: ggufInfo.tensorInfo, metadataSize: ggufInfo.metadataSize, @@ -45,6 +57,11 @@ export async function parseGguf({ totalTensorInfoSize: ggufInfo.tensorInfoSize, totalTensorCount: ggufInfo.tensorCount, totalMetadataSize: ggufInfo.metadataSize, + sourceData: sourceData == null + ? [] + : sourceData instanceof Promise + ? [await sourceData] + : [sourceData], fullTensorInfo: ggufInfo.tensorInfo, tensorInfoSize: ggufInfo.tensorInfoSize }; @@ -87,3 +104,10 @@ async function parseGgufUsingASpecificVersionParser( return await (new GgufV3Parser(specificVersionParserOptions)).parse(); } } + +function createGgufFileInfoSourceDataFromBuffer(buffer: Buffer): GgufFileInfoSourceData { + return { + type: "buffer", + buffer + }; +} diff --git a/src/gguf/readGgufFileInfo.ts b/src/gguf/readGgufFileInfo.ts index f57a0669..1f18c6e8 100644 --- a/src/gguf/readGgufFileInfo.ts +++ b/src/gguf/readGgufFileInfo.ts @@ -85,6 +85,11 @@ export async function readGgufFileInfo(pathOrUri: string, { endpoints?: ModelDownloadEndpoints } = {}) { const useNetworkReader = sourceType === "network" || (sourceType == null && (isUrl(pathOrUri) || isModelUri(pathOrUri))); + function createSource(pathOrUri: string) { + return useNetworkReader + ? {type: "uri" as const, uri: pathOrUri} + : {type: "path" as const, path: pathOrUri}; + } async function createFileReader(pathOrUri: string) { if (useNetworkReader) { @@ -125,6 +130,8 @@ export async function readGgufFileInfo(pathOrUri: string, { (tensor as Writable).filePart = splitPartNumber; } + (res as Writable).source = createSource(pathOrUri); + return res; } @@ -147,6 +154,7 @@ export async function readGgufFileInfo(pathOrUri: string, { version: first.version, tensorCount: first.tensorCount, metadata: first.metadata, + infoEndOffset: first.infoEndOffset, architectureMetadata: first.architectureMetadata, tensorInfo: first.tensorInfo, metadataSize: first.metadataSize, @@ -159,6 +167,8 @@ export async function readGgufFileInfo(pathOrUri: string, { fullTensorInfo: first.fullTensorInfo == null ? undefined : [first, ...rest].flatMap((part) => (part.fullTensorInfo ?? [])), - tensorInfoSize: first.tensorInfoSize + tensorInfoSize: first.tensorInfoSize, + source: createSource(pathOrUri), + sourceData: [first, ...rest].flatMap((part) => part.sourceData) } satisfies GgufFileInfo; } diff --git a/src/gguf/types/GgufFileInfoTypes.ts b/src/gguf/types/GgufFileInfoTypes.ts index 0ff9a20c..f506698f 100644 --- a/src/gguf/types/GgufFileInfoTypes.ts +++ b/src/gguf/types/GgufFileInfoTypes.ts @@ -16,6 +16,13 @@ export type GgufFileInfo = { readonly metadata: GgufMetadata, readonly metadataSize: number, + /** + * Offset in bytes from the start of the file to the end of the preserved GGUF info section. + * This includes the header, key-value metadata, tensor info and the alignment padding up to the tensor data section. + * Can be null if `readTensorInfo` is set to `false`. 
+ */ + readonly infoEndOffset?: number, + /** Same value as `metadata[metadata.general.architecture]`, but with merged types for convenience */ readonly architectureMetadata: MergeOptionalUnionTypes>, @@ -60,7 +67,41 @@ export type GgufFileInfo = { * * When no splicing is done, this will be the same as `tensorInfoSize`. */ - readonly totalTensorInfoSize?: number + readonly totalTensorInfoSize?: number, + + /** + * An array of source data entries from which the file info was read. + * Each entry can be either a file path or a read-only buffer containing the raw GGUF metadata section part of the file + * (including the header, key-value pairs, tensor info and alignment padding up to the tensor data section). + * + * For a single source file, this array will contain only a single entry, + * but for spliced metadata from multiple file parts, this array will contain an entry for each part, in the order they were spliced. + * + * When `readTensorInfo` is set to `false`, this will be an empty array. + */ + readonly sourceData: GgufFileInfoSourceData[], + + /** + * Indication of the source of the GGUF file info, such as the file path or URI it was read from. + */ + readonly source?: GgufFileInfoSource +}; + +export type GgufFileInfoSource = { + type: "path", + path: string +} | { + type: "uri", + uri: string +}; + +export type GgufFileInfoSourceData = { + type: "path", + path: string, + length: number +} | { + type: "buffer", + buffer: Readonly }; @@ -97,5 +138,5 @@ export type GgufVersionParserResult = { tensorInfo?: GgufTensorInfo[], metadataSize: number, tensorInfoSize?: number, - tensorDataOffset?: number + infoEndOffset?: number }; diff --git a/src/gguf/types/GgufMetadataTypes.ts b/src/gguf/types/GgufMetadataTypes.ts index 249cad32..3d53e764 100644 --- a/src/gguf/types/GgufMetadataTypes.ts +++ b/src/gguf/types/GgufMetadataTypes.ts @@ -47,6 +47,7 @@ export const enum GgufArchitectureType { gemma2 = "gemma2", gemma3 = "gemma3", gemma3n = "gemma3n", + gemma4 = "gemma4", gemmaEmbedding = "gemma-embedding", starcoder2 = "starcoder2", mamba = "mamba", @@ -64,6 +65,7 @@ export const enum GgufArchitectureType { arctic = "arctic", deepseek = "deepseek", deepseek2 = "deepseek2", + deepseek2ocr = "deepseek2-ocr", chatglm = "chatglm", glm4 = "glm4", glm4moe = "glm4moe", @@ -195,7 +197,8 @@ export enum GgufFileType { MOSTLY_TQ1_0 = 36, MOSTLY_TQ2_0 = 37, MOSTLY_MXFP4_MOE = 38, - MOSTLY_NVFP4 = 39 + MOSTLY_NVFP4 = 39, + MOSTLY_Q1_0 = 40 } @@ -299,13 +302,13 @@ export const enum GgufMetadataTokenizerTokenType { export type GgufMetadataTokenizer = { readonly ggml: { - readonly model: "no_vocab" | "none" | "llama" | "gpt2" | "bert" | "rwkv" | "t5" | "plamo2" | string, + readonly model: "no_vocab" | "none" | "llama" | "gpt2" | "bert" | "rwkv" | "t5" | "plamo2" | "gemma4" | string, readonly pre?: "default" | "llama3" | "llama-v3" | "llama-bpe" | "deepseek-llm" | "deepseek-coder" | "falcon" | "falcon3" | "pixtral" | "mpt" | "starcoder" | "gpt-2" | "phi-2" | "jina-es" | "jina-de" | "jina-v1-en" | "jina-v2-es" | "jina-v2-de" | "jina-v2-code" | "refact" | "command-r" | "qwen2" | "stablelm2" | "olmo" | "dbrx" | "smaug-bpe" | "poro-chat" | "chatglm-bpe" | "viking" | "jais" | "tekken" | "smollm" | "codeshell" | "bloom" | "gpt3-finnish" | "exaone" | "exaone4" | "chameleon" | "minerva-7b" | "megrez" | "gpt-4o" | "superbpe" | "trillion" | "bailingmoe" | "a.x-4.0" | "mellum" | "modern-bert" | - "roberta-bpe" | "deepseek-r1-qwen" | "kormo" | "qwen35" | string, + "roberta-bpe" | "deepseek-r1-qwen" | "kormo" | "qwen35" | 
"gemma4" | string, readonly tokens: readonly string[], readonly token_type: GgufMetadataTokenizerTokenType[], readonly token_type_count?: number, @@ -378,8 +381,11 @@ export type GgufMetadataDefaultArchitectureType = { readonly layer_norm_rms_epsilon?: number, readonly key_length?: number, readonly value_length?: number, + readonly key_length_swa?: number, + readonly value_length_swa?: number, readonly sliding_window?: number, readonly sliding_window_pattern?: number | number[], + readonly shared_kv_layers?: number, readonly causal?: boolean }, diff --git a/src/gguf/types/GgufTensorInfoTypes.ts b/src/gguf/types/GgufTensorInfoTypes.ts index 39e2b984..42b038a5 100644 --- a/src/gguf/types/GgufTensorInfoTypes.ts +++ b/src/gguf/types/GgufTensorInfoTypes.ts @@ -61,7 +61,8 @@ export enum GgmlType { IQ4_NL_4_8 = 37, IQ4_NL_8_8 = 38, MXFP4 = 39, // MXFP4 (1 block) - NVFP4 = 40 // NVFP4 (4 blocks, E4M3 scale) + NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) + Q1_0 = 41 } export function resolveGgmlTypeOption(option?: keyof typeof GgmlType | GgmlType) { diff --git a/src/gguf/utils/ggufQuantNames.ts b/src/gguf/utils/ggufQuantNames.ts index 3e2c5c65..b56a166a 100644 --- a/src/gguf/utils/ggufQuantNames.ts +++ b/src/gguf/utils/ggufQuantNames.ts @@ -1,10 +1,11 @@ import {GgufFileType} from "../types/GgufMetadataTypes.js"; export const ggufQuantNames = new Map([ + ["Q1_0", GgufFileType.MOSTLY_Q1_0], ["Q4_0", GgufFileType.MOSTLY_Q4_0], ["Q4_1", GgufFileType.MOSTLY_Q4_1], - ["MXFP4", GgufFileType.MOSTLY_MXFP4_MOE], - ["NVFP4", GgufFileType.MOSTLY_MXFP4_MOE], + ["MXFP4_MOE", GgufFileType.MOSTLY_MXFP4_MOE], + ["NVFP4", GgufFileType.MOSTLY_NVFP4], ["Q5_0", GgufFileType.MOSTLY_Q5_0], ["Q5_1", GgufFileType.MOSTLY_Q5_1], ["IQ2_XXS", GgufFileType.MOSTLY_IQ2_XXS], diff --git a/src/index.ts b/src/index.ts index 66d254fb..d773ecda 100644 --- a/src/index.ts +++ b/src/index.ts @@ -62,6 +62,7 @@ import {FalconChatWrapper} from "./chatWrappers/FalconChatWrapper.js"; import {AlpacaChatWrapper} from "./chatWrappers/AlpacaChatWrapper.js"; import {FunctionaryChatWrapper} from "./chatWrappers/FunctionaryChatWrapper.js"; import {GemmaChatWrapper} from "./chatWrappers/GemmaChatWrapper.js"; +import {Gemma4ChatWrapper} from "./chatWrappers/Gemma4ChatWrapper.js"; import {HarmonyChatWrapper} from "./chatWrappers/HarmonyChatWrapper.js"; import {TemplateChatWrapper, type TemplateChatWrapperOptions} from "./chatWrappers/generic/TemplateChatWrapper.js"; import { @@ -108,7 +109,7 @@ import { type GbnfJsonBasicStringSchema, type GbnfJsonFormatStringSchema, type GbnfJsonObjectSchema, type GbnfJsonOneOfSchema, type GbnfJsonSchema, type GbnfJsonSchemaImmutableType, type GbnfJsonSchemaToType } from "./utils/gbnfJson/types.js"; -import {type GgufFileInfo} from "./gguf/types/GgufFileInfoTypes.js"; +import {type GgufFileInfo, type GgufFileInfoSource, type GgufFileInfoSourceData} from "./gguf/types/GgufFileInfoTypes.js"; import { type GgufMetadata, type GgufMetadataLlmToType, GgufArchitectureType, GgufFileType, GgufMetadataTokenizerTokenType, GgufMetadataArchitecturePoolingType, type GgufMetadataGeneral, type GgufMetadataTokenizer, type GgufMetadataDefaultArchitectureType, @@ -231,6 +232,7 @@ export { AlpacaChatWrapper, FunctionaryChatWrapper, GemmaChatWrapper, + Gemma4ChatWrapper, HarmonyChatWrapper, TemplateChatWrapper, type TemplateChatWrapperOptions, @@ -300,6 +302,8 @@ export { LlamaLogLevelGreaterThanOrEqual, readGgufFileInfo, type GgufFileInfo, + type GgufFileInfoSource, + type GgufFileInfoSourceData, type GgufMetadata, type GgufTensorInfo, 
type GgufMetadataLlmToType, diff --git a/src/utils/GitHubClient.ts b/src/utils/GitHubClient.ts new file mode 100644 index 00000000..301a10ec --- /dev/null +++ b/src/utils/GitHubClient.ts @@ -0,0 +1,184 @@ +const defaultGitHubApiBase = "https://api.github.com"; +const defaultGitHubApiVersion: GitHubApiVersion = "2022-11-28"; + +type GitHubApiVersion = "2022-11-28" | (string & {}); + +type GitHubClientOptions = { + token?: string, + + /** + * GitHub REST API base URL. + * + * Defaults to `https://api.github.com`. + */ + apiBase?: string, + + /** + * GitHub REST API version header. + * + * Defaults to `"2022-11-28"`. + */ + apiVersion?: GitHubApiVersion, + + userAgent?: string +}; + +export type GitHubRelease = { + url: string, + "html_url": string, + "assets_url": string, + "upload_url": string, + + id: number, + "node_id": string, + + "tag_name": string, + "target_commitish": string, + name: string | null, + body: string | null, + + draft: boolean, + prerelease: boolean, + + "created_at": string, // ISO date-time + "published_at": string | null, // ISO date-time + + author: GitHubUser | null, + + assets: GitHubReleaseAsset[], + + "tarball_url": string | null, + "zipball_url": string | null +}; + +export type GitHubUser = { + login: string, + id: number, + "node_id": string, + "avatar_url": string, + "html_url": string, + type: string, + "site_admin": boolean +}; + +export type GitHubReleaseAsset = { + url: string, + id: number, + "node_id": string, + + name: string, + label: string | null, + "content_type": string, + state: string, + size: number, + "download_count": number, + + "browser_download_url": string, + + "created_at": string, // ISO date-time + "updated_at": string, // ISO date-time + + uploader: GitHubUser | null +}; + +export type GitHubApiError = Error & { + status: number, + url: string, + bodyText?: string, + headers?: Record +}; + +export type GitHubPullRequestStatus = { + merged: boolean, + mergeable: true | false | null, + merged_at: string | null +}; + +export class GitHubClient { + private readonly _clientOptions: GitHubClientOptions; + + public constructor(clientOptions: GitHubClientOptions = {}) { + this._clientOptions = clientOptions; + } + + public async getLatestRelease({ + owner, repo + }: { + owner: string, + repo: string + }): Promise { + return this._fetchJson( + `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/releases/latest` + ); + } + + public async getReleaseByTag({ + owner, repo, tag + }: { + owner: string, + repo: string, + tag: string + }): Promise { + return this._fetchJson( + `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/releases/tags/${encodeURIComponent(tag)}` + ); + } + + public async getPullRequestStatus({ + owner, repo, id + }: { + owner: string, + repo: string, + id: string + }): Promise { + return this._fetchJson( + `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/pulls/${encodeURIComponent(id)}` + ); + } + + private async _fetchJson( + path: string + ): Promise { + const url = this._getApiBase() + path; + + const headers: Record = { + Accept: "application/vnd.github+json", + "X-GitHub-Api-Version": this._clientOptions.apiVersion ?? 
defaultGitHubApiVersion + }; + + if (this._clientOptions.token != null && this._clientOptions.token !== "") + headers.Authorization = "Bearer " + this._clientOptions.token; + + if (this._clientOptions.userAgent != null && this._clientOptions.userAgent !== "") + headers["User-Agent"] = this._clientOptions.userAgent; + + + const res = await fetch(url, { + method: "GET", + headers + }); + + if (!res.ok) { + const err = new Error( + `GitHub API error ${res.status} ${res.statusText}` + ) as GitHubApiError; + + err.status = res.status; + err.url = url; + err.headers = Object.fromEntries(res.headers.entries()); + try { + err.bodyText = await res.text(); + } catch { + err.bodyText = undefined; + } + + throw err; + } + + return (await res.json()) as T; + } + + private _getApiBase() { + return this._clientOptions?.apiBase ?? defaultGitHubApiBase; + } +} diff --git a/src/utils/LlamaText.ts b/src/utils/LlamaText.ts index 6675b762..70ba0ec2 100644 --- a/src/utils/LlamaText.ts +++ b/src/utils/LlamaText.ts @@ -122,6 +122,10 @@ class LlamaText { return LlamaTextConstructor.compare(this, other); } + public trim(): LlamaText { + return this.trimStart().trimEnd(); + } + public trimStart(): LlamaText { const newValues = this.values.slice(); diff --git a/src/utils/LruCache.ts b/src/utils/LruCache.ts index 6d6b40a2..7f44cd17 100644 --- a/src/utils/LruCache.ts +++ b/src/utils/LruCache.ts @@ -52,6 +52,10 @@ export class LruCache { return this._cache.keys(); } + public values() { + return this._cache.values(); + } + public delete(key: Key) { this._cache.delete(key); } diff --git a/src/utils/findBestOption.ts b/src/utils/findBestOption.ts index a14b2f16..8348ed31 100644 --- a/src/utils/findBestOption.ts +++ b/src/utils/findBestOption.ts @@ -19,3 +19,118 @@ export function findBestOption({generator, score}: { return bestOption; } + +/** + * This algorithm assumes that the first non-null score is the best one and from there + * it then starts iterating by 1 index forward to find the actual best option. + * + * It prefills the next `prefill` options to jump ahead an score fewer options to find the best one faster and more efficiently. + */ +export async function findFirstNonNullBestOptionAsync({generator, score, prefill}: { + generator: () => Generator, + score: (option: O) => Promise, + prefill: number +}): Promise { + const iterator = generator(); + let iteratorDone = false; + const options: O[] = []; + const scores = new Map(); + + function getIndex(index: number) { + if (index < options.length) + return options[index]!; + + if (iteratorDone) + return undefined; + + while (options.length <= index) { + const nextOption = iterator.next(); + if (nextOption.done) { + iteratorDone = true; + return undefined; + } + + options.push(nextOption.value); + } + + return options[index]; + } + + let step = Math.max(1, (Number.isFinite(prefill) ? Math.floor(prefill) : 5)); + let currentIndex = 0; + let bestIndex: number | null = null; + let bestScore: number | null = null; + while (true) { + if (currentIndex < 0) + currentIndex = 0; + + const option = getIndex(currentIndex); + if (option == null) + break; + + const currentScore = scores.get(currentIndex) ?? 
await score(option); + if (!scores.has(currentIndex)) + scores.set(currentIndex, currentScore); + + if (currentScore == null) { + if (step < 0) + step = Math.max(1, Math.floor(-step / 2)); + + while (bestIndex != null && currentIndex + step >= bestIndex && step !== 1) + step = Math.max(1, Math.floor(step / 2)); + + let nextIndex = currentIndex + step; + if (getIndex(nextIndex) == null) { + nextIndex = options.length - 1; + if (currentIndex === nextIndex) + break; + } + currentIndex = nextIndex; + } else if (bestScore == null || currentScore > bestScore) { + bestIndex = currentIndex; + bestScore = currentScore; + + step = -Math.max(1, Math.floor(Math.abs(step) / 2)); + + let nextIndex = currentIndex + step; + if (nextIndex < 0) { + nextIndex = 0; + step = Math.max(1, Math.floor(Math.abs(step) / 2)); + nextIndex = currentIndex + step; + } + + if (getIndex(nextIndex) == null) { + nextIndex = options.length - 1; + if (currentIndex === nextIndex) + break; + } + + currentIndex = nextIndex; + } else if (bestIndex != null && currentScore < bestScore && currentIndex > bestIndex) { + step = -Math.max(1, Math.floor(Math.abs(currentIndex - bestIndex) / 2)); + currentIndex = bestIndex + step; + } else if (bestIndex != null && currentScore < bestScore && currentIndex < bestIndex) { + if (step < 0) + step = Math.max(1, Math.floor(Math.abs(bestIndex - currentIndex) / 2)); + + currentIndex = currentIndex + step; + } else if (currentScore === bestScore && currentIndex === bestIndex && + (step === 1 || currentIndex === 0) + ) { + if (scores.has(currentIndex + 1) || (iteratorDone && currentIndex === options.length - 1)) + break; + + step = 1; + currentIndex = bestIndex + step; + } else + currentIndex = currentIndex + step; + + if (iteratorDone && scores.size === options.length && bestIndex != null) + break; + } + + if (bestIndex == null) + return null; + + return options[bestIndex] ?? 
null; +} diff --git a/src/utils/gitReleaseBundles.ts b/src/utils/gitReleaseBundles.ts index e2159932..bc8dd5fb 100644 --- a/src/utils/gitReleaseBundles.ts +++ b/src/utils/gitReleaseBundles.ts @@ -108,22 +108,26 @@ async function unshallowAndSquashCurrentRepoWithSubmodulesAndSaveItAsReleaseBund } export async function getGitBundlePathForRelease(githubOwner: string, githubRepo: string, release: string) { + if (!(await isGitBundleCompatible(githubOwner, githubRepo, release))) + return null; + + if (!(await fs.pathExists(currentReleaseGitBundlePath))) + return null; + + return currentReleaseGitBundlePath; +} + +export async function isGitBundleCompatible(githubOwner: string, githubRepo: string, release: string) { const [builtinGithubOwner, builtinGithubRepo] = builtinLlamaCppGitHubRepo.split("/"); if (githubOwner !== builtinGithubOwner || githubRepo !== builtinGithubRepo) - return null; + return false; const currentBundleRelease = await getBinariesGithubRelease(); if (isGithubReleaseNeedsResolving(currentBundleRelease)) - return null; - - if (currentBundleRelease !== release) - return null; + return false; - if (!(await fs.pathExists(currentReleaseGitBundlePath))) - return null; - - return currentReleaseGitBundlePath; + return currentBundleRelease === release; } async function getCurrentTagOrBranch() { diff --git a/src/utils/resolveGithubRelease.ts b/src/utils/resolveGithubRelease.ts index ac280b94..cabc4022 100644 --- a/src/utils/resolveGithubRelease.ts +++ b/src/utils/resolveGithubRelease.ts @@ -1,6 +1,10 @@ import {getConsoleLogPrefix} from "./getConsoleLogPrefix.js"; +import {GitHubClient, GitHubRelease} from "./GitHubClient.js"; -export async function resolveGithubRelease(githubOwner: string, githubRepo: string, release: string) { +export async function resolveGithubRelease(githubOwner: string, githubRepo: string, release: string): Promise<{ + tag: string, + date: Date +}> { const githubClient = new GitHubClient(); const repo = githubOwner + "/" + githubRepo; @@ -30,176 +34,13 @@ export async function resolveGithubRelease(githubOwner: string, githubRepo: stri if (githubRelease.tag_name == null) throw new Error(`Failed to find tag of release "${release}" of "${repo}"`); - return githubRelease.tag_name; + return { + tag: githubRelease.tag_name, + date: new Date(githubRelease.created_at) + }; } export function isGithubReleaseNeedsResolving(release: string) { return release === "latest"; } -const defaultGitHubApiBase = "https://api.github.com"; -const defaultGitHubApiVersion: GitHubApiVersion = "2022-11-28"; - -type GitHubApiVersion = "2022-11-28" | (string & {}); - -type GitHubClientOptions = { - token?: string, - - /** - * GitHub REST API base URL. - * - * Defaults to `https://api.github.com`. - */ - apiBase?: string, - - /** - * GitHub REST API version header. - * - * Defaults to `"2022-11-28"`. 
- */ - apiVersion?: GitHubApiVersion, - - userAgent?: string -}; - -type GitHubRelease = { - url: string, - "html_url": string, - "assets_url": string, - "upload_url": string, - - id: number, - "node_id": string, - - "tag_name": string, - "target_commitish": string, - name: string | null, - body: string | null, - - draft: boolean, - prerelease: boolean, - - "created_at": string, // ISO date-time - "published_at": string | null, // ISO date-time - - author: GitHubUser | null, - - assets: GitHubReleaseAsset[], - - "tarball_url": string | null, - "zipball_url": string | null -}; - -type GitHubUser = { - login: string, - id: number, - "node_id": string, - "avatar_url": string, - "html_url": string, - type: string, - "site_admin": boolean -}; - -type GitHubReleaseAsset = { - url: string, - id: number, - "node_id": string, - - name: string, - label: string | null, - "content_type": string, - state: string, - size: number, - "download_count": number, - - "browser_download_url": string, - - "created_at": string, // ISO date-time - "updated_at": string, // ISO date-time - - uploader: GitHubUser | null -}; - -type GitHubApiError = Error & { - status: number, - url: string, - bodyText?: string, - headers?: Record -}; - -class GitHubClient { - private readonly _clientOptions: GitHubClientOptions; - - public constructor(clientOptions: GitHubClientOptions = {}) { - this._clientOptions = clientOptions; - } - - public async getLatestRelease({ - owner, repo - }: { - owner: string, - repo: string - }): Promise { - return this._fetchJson( - `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/releases/latest` - ); - } - - public async getReleaseByTag({ - owner, repo, tag - }: { - owner: string, - repo: string, - tag: string - }): Promise { - return this._fetchJson( - `/repos/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/releases/tags/${encodeURIComponent(tag)}` - ); - } - - private async _fetchJson( - path: string - ): Promise { - const url = this._getApiBase() + path; - - const headers: Record = { - Accept: "application/vnd.github+json", - "X-GitHub-Api-Version": this._clientOptions.apiVersion ?? defaultGitHubApiVersion - }; - - if (this._clientOptions.token != null && this._clientOptions.token !== "") - headers.Authorization = "Bearer " + this._clientOptions.token; - - if (this._clientOptions.userAgent != null && this._clientOptions.userAgent !== "") - headers["User-Agent"] = this._clientOptions.userAgent; - - - const res = await fetch(url, { - method: "GET", - headers - }); - - if (!res.ok) { - const err = new Error( - `GitHub API error ${res.status} ${res.statusText}` - ) as GitHubApiError; - - err.status = res.status; - err.url = url; - err.headers = Object.fromEntries(res.headers.entries()); - try { - err.bodyText = await res.text(); - } catch { - err.bodyText = undefined; - } - - throw err; - } - - return (await res.json()) as T; - } - - private _getApiBase() { - return this._clientOptions?.apiBase ?? 
defaultGitHubApiBase; - } -} diff --git a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts index c1e427c2..a28974cc 100644 --- a/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts +++ b/test/modelDependent/functionary/functionaryModelGpuLayersOptions.test.ts @@ -7,10 +7,11 @@ import {defaultLlamaVramPadding} from "../../../src/bindings/getLlama.js"; import {BuildGpu} from "../../../src/bindings/types.js"; describe("functionary", () => { - describe("model options", () => { - describe("Resolve the correct number of GPU layers", async () => { + describe("model options", async () => { + const llama = await getTestLlama(); + + describe.skipIf(llama.gpu === false)("Resolve the correct number of GPU layers", async () => { const modelPath = await getModelFile("functionary-small-v2.5.Q4_0.gguf"); - const llama = await getTestLlama(); const fileInfo = await readGgufFileInfo(modelPath); const ggufInsights = await GgufInsights.from(fileInfo, llama); @@ -28,7 +29,10 @@ describe("functionary", () => { totalSwap?: number, freeSwap?: number, ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu }) { - const resolvedGpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(gpuLayers, { + const { + gpuLayers: resolvedGpuLayers, + useMmap: resolvedUseMmap + } = await ggufInsights.configurationResolver.resolveModelGpuLayersV2(gpuLayers, { ignoreMemorySafetyChecks, getVramState: async () => ({ total: llamaGpu === false ? 0 : totalVram, @@ -37,13 +41,13 @@ describe("functionary", () => { llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 0 : totalVram), llamaGpu, llamaSupportsGpuOffloading: llamaGpu !== false, - useMmap: true + useMmap: "auto" }); async function resolveAutoContextSize() { const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({ targetGpuLayers: resolvedGpuLayers, - useMmap: true + useMmap: resolvedUseMmap }, { llamaGpu, getVramState: async () => ({ @@ -71,7 +75,8 @@ describe("functionary", () => { return { gpuLayers: resolvedGpuLayers, - contextSize: await resolveAutoContextSize() + contextSize: await resolveAutoContextSize(), + useMmap: resolvedUseMmap }; } @@ -84,6 +89,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers(0, { @@ -92,6 +98,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -102,6 +109,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -114,7 +122,8 @@ describe("functionary", () => { freeRam: s1GB * 6 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7680"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers(0, { @@ -125,6 +134,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -137,6 +147,7 @@ describe("functionary", () => { }); 
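The updated test helper above now resolves the GPU layer count and the mmap decision together before scoring a context size. A condensed sketch of that flow follows — the option names come from this diff, while getVramState, totalVram and llamaGpu are placeholders for the helper's mocked environment state:

const {gpuLayers, useMmap} = await ggufInsights.configurationResolver.resolveModelGpuLayersV2("auto", {
    getVramState,                                   // same mocked VRAM state as in the helper above
    llamaVramPaddingSize: defaultLlamaVramPadding(totalVram),
    llamaGpu,
    llamaSupportsGpuOffloading: llamaGpu !== false,
    useMmap: "auto"                                 // "auto" lets the resolver decide whether to mmap the model weights
});

// the resolved mmap decision is then carried into the context-size scoring step
const scoredConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({
    targetGpuLayers: gpuLayers,
    useMmap
}, {llamaGpu, getVramState /* plus the other environment stubs used by the helper */});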
expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -152,6 +163,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers(0, { @@ -163,7 +175,8 @@ describe("functionary", () => { freeSwap: s1GB * 1 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -178,6 +191,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); @@ -191,6 +205,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(16, { @@ -233,6 +248,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -243,6 +259,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -255,7 +272,8 @@ describe("functionary", () => { freeRam: s1GB * 4.5 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(16, { @@ -307,6 +325,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -318,7 +337,8 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7680"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -333,6 +353,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -343,7 +364,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 7.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("1792"); + expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -354,7 +376,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 5.3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("5632"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(16, { @@ -410,6 +433,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + 
expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -422,7 +446,8 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -438,6 +463,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(16, { @@ -497,6 +523,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -511,6 +538,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); @@ -524,6 +552,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(32, { @@ -542,6 +571,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -552,6 +582,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(32, { @@ -562,6 +593,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -575,6 +607,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(32, { @@ -597,6 +630,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -608,7 +642,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(32, { @@ -619,7 +654,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.contextSize).to.toMatchInlineSnapshot("397"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(32, { @@ -632,6 +668,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); @@ -664,6 +701,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -674,6 +712,7 @@ 
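Stepping back to the GitHubClient module added earlier in this diff (src/utils/GitHubClient.ts): it is an internal utility imported by relative path, its token is optional (unauthenticated requests are merely rate-limited), and the owner/repo/id values and the GITHUB_TOKEN variable below are placeholders — only the class API itself is taken from the diff:

import {GitHubClient} from "./GitHubClient.js";

const github = new GitHubClient({token: process.env.GITHUB_TOKEN});

const latest = await github.getLatestRelease({owner: "some-owner", repo: "some-repo"});
console.log(latest.tag_name, latest.created_at);

const release = await github.getReleaseByTag({owner: "some-owner", repo: "some-repo", tag: latest.tag_name});
console.log(release.assets.map((asset) => asset.name));

const pr = await github.getPullRequestStatus({owner: "some-owner", repo: "some-repo", id: "1234"});
console.log(pr.merged, pr.mergeable, pr.merged_at);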
describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -684,6 +723,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -697,6 +737,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -707,6 +748,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -717,6 +759,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(33, { @@ -739,6 +782,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -751,6 +795,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -761,7 +806,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.contextSize).to.toMatchInlineSnapshot("397"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -772,7 +818,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -783,7 +830,8 @@ describe("functionary", () => { llamaGpu: false }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7680"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -795,7 +843,8 @@ describe("functionary", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("7680"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -809,7 +858,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -820,7 +870,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("3072"); + expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const 
res = await resolveGpuLayers(33, { @@ -831,7 +882,8 @@ describe("functionary", () => { unifiedMemorySize: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1280"); + expect(res.contextSize).to.toMatchInlineSnapshot("3584"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(33, { @@ -856,6 +908,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } }); }); @@ -899,6 +952,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -908,7 +962,8 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("501"); + expect(res.contextSize).to.toMatchInlineSnapshot("512"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -918,12 +973,13 @@ describe("functionary", () => { freeRam: s1GB * 1 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); describe('attempts to resolve "auto"', () => { - test("8GB RAM", async () => { + test("8GB RAM", {timeout: 1000 * 60}, async () => { { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, @@ -933,6 +989,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -943,6 +1000,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -953,6 +1011,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -963,6 +1022,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -973,6 +1033,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -981,8 +1042,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -991,8 +1053,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + 
expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1001,8 +1064,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1011,8 +1075,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1021,8 +1086,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1031,8 +1097,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1041,8 +1108,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1051,8 +1119,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1062,7 +1131,8 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("7168"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1072,7 +1142,8 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1083,10 +1154,11 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - test("5GB RAM", async () => { + 
test("5GB RAM", {timeout: 1000 * 60}, async () => { { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, @@ -1095,7 +1167,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1105,7 +1178,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1115,7 +1189,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("5120"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1126,6 +1201,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1136,6 +1212,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("11"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1144,8 +1221,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("14"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1154,8 +1232,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("15"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1164,8 +1243,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1174,8 +1254,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { @@ -1184,8 +1265,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await 
resolveGpuLayers("auto", { @@ -1194,8 +1276,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1204,8 +1287,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("1280"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1214,8 +1298,9 @@ describe("functionary", () => { totalRam: s1GB * 5, freeRam: s1GB * 5 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("25"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1225,7 +1310,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("7168"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1235,7 +1321,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -1246,12 +1333,13 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); describe("attempts to resolve {min?: number, max?: number}", () => { - test("8GB RAM", async () => { + test("8GB RAM", {timeout: 1000 * 60}, async () => { { const res = await resolveGpuLayers({max: 4}, { totalVram: s1GB * 6, @@ -1261,6 +1349,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1271,6 +1360,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 2}, { @@ -1304,6 +1394,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 16}, { @@ -1324,8 +1415,9 @@ describe("functionary", () => { freeRam: s1GB * 8 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = 
await resolveGpuLayers({min: 16, max: 24}, { @@ -1336,8 +1428,9 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -1349,11 +1442,12 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - test("5GB RAM", async () => { + test("5GB RAM", {timeout: 1000 * 60}, async () => { { const res = await resolveGpuLayers({max: 4}, { totalVram: s1GB * 6, @@ -1362,7 +1456,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -1372,7 +1467,8 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.eql(0); - expect(res.contextSize).to.toMatchInlineSnapshot("2048"); + expect(res.contextSize).to.toMatchInlineSnapshot("1024"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 2}, { @@ -1406,6 +1502,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.eql(16); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 16}, { @@ -1426,8 +1523,9 @@ describe("functionary", () => { freeRam: s1GB * 5 }); expect(res.gpuLayers).to.be.gte(16); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -1438,8 +1536,9 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -1451,7 +1550,8 @@ describe("functionary", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("16"); - expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); }); @@ -1469,6 +1569,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1479,8 +1580,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - 
expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1491,8 +1593,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("9"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1503,8 +1606,9 @@ describe("functionary", () => { totalRam: s1GB * 8, freeRam: s1GB * 8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1517,6 +1621,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1529,6 +1634,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1558,6 +1664,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1568,8 +1675,9 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("21"); - expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1580,8 +1688,9 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("9"); + expect(res.contextSize).to.toMatchInlineSnapshot("7424"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1592,8 +1701,9 @@ describe("functionary", () => { totalRam: s1GB * 7, freeRam: s1GB * 7 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("18"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("20"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1606,6 +1716,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("7"); expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -1618,6 +1729,7 @@ describe("functionary", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); 
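A related search helper introduced earlier in this diff, findFirstNonNullBestOptionAsync (src/utils/findBestOption.ts), assumes the first non-null score it finds is close to the best option and then refines around it, using prefill as the initial jump size. A minimal usage sketch — the candidate shape and checkFits are hypothetical, only the helper's own signature comes from the diff:

import {findFirstNonNullBestOptionAsync} from "./findBestOption.js";

const best = await findFirstNonNullBestOptionAsync({
    generator: function* () {
        // candidate options, ordered from most to least preferred
        for (let gpuLayers = 33; gpuLayers >= 0; gpuLayers--)
            yield {gpuLayers};
    },
    async score({gpuLayers}) {
        const fits = await checkFits(gpuLayers); // hypothetical async resource check
        return fits ? gpuLayers : null;          // null marks an option that cannot be used at all
    },
    prefill: 4 // initial step size used to skip ahead before refining around the first usable option
});
// best is the highest-scoring candidate, or null if no option scored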
expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { diff --git a/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap b/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap index c69768c2..5b57f7a2 100644 --- a/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap +++ b/test/modelDependent/functionary/gguf/__snapshots__/ggufParser.test.ts.snap @@ -63,6 +63,7 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = ` "offset": 328548352, }, ], + "infoEndOffset": 7836512, "metadata": { "general": { "architecture": "llama", @@ -156,6 +157,23 @@ exports[`gguf > parser > should fetch GGUF metadata 1`] = ` }, }, "metadataSize": 7819208, + "source": { + "path": { + "_type": "path", + "path": "/functionary-small-v2.5.Q4_0.gguf", + }, + "type": "path", + }, + "sourceData": [ + { + "length": 7836512, + "path": { + "_type": "path", + "path": "/functionary-small-v2.5.Q4_0.gguf", + }, + "type": "path", + }, + ], "splicedParts": 1, "tensorCount": 291, "tensorInfo": [ @@ -274,6 +292,7 @@ exports[`gguf > parser > should parse local gguf model 1`] = ` "offset": 328548352, }, ], + "infoEndOffset": 7836512, "metadata": { "general": { "architecture": "llama", @@ -367,6 +386,16 @@ exports[`gguf > parser > should parse local gguf model 1`] = ` }, }, "metadataSize": 7819208, + "sourceData": [ + { + "length": 7836512, + "path": { + "_type": "path", + "path": "/functionary-small-v2.5.Q4_0.gguf", + }, + "type": "path", + }, + ], "splicedParts": 1, "tensorCount": 291, "tensorInfo": [ diff --git a/test/modelDependent/functionary/gguf/ggufInsights.test.ts b/test/modelDependent/functionary/gguf/ggufInsights.test.ts index fcd56a3a..82f19515 100644 --- a/test/modelDependent/functionary/gguf/ggufInsights.test.ts +++ b/test/modelDependent/functionary/gguf/ggufInsights.test.ts @@ -38,32 +38,32 @@ describe("gguf", async () => { `); expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 1}))).toMatchInlineSnapshot(` { - "cpuRam": "4.22GB", - "gpuVram": "528.01MB", + "cpuRam": "3.93GB", + "gpuVram": "442.52MB", } `); expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 8}))).toMatchInlineSnapshot(` { - "cpuRam": "3.42GB", - "gpuVram": "1.32GB", + "cpuRam": "3.13GB", + "gpuVram": "1.2GB", } `); expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 16}))).toMatchInlineSnapshot(` { - "cpuRam": "2.51GB", - "gpuVram": "2.34GB", + "cpuRam": "2.22GB", + "gpuVram": "2.23GB", } `); expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 24}))).toMatchInlineSnapshot(` { - "cpuRam": "1.59GB", - "gpuVram": "3.14GB", + "cpuRam": "1.3GB", + "gpuVram": "3.03GB", } `); expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 32}))).toMatchInlineSnapshot(` { - "cpuRam": "692.8MB", - "gpuVram": "4.06GB", + "cpuRam": "398.84MB", + "gpuVram": "3.94GB", } `); expect(makeEstimationReadable(ggufInsights.estimateModelResourceRequirements({gpuLayers: 33}))).toMatchInlineSnapshot(` @@ -168,7 +168,7 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.75GB", + "cpuRam": "1.76GB", "gpuVram": "0B", } `); @@ -213,8 +213,8 @@ describe("gguf", async () => { batchSize: 512 }))).toMatchInlineSnapshot(` { - "cpuRam": "1.74GB", - "gpuVram": "266.78MB", + "cpuRam": "1.75GB", 
+ "gpuVram": "267.03MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -259,7 +259,7 @@ describe("gguf", async () => { }))).toMatchInlineSnapshot(` { "cpuRam": "1.03GB", - "gpuVram": "990.98MB", + "gpuVram": "994.98MB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -304,7 +304,7 @@ describe("gguf", async () => { }))).toMatchInlineSnapshot(` { "cpuRam": "282.5MB", - "gpuVram": "1.72GB", + "gpuVram": "1.73GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ @@ -349,7 +349,7 @@ describe("gguf", async () => { }))).toMatchInlineSnapshot(` { "cpuRam": "250.5MB", - "gpuVram": "1.75GB", + "gpuVram": "1.76GB", } `); expect(makeEstimationReadable(ggufInsights.estimateContextResourceRequirements({ diff --git a/test/modelDependent/gemma4-e2b/functions.test.ts b/test/modelDependent/gemma4-e2b/functions.test.ts new file mode 100644 index 00000000..1a17245d --- /dev/null +++ b/test/modelDependent/gemma4-e2b/functions.test.ts @@ -0,0 +1,60 @@ +import {describe, expect, test} from "vitest"; +import {defineChatSessionFunction, Gemma4ChatWrapper, LlamaChatSession} from "../../../src/index.js"; +import {getModelFile} from "../../utils/modelFiles.js"; +import {getTestLlama} from "../../utils/getTestLlama.js"; + +describe("gemma4 e2b", () => { + describe("functions", () => { + test("auto-resolves Gemma4ChatWrapper and can call a function", {timeout: 1000 * 60 * 60 * 2}, async () => { + const modelPath = await getModelFile("gemma-4-E2B-it-Q4_K_M.gguf"); + const llama = await getTestLlama(); + + const model = await llama.loadModel({ + modelPath + }); + const context = await model.createContext({ + contextSize: 2048 + }); + const chatSession = new LlamaChatSession({ + contextSequence: context.getSequence() + }); + + expect(chatSession.chatWrapper).toBeInstanceOf(Gemma4ChatWrapper); + + let functionCallCount = 0; + const promptOptions: Parameters[1] = { + maxTokens: 200, + functions: { + getNthWord: defineChatSessionFunction({ + description: "Get an n-th word", + params: { + type: "object", + properties: { + n: { + enum: [1, 2, 3, 4] + } + }, + required: ["n"] + }, + handler(params) { + functionCallCount++; + return ["very", "secret", "this", "hello"][params.n - 1]; + } + }) + } + }; + const response = await chatSession.prompt("What is the second word?", promptOptions); + + expect(functionCallCount).toBeGreaterThan(0); + expect(functionCallCount).toBeLessThanOrEqual(2); + expect(response.toLowerCase()).toContain("secret"); + + const followUpResponse = await chatSession.prompt("Explain what this word means in one short sentence.", { + ...promptOptions, + maxTokens: 60 + }); + + expect(followUpResponse.length).toBeGreaterThan(10); + }); + }); +}); diff --git a/test/modelDependent/llama3.1/controlledEvaluate.test.ts b/test/modelDependent/llama3.1/controlledEvaluate.test.ts index fbaff03b..b3e917b5 100644 --- a/test/modelDependent/llama3.1/controlledEvaluate.test.ts +++ b/test/modelDependent/llama3.1/controlledEvaluate.test.ts @@ -75,11 +75,11 @@ describe("llama 3.1", () => { item.next.probabilities = new Map( [...item.next.probabilities.entries()] .slice(0, 10) - .map(([token, probability]) => [token, parseFloat(probability.toFixed(7))]) + .map(([token, probability]) => [token, simplifyFloat(probability)]) ); if (item.next?.confidence != null) - item.next.confidence = parseFloat(item.next.confidence.toFixed(7)); + item.next.confidence = simplifyFloat(item.next.confidence); return item; }); @@ 
-96,73 +96,73 @@ describe("llama 3.1", () => { { "next": { "probabilities": Map { - 35308 => 0.5205752, - 27096 => 0.2434221, - 11 => 0.0222422, - 198 => 0.0119651, - 863 => 0.0083929, - 374 => 0.0083748, - 1131 => 0.0068622, - 25 => 0.0062526, - 7940 => 0.0053943, - 1 => 0.0051856, + 35308 => 0.522, + 27096 => 0.243, + 11 => 0.0221, + 198 => 0.012, + 374 => 0.00837, + 863 => 0.00836, + 1131 => 0.00682, + 25 => 0.00624, + 7940 => 0.00539, + 13 => 0.00517, }, }, }, { "next": { "probabilities": Map { - 927 => 0.9811716, - 198 => 0.003379, - 6288 => 0.0032698, - 279 => 0.0006585, - 1633 => 0.0003187, - 1035 => 0.0003126, - 13 => 0.0002916, - 264 => 0.0002902, - 297 => 0.0002849, - 720 => 0.0002489, + 927 => 0.981, + 198 => 0.00338, + 6288 => 0.00328, + 279 => 0.000653, + 1633 => 0.00032, + 1035 => 0.000312, + 13 => 0.000291, + 264 => 0.000289, + 297 => 0.000283, + 720 => 0.00025, }, "token": 927, }, }, { "next": { - "confidence": 0.9307394, + "confidence": 0.931, "token": 279, }, }, { "next": { - "confidence": 0.9596596, + "confidence": 0.96, "probabilities": Map { - 16053 => 0.9596596, - 1208 => 0.0047719, - 198 => 0.0031805, - 5679 => 0.0029246, - 65536 => 0.0019735, - 6435 => 0.000917, - 2697 => 0.0006723, - 720 => 0.0005984, - 21811 => 0.0005529, - 45363 => 0.0005513, + 16053 => 0.96, + 1208 => 0.00473, + 198 => 0.00318, + 5679 => 0.0029, + 65536 => 0.00197, + 6435 => 0.000912, + 2697 => 0.000666, + 720 => 0.000598, + 21811 => 0.000549, + 45363 => 0.000549, }, }, }, { "next": { - "confidence": 0.9871598, + "confidence": 0.987, "probabilities": Map { - 5679 => 0.9871598, - 21811 => 0.0014282, - 198 => 0.0009355, - 8415 => 0.0007248, - 12875 => 0.0003796, - 4194 => 0.0003463, - 720 => 0.0002809, - 14588 => 0.0002761, - 9522 => 0.0002418, - 627 => 0.0002038, + 5679 => 0.987, + 21811 => 0.00143, + 198 => 0.000937, + 8415 => 0.000724, + 12875 => 0.00038, + 4194 => 0.000344, + 720 => 0.000282, + 14588 => 0.000276, + 9522 => 0.000241, + 627 => 0.000204, }, "token": 5679, }, @@ -172,3 +172,11 @@ describe("llama 3.1", () => { }); }); }); + +function simplifyFloat(value: number) { + if (value === 0) + return 0; + + const step = 10 ** (Math.floor(Math.log10(Math.abs(value))) - 2); + return Number.parseFloat((Math.round(value / step) * step).toPrecision(12)); +} diff --git a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts index 44305d77..ec703f04 100644 --- a/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts +++ b/test/modelDependent/llama3.1/evaluateWithMetadata.test.ts @@ -98,151 +98,151 @@ describe("llama 3.1", () => { [ { "probabilities": Map { - 578 => 0.4305387, - 1115 => 0.130273, - 1102 => 0.0517783, - 763 => 0.0429566, - 1283 => 0.0294619, - 2100 => 0.0294103, - 15636 => 0.0263193, - 2030 => 0.0218532, - 320 => 0.0168992, - 1628 => 0.011877, + 578 => 0.43, + 1115 => 0.13, + 1102 => 0.0516, + 763 => 0.0429, + 2100 => 0.0294, + 1283 => 0.0294, + 15636 => 0.0263, + 2030 => 0.0219, + 320 => 0.0169, + 1628 => 0.0119, }, "token": 578, }, { "probabilities": Map { - 16053 => 0.4229744, - 4062 => 0.303401, - 39935 => 0.0602281, - 2944 => 0.0372685, - 5679 => 0.0237816, - 11914 => 0.0162851, - 2144 => 0.0146596, - 1121 => 0.0069732, - 17571 => 0.0057899, - 3446 => 0.0049125, + 16053 => 0.422, + 4062 => 0.304, + 39935 => 0.0603, + 2944 => 0.0376, + 5679 => 0.0237, + 11914 => 0.0161, + 2144 => 0.0147, + 1121 => 0.00699, + 17571 => 0.00574, + 3446 => 0.00492, }, "token": 16053, }, { "probabilities": Map { - 5679 => 0.9981223, - 
12875 => 0.0001592, - 18964 => 0.0001154, - 39935 => 0.0001146, - 13 => 0.0001047, - 627 => 0.0000926, + 5679 => 0.998, + 12875 => 0.000159, + 18964 => 0.000115, + 39935 => 0.000115, + 13 => 0.000105, + 627 => 0.0000928, 656 => 0.0000625, 893 => 0.0000563, 198 => 0.0000522, - 374 => 0.0000518, + 374 => 0.0000521, }, "token": 5679, }, { "probabilities": Map { - 374 => 0.8128683, - 1587 => 0.0480889, - 596 => 0.0247298, - 1120 => 0.0222965, - 3250 => 0.0215258, - 706 => 0.0161501, - 15849 => 0.0086884, - 1053 => 0.0059099, - 55064 => 0.0037784, - 11 => 0.0036557, + 374 => 0.812, + 1587 => 0.0482, + 596 => 0.0247, + 1120 => 0.0224, + 3250 => 0.0216, + 706 => 0.0162, + 15849 => 0.00871, + 1053 => 0.00591, + 55064 => 0.00378, + 11 => 0.00368, }, "token": 374, }, { "probabilities": Map { - 2288 => 0.2759203, - 1120 => 0.166673, - 539 => 0.1576579, - 779 => 0.1335195, - 264 => 0.055744, - 1101 => 0.0292486, - 16053 => 0.0176843, - 5042 => 0.0158506, - 1193 => 0.0146031, - 2744 => 0.0140961, + 2288 => 0.276, + 1120 => 0.167, + 539 => 0.158, + 779 => 0.133, + 264 => 0.0558, + 1101 => 0.0293, + 16053 => 0.0178, + 5042 => 0.0159, + 1193 => 0.0146, + 2744 => 0.0141, }, "token": 2288, }, { "probabilities": Map { - 16053 => 0.9066879, - 13326 => 0.0635879, - 19781 => 0.0071462, - 17551 => 0.0020222, - 10968 => 0.0012692, - 11920 => 0.0011004, - 6435 => 0.0010057, - 34386 => 0.0007741, - 1208 => 0.0006092, - 25366 => 0.0005664, + 16053 => 0.907, + 13326 => 0.0635, + 19781 => 0.00713, + 17551 => 0.00202, + 10968 => 0.00126, + 11920 => 0.0011, + 6435 => 0.001, + 34386 => 0.000775, + 1208 => 0.000609, + 25366 => 0.000566, }, "token": 16053, }, { "probabilities": Map { - 311 => 0.9882948, - 1524 => 0.0061879, - 11 => 0.002568, - 323 => 0.000522, - 13 => 0.0003525, - 627 => 0.0003204, - 1606 => 0.0002628, + 311 => 0.988, + 1524 => 0.00617, + 11 => 0.00258, + 323 => 0.000525, + 13 => 0.000354, + 627 => 0.000322, + 1606 => 0.000265, 2288 => 0.000258, - 369 => 0.0001243, - 320 => 0.0001019, + 369 => 0.000125, + 320 => 0.000102, }, "token": 311, }, { "probabilities": Map { - 2512 => 0.749257, - 1524 => 0.0991107, - 656 => 0.0322866, - 636 => 0.0240931, - 7940 => 0.014378, - 33586 => 0.0108598, - 387 => 0.0086719, - 1781 => 0.0058546, - 1629 => 0.0054801, - 3351 => 0.0051043, + 2512 => 0.75, + 1524 => 0.0987, + 656 => 0.0324, + 636 => 0.0241, + 7940 => 0.0144, + 33586 => 0.0109, + 387 => 0.00867, + 1781 => 0.00585, + 1629 => 0.00549, + 3351 => 0.00512, }, "token": 2512, }, { "probabilities": Map { - 922 => 0.9522551, - 1606 => 0.0149839, - 11 => 0.0139898, - 430 => 0.002966, - 627 => 0.0023101, - 13 => 0.0018821, - 1524 => 0.0018027, - 369 => 0.0017665, - 323 => 0.0009226, - 382 => 0.0008453, + 922 => 0.952, + 1606 => 0.015, + 11 => 0.014, + 430 => 0.00297, + 627 => 0.00232, + 13 => 0.00189, + 1524 => 0.0018, + 369 => 0.00177, + 323 => 0.000927, + 382 => 0.000848, }, "token": 922, }, { "probabilities": Map { - 279 => 0.6508359, - 4205 => 0.3128611, - 1148 => 0.0113738, - 1690 => 0.0044254, - 904 => 0.0030366, - 1202 => 0.0026803, - 264 => 0.0011148, - 1790 => 0.0010861, - 813 => 0.0010576, - 1524 => 0.0007703, + 279 => 0.652, + 4205 => 0.312, + 1148 => 0.0114, + 1690 => 0.00443, + 904 => 0.00304, + 1202 => 0.00267, + 264 => 0.00111, + 1790 => 0.00108, + 813 => 0.00105, + 1524 => 0.000764, }, "token": 279, }, @@ -280,43 +280,43 @@ describe("llama 3.1", () => { expect(res).toMatchInlineSnapshot(` [ { - "confidence": 0.4305387, + "confidence": 0.43, "token": 578, }, { - "confidence": 0.4229744, + "confidence": 0.422, 
"token": 16053, }, { - "confidence": 0.9981223, + "confidence": 0.998, "token": 5679, }, { - "confidence": 0.8128683, + "confidence": 0.812, "token": 374, }, { - "confidence": 0.2759203, + "confidence": 0.276, "token": 2288, }, { - "confidence": 0.9066879, + "confidence": 0.907, "token": 16053, }, { - "confidence": 0.9882948, + "confidence": 0.988, "token": 311, }, { - "confidence": 0.749257, + "confidence": 0.75, "token": 2512, }, { - "confidence": 0.9522551, + "confidence": 0.952, "token": 922, }, { - "confidence": 0.6508359, + "confidence": 0.652, "token": 279, }, ] @@ -353,162 +353,162 @@ describe("llama 3.1", () => { expect(res).toMatchInlineSnapshot(` [ { - "confidence": 0.4305387, + "confidence": 0.43, "probabilities": Map { - 578 => 0.4305387, - 1115 => 0.130273, - 1102 => 0.0517783, - 763 => 0.0429566, - 1283 => 0.0294619, - 2100 => 0.0294103, - 15636 => 0.0263193, - 2030 => 0.0218532, - 320 => 0.0168992, - 1628 => 0.011877, + 578 => 0.43, + 1115 => 0.13, + 1102 => 0.0516, + 763 => 0.0429, + 2100 => 0.0294, + 1283 => 0.0294, + 15636 => 0.0263, + 2030 => 0.0219, + 320 => 0.0169, + 1628 => 0.0119, }, "token": 578, }, { - "confidence": 0.4229744, + "confidence": 0.422, "probabilities": Map { - 16053 => 0.4229744, - 4062 => 0.303401, - 39935 => 0.0602281, - 2944 => 0.0372685, - 5679 => 0.0237816, - 11914 => 0.0162851, - 2144 => 0.0146596, - 1121 => 0.0069732, - 17571 => 0.0057899, - 3446 => 0.0049125, + 16053 => 0.422, + 4062 => 0.304, + 39935 => 0.0603, + 2944 => 0.0376, + 5679 => 0.0237, + 11914 => 0.0161, + 2144 => 0.0147, + 1121 => 0.00699, + 17571 => 0.00574, + 3446 => 0.00492, }, "token": 16053, }, { - "confidence": 0.9981223, + "confidence": 0.998, "probabilities": Map { - 5679 => 0.9981223, - 12875 => 0.0001592, - 18964 => 0.0001154, - 39935 => 0.0001146, - 13 => 0.0001047, - 627 => 0.0000926, + 5679 => 0.998, + 12875 => 0.000159, + 18964 => 0.000115, + 39935 => 0.000115, + 13 => 0.000105, + 627 => 0.0000928, 656 => 0.0000625, 893 => 0.0000563, 198 => 0.0000522, - 374 => 0.0000518, + 374 => 0.0000521, }, "token": 5679, }, { - "confidence": 0.8128683, + "confidence": 0.812, "probabilities": Map { - 374 => 0.8128683, - 1587 => 0.0480889, - 596 => 0.0247298, - 1120 => 0.0222965, - 3250 => 0.0215258, - 706 => 0.0161501, - 15849 => 0.0086884, - 1053 => 0.0059099, - 55064 => 0.0037784, - 11 => 0.0036557, + 374 => 0.812, + 1587 => 0.0482, + 596 => 0.0247, + 1120 => 0.0224, + 3250 => 0.0216, + 706 => 0.0162, + 15849 => 0.00871, + 1053 => 0.00591, + 55064 => 0.00378, + 11 => 0.00368, }, "token": 374, }, { - "confidence": 0.2759203, + "confidence": 0.276, "probabilities": Map { - 2288 => 0.2759203, - 1120 => 0.166673, - 539 => 0.1576579, - 779 => 0.1335195, - 264 => 0.055744, - 1101 => 0.0292486, - 16053 => 0.0176843, - 5042 => 0.0158506, - 1193 => 0.0146031, - 2744 => 0.0140961, + 2288 => 0.276, + 1120 => 0.167, + 539 => 0.158, + 779 => 0.133, + 264 => 0.0558, + 1101 => 0.0293, + 16053 => 0.0178, + 5042 => 0.0159, + 1193 => 0.0146, + 2744 => 0.0141, }, "token": 2288, }, { - "confidence": 0.9066879, + "confidence": 0.907, "probabilities": Map { - 16053 => 0.9066879, - 13326 => 0.0635879, - 19781 => 0.0071462, - 17551 => 0.0020222, - 10968 => 0.0012692, - 11920 => 0.0011004, - 6435 => 0.0010057, - 34386 => 0.0007741, - 1208 => 0.0006092, - 25366 => 0.0005664, + 16053 => 0.907, + 13326 => 0.0635, + 19781 => 0.00713, + 17551 => 0.00202, + 10968 => 0.00126, + 11920 => 0.0011, + 6435 => 0.001, + 34386 => 0.000775, + 1208 => 0.000609, + 25366 => 0.000566, }, "token": 16053, }, { - 
"confidence": 0.9882948, + "confidence": 0.988, "probabilities": Map { - 311 => 0.9882948, - 1524 => 0.0061879, - 11 => 0.002568, - 323 => 0.000522, - 13 => 0.0003525, - 627 => 0.0003204, - 1606 => 0.0002628, + 311 => 0.988, + 1524 => 0.00617, + 11 => 0.00258, + 323 => 0.000525, + 13 => 0.000354, + 627 => 0.000322, + 1606 => 0.000265, 2288 => 0.000258, - 369 => 0.0001243, - 320 => 0.0001019, + 369 => 0.000125, + 320 => 0.000102, }, "token": 311, }, { - "confidence": 0.749257, + "confidence": 0.75, "probabilities": Map { - 2512 => 0.749257, - 1524 => 0.0991107, - 656 => 0.0322866, - 636 => 0.0240931, - 7940 => 0.014378, - 33586 => 0.0108598, - 387 => 0.0086719, - 1781 => 0.0058546, - 1629 => 0.0054801, - 3351 => 0.0051043, + 2512 => 0.75, + 1524 => 0.0987, + 656 => 0.0324, + 636 => 0.0241, + 7940 => 0.0144, + 33586 => 0.0109, + 387 => 0.00867, + 1781 => 0.00585, + 1629 => 0.00549, + 3351 => 0.00512, }, "token": 2512, }, { - "confidence": 0.9522551, + "confidence": 0.952, "probabilities": Map { - 922 => 0.9522551, - 1606 => 0.0149839, - 11 => 0.0139898, - 430 => 0.002966, - 627 => 0.0023101, - 13 => 0.0018821, - 1524 => 0.0018027, - 369 => 0.0017665, - 323 => 0.0009226, - 382 => 0.0008453, + 922 => 0.952, + 1606 => 0.015, + 11 => 0.014, + 430 => 0.00297, + 627 => 0.00232, + 13 => 0.00189, + 1524 => 0.0018, + 369 => 0.00177, + 323 => 0.000927, + 382 => 0.000848, }, "token": 922, }, { - "confidence": 0.6508359, + "confidence": 0.652, "probabilities": Map { - 279 => 0.6508359, - 4205 => 0.3128611, - 1148 => 0.0113738, - 1690 => 0.0044254, - 904 => 0.0030366, - 1202 => 0.0026803, - 264 => 0.0011148, - 1790 => 0.0010861, - 813 => 0.0010576, - 1524 => 0.0007703, + 279 => 0.652, + 4205 => 0.312, + 1148 => 0.0114, + 1690 => 0.00443, + 904 => 0.00304, + 1202 => 0.00267, + 264 => 0.00111, + 1790 => 0.00108, + 813 => 0.00105, + 1524 => 0.000764, }, "token": 279, }, @@ -568,10 +568,18 @@ function simplifyRes [token, parseFloat(probability.toFixed(7))]) + .map(([token, probability]) => [token, simplifyFloat(probability)]) ); if (item.confidence != null) - item.confidence = parseFloat(item.confidence.toFixed(7)); + item.confidence = simplifyFloat(item.confidence); } } + +function simplifyFloat(value: number) { + if (value === 0) + return 0; + + const step = 10 ** (Math.floor(Math.log10(Math.abs(value))) - 2); + return Number.parseFloat((Math.round(value / step) * step).toPrecision(12)); +} diff --git a/test/modelDependent/llama3.2/promptCompletion.test.ts b/test/modelDependent/llama3.2/promptCompletion.test.ts index 8c76051d..9df6b337 100644 --- a/test/modelDependent/llama3.2/promptCompletion.test.ts +++ b/test/modelDependent/llama3.2/promptCompletion.test.ts @@ -24,7 +24,7 @@ describe("llama 3.2", () => { chatWrapper: resolveChatWrapper(model, { customWrapperSettings: { "llama3.2-lightweight": { - todayDate: new Date("2025-01-01T00:00:00Z") + todayDate: new Date("2025-01-01T00:00:00") } } }) @@ -34,7 +34,7 @@ describe("llama 3.2", () => { chatWrapper: resolveChatWrapper(model, { customWrapperSettings: { "llama3.2-lightweight": { - todayDate: new Date("2025-01-01T00:00:00Z") + todayDate: new Date("2025-01-01T00:00:00") } } }) diff --git a/test/modelDependent/llama3.2/sequenceState.test.ts b/test/modelDependent/llama3.2/sequenceState.test.ts index c3e71ffe..a7b67b49 100644 --- a/test/modelDependent/llama3.2/sequenceState.test.ts +++ b/test/modelDependent/llama3.2/sequenceState.test.ts @@ -1,6 +1,6 @@ import {describe, expect, test} from "vitest"; import fs from "fs-extra"; -import {LlamaChatSession, 
TokenMeter} from "../../../src/index.js"; +import {LlamaChatSession, resolveChatWrapper, TokenMeter} from "../../../src/index.js"; import {getModelFile} from "../../utils/modelFiles.js"; import {getTestLlama} from "../../utils/getTestLlama.js"; import {getTempTestFilePath} from "../../utils/helpers/getTempTestDir.js"; @@ -8,6 +8,8 @@ import {toBytes} from "../../../src/cli/utils/toBytes.js"; describe("llama 3.2", () => { describe("chatSession", () => { + const todayDate = new Date("2026-04-28T00:00:00Z"); + describe("sequence state", () => { test("save and load a state works properly", {timeout: 1000 * 60 * 60 * 2}, async (test) => { const modelPath = await getModelFile("Llama-3.2-3B-Instruct.Q4_K_M.gguf"); @@ -24,10 +26,12 @@ describe("llama 3.2", () => { const contextSequence2 = context.getSequence(); const chatSession1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const chatSession2 = new LlamaChatSession({ - contextSequence: contextSequence2 + contextSequence: contextSequence2, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const [ @@ -37,8 +41,8 @@ describe("llama 3.2", () => { chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}), chatSession2.prompt("Remember: giraffes are not elephants", {maxTokens: 5}) ]); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); - expect(res2).to.toMatchInlineSnapshot('"I appreciate the reminder."'); + expect(res1).to.toMatchInlineSnapshot("\"That's a common\""); + expect(res2).to.match(/I appreciate the reminder|I'll keep that in/); const stateFile1Path = await getTempTestFilePath("state1"); @@ -73,7 +77,8 @@ describe("llama 3.2", () => { `); const chatSession1_1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const res1_1 = await chatSession1_1.prompt("What's the exact thing I told you to remember?", {maxTokens: 10}); expect(res1_1).to.toMatchInlineSnapshot("\"You didn't tell me to remember anything. 
This\""); @@ -102,7 +107,8 @@ describe("llama 3.2", () => { `); const chatSession1_2 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); chatSession1_2.setChatHistory(chatSession1.getChatHistory()); const res1_2 = await chatSession1_2.prompt("What's the exact thing I told you to remember?", {maxTokens: 12}); @@ -136,11 +142,12 @@ describe("llama 3.2", () => { const contextSequence2 = context2.getSequence(); const chatSession1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); + expect(res1).to.toMatchInlineSnapshot("\"That's a common\""); const stateFile1Path = await getTempTestFilePath("state1"); @@ -163,7 +170,8 @@ describe("llama 3.2", () => { const chatSession2 = new LlamaChatSession({ - contextSequence: contextSequence2 + contextSequence: contextSequence2, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); chatSession2.setChatHistory(chatSession1.getChatHistory()); await contextSequence2.loadStateFromFile(stateFile1Path, {acceptRisk: true}); @@ -198,11 +206,12 @@ describe("llama 3.2", () => { expect(context2.contextSize).to.eql(256); // the context is actually bigger due to `llama.cpp`'s padding const chatSession1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const res1 = await chatSession1.prompt("Remember: locks are not doors. 
Also, write a long poem about it", {maxTokens: 154}); - expect(res1).toMatch(/^(A clever reminder indeed.|A clever reminder, indeed.|A wise phrase to ponder|A wise phrase indeed)/); + expect(res1).toMatch(/^(A clever reminder indeed.|A clever reminder, indeed.|A wise phrase to ponder|A wise phrase indeed|A poetic reminder|A poetic task)/); const stateFile1Path = await getTempTestFilePath("state1"); @@ -225,7 +234,8 @@ describe("llama 3.2", () => { const chatSession2 = new LlamaChatSession({ - contextSequence: contextSequence2 + contextSequence: contextSequence2, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); chatSession2.setChatHistory(chatSession1.getChatHistory()); try { @@ -256,11 +266,12 @@ describe("llama 3.2", () => { const contextSequence2 = context2.getSequence(); const chatSession1 = new LlamaChatSession({ - contextSequence: contextSequence1 + contextSequence: contextSequence1, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); const res1 = await chatSession1.prompt("Remember: locks are not doors", {maxTokens: 4}); - expect(res1).to.toMatchInlineSnapshot("\"That's a clever\""); + expect(res1).to.toMatchInlineSnapshot("\"That's a common\""); const stateFile1Path = await getTempTestFilePath("state1"); @@ -283,7 +294,8 @@ describe("llama 3.2", () => { const chatSession2 = new LlamaChatSession({ - contextSequence: contextSequence2 + contextSequence: contextSequence2, + chatWrapper: resolveChatWrapper(model, {customWrapperSettings: {"llama3.2-lightweight": {todayDate}}}) }); chatSession2.setChatHistory(chatSession1.getChatHistory()); try { diff --git a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts index 6b88dc77..a03dc4c5 100644 --- a/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts +++ b/test/modelDependent/stableCode/stableCodeModelGpuLayersOptions.test.ts @@ -7,10 +7,11 @@ import {BuildGpu} from "../../../src/bindings/types.js"; import {defaultLlamaVramPadding} from "../../../src/bindings/getLlama.js"; describe("stableCode", () => { - describe("model options", () => { - describe("Resolve the correct number of GPU layers", async () => { + describe("model options", async () => { + const llama = await getTestLlama(); + + describe.skipIf(llama.gpu === false)("Resolve the correct number of GPU layers", async (scope) => { const modelPath = await getModelFile("stable-code-3b-Q5_K_M.gguf"); - const llama = await getTestLlama(); const fileInfo = await readGgufFileInfo(modelPath); const ggufInsights = await GgufInsights.from(fileInfo, llama); @@ -28,7 +29,7 @@ describe("stableCode", () => { totalSwap?: number, freeSwap?: number, ignoreMemorySafetyChecks?: boolean, llamaGpu?: BuildGpu }) { - const resolvedGpuLayers = await ggufInsights.configurationResolver.resolveModelGpuLayers(gpuLayers, { + const {gpuLayers: resolvedGpuLayers, useMmap: resolvedUseMmap} = await ggufInsights.configurationResolver.resolveModelGpuLayersV2(gpuLayers, { ignoreMemorySafetyChecks, getVramState: async () => ({ total: llamaGpu === false ? 0 : totalVram, @@ -37,13 +38,13 @@ describe("stableCode", () => { llamaVramPaddingSize: defaultLlamaVramPadding(llamaGpu === false ? 
0 : totalVram), llamaGpu, llamaSupportsGpuOffloading: llamaGpu !== false, - useMmap: true + useMmap: "auto" }); async function resolveAutoContextSize() { const resolvedConfig = await ggufInsights.configurationResolver.resolveAndScoreConfig({ targetGpuLayers: resolvedGpuLayers, - useMmap: true + useMmap: resolvedUseMmap }, { llamaGpu, getVramState: async () => ({ @@ -71,7 +72,8 @@ describe("stableCode", () => { return { gpuLayers: resolvedGpuLayers, - contextSize: await resolveAutoContextSize() + contextSize: await resolveAutoContextSize(), + useMmap: resolvedUseMmap }; } @@ -83,6 +85,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers(0, { @@ -91,6 +94,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -101,6 +105,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -111,7 +116,8 @@ describe("stableCode", () => { freeVram: s1GB * 3 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("8960"); + expect(res.contextSize).to.toMatchInlineSnapshot("13824"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } try { await resolveGpuLayers(16, { @@ -142,7 +148,8 @@ describe("stableCode", () => { ignoreMemorySafetyChecks: true }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("179"); + expect(res.contextSize).to.toMatchInlineSnapshot("2816"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } @@ -154,6 +161,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(16, { @@ -164,6 +172,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -174,7 +183,8 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(32); - expect(res.contextSize).to.toMatchInlineSnapshot("11520"); + expect(res.contextSize).to.toMatchInlineSnapshot("14080"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(32, { @@ -193,6 +203,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(32); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -203,6 +214,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(32, { @@ -213,6 +225,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -223,7 +236,8 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("11264"); + expect(res.contextSize).to.toMatchInlineSnapshot("13312"); + 
expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers(33, { @@ -242,6 +256,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { @@ -252,6 +267,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers(33, { @@ -262,6 +278,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); @@ -304,6 +321,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(33); expect(res.contextSize).to.toMatchInlineSnapshot("null"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -311,7 +329,8 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("5888"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -319,7 +338,8 @@ describe("stableCode", () => { freeVram: s1GB * 4.4 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("max", { @@ -327,11 +347,12 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.eql(33); - expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.contextSize).to.toMatchInlineSnapshot("9472"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - it('attempts to resolve "auto"', async () => { + it('attempts to resolve "auto"', {timeout: 1000 * 60}, async () => { { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, @@ -339,30 +360,34 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 0.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("14848"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("1"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 0.8 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("2"); - expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); + expect(res.contextSize).to.toMatchInlineSnapshot("15104"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await resolveGpuLayers("auto", { totalVram: s1GB * 6, freeVram: s1GB * 1.4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); - expect(res.contextSize).to.toMatchInlineSnapshot("8960"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("10"); + expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } { const res = await 
resolveGpuLayers("auto", { @@ -371,6 +396,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); expect(res.contextSize).to.toMatchInlineSnapshot("1536"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -378,7 +404,8 @@ describe("stableCode", () => { freeVram: s1GB * 3.1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3328"); + expect(res.contextSize).to.toMatchInlineSnapshot("3840"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -386,7 +413,8 @@ describe("stableCode", () => { freeVram: s1GB * 3.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("3840"); + expect(res.contextSize).to.toMatchInlineSnapshot("4608"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -394,7 +422,8 @@ describe("stableCode", () => { freeVram: s1GB * 3.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("4352"); + expect(res.contextSize).to.toMatchInlineSnapshot("5120"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -402,7 +431,8 @@ describe("stableCode", () => { freeVram: s1GB * 3.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5376"); + expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -410,7 +440,8 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5888"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -418,7 +449,8 @@ describe("stableCode", () => { freeVram: s1GB * 4.3 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("6656"); + expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -426,7 +458,8 @@ describe("stableCode", () => { freeVram: s1GB * 4.5 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7168"); + expect(res.contextSize).to.toMatchInlineSnapshot("8448"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -434,7 +467,8 @@ describe("stableCode", () => { freeVram: s1GB * 4.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("7936"); + expect(res.contextSize).to.toMatchInlineSnapshot("9472"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -442,7 +476,8 @@ describe("stableCode", () => { freeVram: s1GB * 5.2 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("9216"); + expect(res.contextSize).to.toMatchInlineSnapshot("10752"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -450,7 +485,8 @@ describe("stableCode", () => { freeVram: 
s1GB * 5.8 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("10752"); + expect(res.contextSize).to.toMatchInlineSnapshot("12800"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers("auto", { @@ -458,11 +494,12 @@ describe("stableCode", () => { freeVram: s1GB * 6 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("11264"); + expect(res.contextSize).to.toMatchInlineSnapshot("13312"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } }); - it("attempts to resolve {min?: number, max?: number}", async () => { + it("attempts to resolve {min?: number, max?: number}", {timeout: 1000 * 60}, async () => { { const res = await resolveGpuLayers({max: 4}, { totalVram: s1GB * 6, @@ -470,6 +507,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 0, max: 4}, { @@ -478,6 +516,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 2}, { @@ -504,7 +543,8 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.eql(16); - expect(res.contextSize).to.toMatchInlineSnapshot("14592"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } try { await resolveGpuLayers({min: 16}, { @@ -522,7 +562,8 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5888"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -532,7 +573,8 @@ describe("stableCode", () => { expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); - expect(res.contextSize).to.toMatchInlineSnapshot("8448"); + expect(res.contextSize).to.toMatchInlineSnapshot("9984"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); } { const res = await resolveGpuLayers({min: 16, max: 24}, { @@ -541,8 +583,9 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.be.gte(16); expect(res.gpuLayers).to.be.lte(24); - expect(res.gpuLayers).to.toMatchInlineSnapshot("17"); - expect(res.contextSize).to.toMatchInlineSnapshot("8192"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("19"); + expect(res.contextSize).to.toMatchInlineSnapshot("10752"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); } }); @@ -556,6 +599,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -565,7 +609,8 @@ describe("stableCode", () => { freeVram: s1GB * 4 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("33"); - expect(res.contextSize).to.toMatchInlineSnapshot("5888"); + expect(res.contextSize).to.toMatchInlineSnapshot("6912"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -574,8 +619,9 
@@ describe("stableCode", () => { totalVram: s1GB * 2, freeVram: s1GB * 1 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("4"); - expect(res.contextSize).to.toMatchInlineSnapshot("6144"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("6"); + expect(res.contextSize).to.toMatchInlineSnapshot("11008"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -584,8 +630,9 @@ describe("stableCode", () => { totalVram: s1GB * 6, freeVram: s1GB * 4 }); - expect(res.gpuLayers).to.toMatchInlineSnapshot("22"); - expect(res.contextSize).to.toMatchInlineSnapshot("9216"); + expect(res.gpuLayers).to.toMatchInlineSnapshot("24"); + expect(res.contextSize).to.toMatchInlineSnapshot("11776"); + expect(res.useMmap).to.toMatchInlineSnapshot("false"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -595,7 +642,8 @@ describe("stableCode", () => { freeVram: s1GB * 1 }); expect(res.gpuLayers).to.toMatchInlineSnapshot("3"); - expect(res.contextSize).to.toMatchInlineSnapshot("11264"); + expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -606,6 +654,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.toMatchInlineSnapshot("0"); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } { @@ -627,6 +676,7 @@ describe("stableCode", () => { }); expect(res.gpuLayers).to.eql(0); expect(res.contextSize).to.toMatchInlineSnapshot("16384"); + expect(res.useMmap).to.toMatchInlineSnapshot("true"); expect(res.contextSize).to.be.gte(contextSize); } }); diff --git a/test/standalone/chatWrappers/Gemma4ChatWrapper.test.ts b/test/standalone/chatWrappers/Gemma4ChatWrapper.test.ts new file mode 100644 index 00000000..9aeb8e5f --- /dev/null +++ b/test/standalone/chatWrappers/Gemma4ChatWrapper.test.ts @@ -0,0 +1,362 @@ +import {describe, expect, test} from "vitest"; +import {ChatHistoryItem, defineChatSessionFunction, Gemma4ChatWrapper} from "../../../src/index.js"; +import {defaultChatSystemPrompt} from "../../../src/config.js"; + + +describe("Gemma4ChatWrapper", () => { + const conversationHistory: ChatHistoryItem[] = [{ + type: "system", + text: defaultChatSystemPrompt + }, { + type: "user", + text: "Hi there!" + }, { + type: "model", + response: [ + { + type: "segment", + segmentType: "thought", + text: "Let me think how to respond to this.", + ended: true + }, + "Hello!" + ] + }, { + type: "user", + text: "How are you?" + }, { + type: "model", + response: [ + { + type: "segment", + segmentType: "thought", + text: "Let me think how to answer", + ended: true + }, + { + type: "segment", + segmentType: "comment", + text: "This is a question about my state", + ended: true + }, + "I'm good, how are you?" 
+ ] + }]; + + const functions = { + getRandomNumber: defineChatSessionFunction({ + description: "Get a random number", + params: { + type: "object", + properties: { + min: { + type: "number" + }, + max: { + type: "number" + } + } + }, + async handler(params) { + return Math.floor(Math.random() * (params.max - params.min + 1) + params.min); + } + }), + notifyOwner: defineChatSessionFunction({ + description: "Send a notification to the owner, and create sub notifications", + params: { + $ref: "#/$defs/notification", + $defs: { + notification: { + type: "object", + properties: { + message: { + type: "string" + }, + subNotifications: { + type: "array", + items: { + $ref: "#/$defs/notification" + } + } + } + } + } + }, + handler(notification) { + return "Notification created: " + notification.message; + } + }), + notifyOwner2: defineChatSessionFunction({ + description: "Send a notification to the owner, and create sub notifications", + params: { + $ref: "#/$defs/notification", + $defs: { + notification: { + type: "object", + properties: { + message: { + type: "string", + description: "Notification message" + }, + subNotifications: { + type: "array", + description: "Sub notifications", + items: { + $ref: "#/$defs/notification" + } + } + } + } + } + }, + handler(notification) { + return "Notification created: " + notification.message; + } + }), + func1: defineChatSessionFunction({ + description: "Some function", + params: { + type: "object", + properties: { + message: { + type: "string", + description: "Some message", + minLength: 3, + maxLength: 10 + }, + words: { + type: "array", + description: "Some words", + items: { + type: "string" + }, + minItems: 2, + maxItems: 5 + }, + headers: { + type: "object", + description: "Some headers", + additionalProperties: { + type: "string" + }, + minProperties: 4, + maxProperties: 12 + }, + mappings: { + type: "object", + description: "Some mappings", + properties: { + a: { + type: "boolean" + }, + b: { + type: "number" + }, + c: { + type: ["string", "null"] + } + }, + additionalProperties: { + type: "string" + }, + minProperties: 4, + maxProperties: 12 + } + } + }, + handler(params) { + + } + }) + }; + const conversationHistory2: ChatHistoryItem[] = [{ + type: "system", + text: defaultChatSystemPrompt + }, { + type: "user", + text: "Hi there!" + }, { + type: "model", + response: ["Hello!"] + }, { + type: "user", + text: "Role a dice twice and tell me the total result" + }, { + type: "model", + response: [ + { + type: "functionCall", + name: "getRandomNumber", + description: "Get a random number", + params: { + min: 1, + max: 6 + }, + result: 3 + }, + { + type: "functionCall", + name: "getRandomNumber", + description: "Get a random number", + params: { + min: 1, + max: 6 + }, + result: 4 + }, + "The total result of rolling the dice twice is 3 + 4 = 7." + ] + }]; + + test("should generate valid context text", () => { + const chatWrapper = new Gemma4ChatWrapper({keepOnlyLastThought: false}); + const {contextText} = chatWrapper.generateContextState({chatHistory: conversationHistory}); + + expect(contextText).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|turn>system + <|think|>"), + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. 
If you don't know the answer to a question, don't share false information.", + new SpecialTokensText(" + <|turn>user + "), + "Hi there!", + new SpecialTokensText(" + <|turn>model + <|channel>thought"), + "Let me think how to respond to this.", + new SpecialTokensText(""), + "Hello!", + new SpecialTokensText(" + <|turn>user + "), + "How are you?", + new SpecialTokensText(" + <|turn>model + <|channel>thought"), + "Let me think how to answer", + new SpecialTokensText(""), + "I'm good, how are you?", + ]) + `); + + const chatWrapper2 = new Gemma4ChatWrapper(); + const {contextText: contextText2} = chatWrapper2.generateContextState({ + chatHistory: conversationHistory2, + availableFunctions: functions + }); + + expect(contextText2).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|turn>system + <|think|>"), + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialTokensText("<|tool>"), + "declaration:getRandomNumber{{"description": "Get a random number", "parameters": {"type": "object", "properties": {"min": {"type": "number"}, "max": {"type": "number"}}}}}", + new SpecialTokensText("<|tool>"), + "declaration:notifyOwner{{"description": "Send a notification to the owner, and create sub notifications", "parameters": {"$ref": "#/$defs/notification", "$defs": {"notification": {"type": "object", "properties": {"message": {"type": "string"}, "subNotifications": {"type": "array", "items": {"$ref": "#/$defs/notification"}}}}}}}}", + new SpecialTokensText("<|tool>"), + "declaration:notifyOwner2{{"description": "Send a notification to the owner, and create sub notifications", "parameters": {"$ref": "#/$defs/notification", "$defs": {"notification": {"type": "object", "properties": {"message": {"type": "string", "description": "Notification message"}, "subNotifications": {"type": "array", "description": "Sub notifications", "items": {"$ref": "#/$defs/notification"}}}}}}}}", + new SpecialTokensText("<|tool>"), + "declaration:func1{{"description": "Some function", "parameters": {"type": "object", "properties": {"message": {"type": "string", "description": "Some message", "minLength": 3, "maxLength": 10}, "words": {"type": "array", "description": "Some words", "items": {"type": "string"}, "minItems": 2, "maxItems": 5}, "headers": {"type": "object", "description": "Some headers", "additionalProperties": {"type": "string"}, "minProperties": 4, "maxProperties": 12}, "mappings": {"type": "object", "description": "Some mappings", "properties": {"a": {"type": "boolean"}, "b": {"type": "number"}, "c": {"type": ["string", "null"]}}, "additionalProperties": {"type": "string"}, "minProperties": 4, "maxProperties": 12}}}}}", + new SpecialTokensText(" + <|turn>user + "), + "Hi there!", + new SpecialTokensText(" + <|turn>model + "), + "Hello!", + new SpecialTokensText(" + <|turn>user + "), + "Role a dice twice and tell me the total result", + new SpecialTokensText(" + <|turn>model + <|tool_call>call:"), + "getRandomNumber{{"min": 1, "max": 6}", + new SpecialTokensText("}response:"), + "getRandomNumber{3", + new SpecialTokensText("}<|tool_call>call:"), + "getRandomNumber{{"min": 1, "max": 6}", + new SpecialTokensText("}response:"), + "getRandomNumber{4", + new SpecialTokensText("}"), + "The total result of rolling the dice twice 
is 3 + 4 = 7.", + ]) + `); + + const chatWrapper3 = new Gemma4ChatWrapper(); + const {contextText: contextText3} = chatWrapper3.generateContextState({chatHistory: conversationHistory}); + const {contextText: contextText3WithOpenModelResponse} = chatWrapper3.generateContextState({ + chatHistory: [ + ...conversationHistory, + { + type: "model", + response: [] + } + ] + }); + + expect(contextText3).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|turn>system + <|think|>"), + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialTokensText(" + <|turn>user + "), + "Hi there!", + new SpecialTokensText(" + <|turn>model + "), + "Hello!", + new SpecialTokensText(" + <|turn>user + "), + "How are you?", + new SpecialTokensText(" + <|turn>model + <|channel>thought"), + "Let me think how to answer", + new SpecialTokensText(""), + "I'm good, how are you?", + ]) + `); + + expect(contextText3WithOpenModelResponse).toMatchInlineSnapshot(` + LlamaText([ + new SpecialToken("BOS"), + new SpecialTokensText("<|turn>system + <|think|>"), + "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible. + If a question does not make any sense, or is not factually coherent, explain why instead of answering something incorrectly. If you don't know the answer to a question, don't share false information.", + new SpecialTokensText(" + <|turn>user + "), + "Hi there!", + new SpecialTokensText(" + <|turn>model + "), + "Hello!", + new SpecialTokensText(" + <|turn>user + "), + "How are you?", + new SpecialTokensText(" + <|turn>model + "), + "I'm good, how are you?", + new SpecialTokensText(" + <|turn>model + "), + ]) + `); + }); +}); diff --git a/test/standalone/chatWrappers/HarmonyChatWrapper.test.ts b/test/standalone/chatWrappers/HarmonyChatWrapper.test.ts index 92fdc1a9..6fedeed0 100644 --- a/test/standalone/chatWrappers/HarmonyChatWrapper.test.ts +++ b/test/standalone/chatWrappers/HarmonyChatWrapper.test.ts @@ -4,7 +4,7 @@ import {defaultChatSystemPrompt} from "../../../src/config.js"; describe("HarmonyChatWrapper", () => { - const todayDate = new Date("2025-08-05T00:00:00Z"); + const todayDate = new Date("2025-08-05T00:00:00"); const conversationHistory: ChatHistoryItem[] = [{ type: "system", diff --git a/test/standalone/chatWrappers/Llama3_1ChatWrapper.test.ts b/test/standalone/chatWrappers/Llama3_1ChatWrapper.test.ts index fab30cc9..d6f853de 100644 --- a/test/standalone/chatWrappers/Llama3_1ChatWrapper.test.ts +++ b/test/standalone/chatWrappers/Llama3_1ChatWrapper.test.ts @@ -4,7 +4,7 @@ import {defaultChatSystemPrompt} from "../../../src/config.js"; describe("Llama3_1ChatWrapper", () => { - const todayDate = new Date("2024-07-26T00:00:00Z"); + const todayDate = new Date("2024-07-26T00:00:00"); const conversationHistory: ChatHistoryItem[] = [ ...(new Llama3_1ChatWrapper({todayDate})).generateInitialChatHistory({systemPrompt: defaultChatSystemPrompt}), { type: "user", diff --git a/test/standalone/chatWrappers/utils/jinjaTemplates.ts b/test/standalone/chatWrappers/utils/jinjaTemplates.ts index f63e3748..4f7f09e8 100644 --- a/test/standalone/chatWrappers/utils/jinjaTemplates.ts +++ b/test/standalone/chatWrappers/utils/jinjaTemplates.ts @@ -1636,3 +1636,538 @@ export const harmonyJinjaTemplate5 = 
` <|start|>assistant {%- endif -%} `.slice(1, -1); + +export const gemma4JinjaTemplate1 = ` +{%- macro format_parameters(properties, required) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'OBJECT' -%} + ,properties:{ + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + {%- elif value is mapping -%} + {{- format_parameters(value, value['required'] | default([])) -}} + {%- endif -%} + } + {%- if value['required'] -%} + ,required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + ,items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set 
response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{ bos_token }} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {{- messages[0]['content'] | trim -}} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + + {{- '\n' -}} +{%- endif %} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {{- '<|turn>' + role + '\n' }} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- if message['tool_responses'] -%} + {#- Tool Response 
handling -#} + {%- for tool_response in message['tool_responses'] -%} + {{- '<|tool_response>' -}} + {%- if tool_response['response'] is mapping -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}} + {%- for key, value in tool_response['response'] | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '\n\n<|image|>\n\n' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '\n\n<|video|>\n\n' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- if not (message['tool_responses'] and not message['content']) -%} + {{- '\n' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} +`.slice(1); + +export const gemma4JinjaTemplate2 = ` +{%- macro format_parameters(properties, required) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'OBJECT' -%} + ,properties:{ + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + {%- elif value is mapping -%} + {{- format_parameters(value, value['required'] | default([])) -}} + {%- endif -%} + } + {%- if value['required'] -%} + ,required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + ,items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if 
ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{ bos_token }} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is 
defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {{- messages[0]['content'] | trim -}} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + + {{- '\n' -}} +{%- endif %} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {{- '<|turn>' + role + '\n' }} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- if message['tool_responses'] -%} + {#- Tool Response handling -#} + {%- for tool_response in message['tool_responses'] -%} + {{- '<|tool_response>' -}} + {%- if tool_response['response'] is mapping -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}} + {%- for key, value in tool_response['response'] | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '\n\n<|image|>\n\n' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '\n\n<|video|>\n\n' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- if not (message['tool_responses'] and not message['content']) -%} + {{- '\n' -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} + {%- if not enable_thinking | default(false) 
-%} + {{- '<|channel>thought\n' -}} + {%- endif -%} +{%- endif -%} +`.slice(1); diff --git a/test/standalone/chatWrappers/utils/resolveChatWrapper.test.ts b/test/standalone/chatWrappers/utils/resolveChatWrapper.test.ts index f8e0e94d..e68f9024 100644 --- a/test/standalone/chatWrappers/utils/resolveChatWrapper.test.ts +++ b/test/standalone/chatWrappers/utils/resolveChatWrapper.test.ts @@ -1,10 +1,12 @@ import {describe, expect, test} from "vitest"; import { AlpacaChatWrapper, ChatMLChatWrapper, DeepSeekChatWrapper, FalconChatWrapper, FunctionaryChatWrapper, GemmaChatWrapper, - GeneralChatWrapper, Llama2ChatWrapper, Llama3_1ChatWrapper, MistralChatWrapper, QwenChatWrapper, resolveChatWrapper, HarmonyChatWrapper + Gemma4ChatWrapper, GeneralChatWrapper, Llama2ChatWrapper, Llama3_1ChatWrapper, MistralChatWrapper, QwenChatWrapper, + resolveChatWrapper, HarmonyChatWrapper } from "../../../../src/index.js"; import { - harmonyJinjaTemplate, harmonyJinjaTemplate2, harmonyJinjaTemplate3, harmonyJinjaTemplate4, harmonyJinjaTemplate5 + harmonyJinjaTemplate, harmonyJinjaTemplate2, harmonyJinjaTemplate3, harmonyJinjaTemplate4, harmonyJinjaTemplate5, + gemma4JinjaTemplate1, gemma4JinjaTemplate2 } from "./jinjaTemplates.js"; @@ -783,6 +785,32 @@ describe("resolveChatWrapper", () => { expect(chatWrapper).to.be.instanceof(GemmaChatWrapper); }); + test("should resolve to specialized Gemma4ChatWrapper", () => { + const chatWrapper = resolveChatWrapper({ + customWrapperSettings: { + jinjaTemplate: { + template: gemma4JinjaTemplate1 + } + }, + fallbackToOtherWrappersOnJinjaError: false + }); + + expect(chatWrapper).to.be.instanceof(Gemma4ChatWrapper); + }); + + test("should resolve to specialized Gemma4ChatWrapper 2", () => { + const chatWrapper = resolveChatWrapper({ + customWrapperSettings: { + jinjaTemplate: { + template: gemma4JinjaTemplate2 + } + }, + fallbackToOtherWrappersOnJinjaError: false + }); + + expect(chatWrapper).to.be.instanceof(Gemma4ChatWrapper); + }); + test("should resolve to specialized GeneralChatWrapper", () => { const chatWrapper = resolveChatWrapper({ customWrapperSettings: { diff --git a/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap b/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap index 4a441965..7bf467a0 100644 --- a/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap +++ b/test/standalone/gguf/__snapshots__/ggufStandaloneParser.test.ts.snap @@ -60,6 +60,7 @@ exports[`gguf > parser > should parse remote gguf model 1`] = ` "offset": 1097564160, }, ], + "infoEndOffset": 2585504, "metadata": { "falcon": { "attention": { @@ -135,6 +136,15 @@ exports[`gguf > parser > should parse remote gguf model 1`] = ` }, }, "metadataSize": 2547826, + "sourceData": [ + { + "buffer": { + "_type": "buffer", + "length": 2585504, + }, + "type": "buffer", + }, + ], "splicedParts": 1, "tensorCount": 644, "tensorInfo": [ @@ -206,6 +216,7 @@ exports[`gguf > parser > should parse remote gguf model without tensor info 1`] "tensor_data_layout": "jploski", }, "fullTensorInfo": undefined, + "infoEndOffset": undefined, "metadata": { "falcon": { "attention": { @@ -281,6 +292,7 @@ exports[`gguf > parser > should parse remote gguf model without tensor info 1`] }, }, "metadataSize": 2547826, + "sourceData": [], "splicedParts": 1, "tensorCount": 644, "tensorInfo": undefined, diff --git a/test/utils/helpers/simplifyGgufInfoForTestSnapshot.ts b/test/utils/helpers/simplifyGgufInfoForTestSnapshot.ts index 37b44bf1..fa782fde 100644 --- 
a/test/utils/helpers/simplifyGgufInfoForTestSnapshot.ts +++ b/test/utils/helpers/simplifyGgufInfoForTestSnapshot.ts @@ -1,4 +1,5 @@ -import {GgufFileInfo} from "../../../src/gguf/types/GgufFileInfoTypes.js"; +import path from "path"; +import {GgufFileInfo, GgufFileInfoSourceData, GgufFileInfoSource} from "../../../src/gguf/types/GgufFileInfoTypes.js"; export function simplifyGgufInfoForTestSnapshot(ggufFileInfo: GgufFileInfo) { const ggufFileInfoCopy = structuredClone(ggufFileInfo); @@ -13,6 +14,9 @@ export function simplifyGgufInfoForTestSnapshot(ggufFileInfo: GgufFileInfo) { shortenArray(ggufFileInfoCopy.tensorInfo, 4); shortenArray(ggufFileInfoCopy.fullTensorInfo, 4); + simplifySource(ggufFileInfoCopy.source); + simplifySourceData(ggufFileInfoCopy.sourceData); + return ggufFileInfoCopy; } @@ -22,3 +26,29 @@ function shortenArray(array?: readonly any[], maxSize: number = 10) { (array as any[]).splice(maxSize); } + +function simplifySourceData(sourceData: GgufFileInfoSourceData[]) { + for (const entry of sourceData) { + if (entry.type === "buffer") + entry.buffer = { + _type: "buffer", + length: entry.buffer.length + } as any; + else if (entry.type === "path") + entry.path = { + _type: "path", + path: "/" + path.basename(entry.path) + } as any; + } +} + +function simplifySource(source?: GgufFileInfoSource) { + if (source == null) + return; + + if (source.type === "path") + source.path = { + _type: "path", + path: "/" + path.basename(source.path) + } as any; +} \ No newline at end of file diff --git a/test/utils/modelFiles.ts b/test/utils/modelFiles.ts index b8d4accc..525dfb1a 100644 --- a/test/utils/modelFiles.ts +++ b/test/utils/modelFiles.ts @@ -22,7 +22,8 @@ const supportedModels = { "nomic-embed-text-v1.5.Q4_K_M.gguf": "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q4_K_M.gguf?download=true", "bge-reranker-v2-m3-Q8_0.gguf": "https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF/resolve/main/bge-reranker-v2-m3-Q8_0.gguf?download=true", "Qwen3-0.6B-Q8_0.gguf": "https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf?download=true", - "Qwen3.5-0.8B-Q8_0.gguf": "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q8_0.gguf?download=true" + "Qwen3.5-0.8B-Q8_0.gguf": "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q8_0.gguf?download=true", + "gemma-4-E2B-it-Q4_K_M.gguf": "https://huggingface.co/unsloth/gemma-4-E2B-it-GGUF/resolve/main/gemma-4-E2B-it-Q4_K_M.gguf?download=true" } as const; export async function getModelFile(modelName: keyof typeof supportedModels) {