88#include < format>
99#include < span>
1010#include < utility>
11+ #include " ggml-cuda.h"
1112
1213#include " llama.h"
1314#include " llama-sb.h"
1415
1516constexpr int MAX_REPEAT = 5 ;
1617
/**
 * Reads memory usage for CUDA device 0.
 *
 * @param used  set to (total - free) bytes on success, 0 otherwise
 * @param total set to the total bytes reported by the backend on success, 0 otherwise
 * @return true only when compiled with GGML_USE_CUDA and the backend reports a
 *         non-zero total; false in every other case
 */
static bool read_vram(size_t &used, size_t &total) {
  // Reset both outputs up front so a failed query never leaves stale values behind.
  used = 0;
  total = 0;
#ifdef GGML_USE_CUDA
  // Named `available` rather than `free` to avoid shadowing the C library function.
  size_t available = 0;
  ggml_backend_cuda_get_device_memory(0, &available, &total);
  if (total > 0) {
    // Defensive: never let the unsigned subtraction wrap if free > total is reported.
    used = (available < total) ? (total - available) : 0;
    return true;
  }
#endif
  return false;
}
30+
1731LlamaIter::LlamaIter () :
1832 _llama(nullptr ),
1933 _repetition_count(0 ),
@@ -45,6 +59,7 @@ Llama::Llama() :
4559 _top_k(0 ),
4660 _max_tokens(0 ),
4761 _log_level(GGML_LOG_LEVEL_CONT),
62+ _n_gpu_layers(0 ),
4863 _n_past(0 ),
4964 _is_gemma4(false ),
5065 _seed(LLAMA_DEFAULT_SEED) {
@@ -78,6 +93,7 @@ Llama::Llama(Llama &&other) noexcept
7893 , _top_k(other._top_k)
7994 , _max_tokens(other._max_tokens)
8095 , _log_level(other._log_level)
96+ , _n_gpu_layers(other._n_gpu_layers)
8197 , _n_past(other._n_past)
8298 , _is_gemma4(other._is_gemma4)
8399 , _seed(other._seed) {
@@ -128,6 +144,7 @@ bool Llama::construct(string model_path, int n_ctx, int n_batch, int n_gpu_layer
128144 }
129145
130146 _log_level = log_level;
147+ _n_gpu_layers = n_gpu_layers;
131148 _model = llama_model_load_from_file (model_path.c_str (), mparams);
132149 if (!_model) {
133150 _last_error = " Failed to load model" ;
@@ -141,8 +158,8 @@ bool Llama::construct(string model_path, int n_ctx, int n_batch, int n_gpu_layer
141158 cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
142159
143160 // Q4_0 KV cache for more aggressive memory saving (Q8_0 is the higher-precision alternative)
144- cparams.type_k = GGML_TYPE_Q8_0 ;
145- cparams.type_v = GGML_TYPE_Q8_0 ;
161+ cparams.type_k = GGML_TYPE_Q4_0 ;
162+ cparams.type_v = GGML_TYPE_Q4_0 ;
146163
147164 // keep KV cache on GPU
148165 cparams.offload_kqv = true ;
@@ -331,7 +348,8 @@ bool Llama::add_message(LlamaIter &iter, const string &role, const string &conte
331348 llama_batch batch = llama_batch_get_one (prompt_tokens.data () + i, batch_size);
332349 int result = llama_decode (_ctx, batch);
333350 if (result != 0 ) {
334- _last_error = std::format (" Failed to decode batch. position:{} error:{}" , i, result);
351+ _last_error = std::format (" Failed to decode batch. position:{} error:{} [size:{}, past:{}]" ,
352+ i, result, prompt_tokens.size (), _n_past);
335353 return false ;
336354 }
337355 }
@@ -506,3 +524,44 @@ string Llama::all(LlamaIter &iter) {
506524
507525 return out;
508526}
527+
528+ LlamaMemoryInfo Llama::memory_info () {
529+ LlamaMemoryInfo info = {};
530+
531+ // KV cache usage
532+ llama_memory_t mem = llama_get_memory (_ctx);
533+ llama_pos pos_max = llama_memory_seq_pos_max (mem, 0 );
534+ int n_ctx = llama_n_ctx (_ctx);
535+ info.kv_total = n_ctx;
536+ info.kv_used = (pos_max < 0 ) ? 0 : (int )pos_max + 1 ;
537+ info.kv_percent = 100 .0f * info.kv_used / info.kv_total ;
538+
539+ // Model layers
540+ info.n_layers_total = llama_model_n_layer (_model);
541+ info.n_layers_gpu = _n_gpu_layers;
542+ info.n_layers_cpu = info.n_layers_total - info.n_layers_gpu ;
543+
544+ // ram
545+ if (read_vram (info.vram_used , info.vram_total )) {
546+ info.vram_percent = 100 .0f * info.vram_used / info.vram_total ;
547+ }
548+
549+ // Advice
550+ ostringstream advice;
551+ if (info.n_layers_cpu > 0 ) {
552+ advice << " CPU offload active (" << info.n_layers_cpu
553+ << " layers on CPU) - increase n_gpu_layers if VRAM allows. " ;
554+ }
555+ if (info.vram_percent > 90 .0f ) {
556+ advice << " VRAM >90% - reduce n_ctx or use Q4_0 KV cache. " ;
557+ } else if (info.vram_percent < 60 .0f && info.n_layers_cpu > 0 ) {
558+ advice << " VRAM headroom available - try adding more GPU layers. " ;
559+ }
560+ if (info.kv_percent > 80 .0f ) {
561+ advice << " Context >80% full - consider calling clear_history(). " ;
562+ }
563+ info.advice = advice.str ();
564+
565+ return info;
566+ }
567+
0 commit comments