Skip to content

Commit 66d874c

Browse files
author
Chris Warren-Smith
committed
LLAMA: implemented mem_info command
1 parent 61373ce commit 66d874c

4 files changed

Lines changed: 118 additions & 3 deletions

File tree

llama/llama-sb.cpp

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,26 @@
88
#include <format>
99
#include <span>
1010
#include <utility>
11+
#include "ggml-cuda.h"
1112

1213
#include "llama.h"
1314
#include "llama-sb.h"
1415

1516
constexpr int MAX_REPEAT = 5;
1617

18+
// Queries CUDA device 0 for its current memory usage.
//
// used:  receives the number of bytes in use (total minus free).
// total: receives the device's total memory in bytes.
//
// Returns true when the backend reported a non-zero total. Returns false,
// with both outputs set to zero, when built without GGML_USE_CUDA or when
// the backend reports a zero total.
static bool read_vram(size_t &used, size_t &total) {
  // zero both outputs up front so callers never observe stale values on failure
  used = 0;
  total = 0;
#ifdef GGML_USE_CUDA
  size_t free_bytes = 0;
  ggml_backend_cuda_get_device_memory(0, &free_bytes, &total);
  if (total > 0) {
    // guard the unsigned subtraction against a free value above the total
    used = (free_bytes <= total) ? total - free_bytes : 0;
    return true;
  }
#endif
  return false;
}
30+
1731
LlamaIter::LlamaIter() :
1832
_llama(nullptr),
1933
_repetition_count(0),
@@ -45,6 +59,7 @@ Llama::Llama() :
4559
_top_k(0),
4660
_max_tokens(0),
4761
_log_level(GGML_LOG_LEVEL_CONT),
62+
_n_gpu_layers(0),
4863
_n_past(0),
4964
_is_gemma4(false),
5065
_seed(LLAMA_DEFAULT_SEED) {
@@ -78,6 +93,7 @@ Llama::Llama(Llama &&other) noexcept
7893
, _top_k(other._top_k)
7994
, _max_tokens(other._max_tokens)
8095
, _log_level(other._log_level)
96+
, _n_gpu_layers(other._n_gpu_layers)
8197
, _n_past(other._n_past)
8298
, _is_gemma4(other._is_gemma4)
8399
, _seed(other._seed) {
@@ -128,6 +144,7 @@ bool Llama::construct(string model_path, int n_ctx, int n_batch, int n_gpu_layer
128144
}
129145

130146
_log_level = log_level;
147+
_n_gpu_layers = n_gpu_layers;
131148
_model = llama_model_load_from_file(model_path.c_str(), mparams);
132149
if (!_model) {
133150
_last_error = "Failed to load model";
@@ -141,8 +158,8 @@ bool Llama::construct(string model_path, int n_ctx, int n_batch, int n_gpu_layer
141158
cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
142159

143160
// or Q4_0 for more aggressive saving
144-
cparams.type_k = GGML_TYPE_Q8_0;
145-
cparams.type_v = GGML_TYPE_Q8_0;
161+
cparams.type_k = GGML_TYPE_Q4_0;
162+
cparams.type_v = GGML_TYPE_Q4_0;
146163

147164
// keep KV cache on GPU
148165
cparams.offload_kqv = true;
@@ -331,7 +348,8 @@ bool Llama::add_message(LlamaIter &iter, const string &role, const string &conte
331348
llama_batch batch = llama_batch_get_one(prompt_tokens.data() + i, batch_size);
332349
int result = llama_decode(_ctx, batch);
333350
if (result != 0) {
334-
_last_error = std::format("Failed to decode batch. position:{} error:{}", i, result);
351+
_last_error = std::format("Failed to decode batch. position:{} error:{} [size:{}, past:{}]",
352+
i, result, prompt_tokens.size(), _n_past);
335353
return false;
336354
}
337355
}
@@ -506,3 +524,44 @@ string Llama::all(LlamaIter &iter) {
506524

507525
return out;
508526
}
527+
528+
// Builds a snapshot of KV cache occupancy, layer placement and VRAM usage,
// together with a short human-readable advice string.
//
// Returns a zero-initialised LlamaMemoryInfo (empty advice) when no model or
// context is available. VRAM fields remain zero when read_vram() fails.
LlamaMemoryInfo Llama::memory_info() {
  LlamaMemoryInfo info = {};

  // nothing to report until both the model and the context exist
  if (!_ctx || !_model) {
    return info;
  }

  // KV cache usage: highest position stored for sequence 0, plus one
  llama_memory_t mem = llama_get_memory(_ctx);
  const llama_pos pos_max = llama_memory_seq_pos_max(mem, 0);
  info.kv_total = static_cast<int>(llama_n_ctx(_ctx));
  info.kv_used = (pos_max < 0) ? 0 : static_cast<int>(pos_max) + 1;
  if (info.kv_total > 0) {
    info.kv_percent = 100.0f * info.kv_used / info.kv_total;
  }

  // Model layers. The requested GPU layer count can exceed the model's layer
  // count (callers often pass a large value to mean "everything"), so clamp it
  // to keep n_layers_cpu from going negative.
  // NOTE(review): a negative request is treated as "all layers" here, which is
  // how recent llama.h documents n_gpu_layers - confirm against the linked version.
  info.n_layers_total = llama_model_n_layer(_model);
  int gpu_layers = _n_gpu_layers;
  if (gpu_layers < 0 || gpu_layers > info.n_layers_total) {
    gpu_layers = info.n_layers_total;
  }
  info.n_layers_gpu = gpu_layers;
  info.n_layers_cpu = info.n_layers_total - gpu_layers;

  // VRAM
  const bool have_vram = read_vram(info.vram_used, info.vram_total);
  if (have_vram) {
    info.vram_percent = 100.0f * info.vram_used / info.vram_total;
  }

  // Advice
  ostringstream advice;
  if (info.n_layers_cpu > 0) {
    advice << "CPU offload active (" << info.n_layers_cpu
           << " layers on CPU) - increase n_gpu_layers if VRAM allows. ";
  }
  // only comment on VRAM when it was actually measured; otherwise the zero
  // default would be misreported as available headroom
  if (have_vram) {
    if (info.vram_percent > 90.0f) {
      advice << "VRAM >90% - reduce n_ctx or use Q4_0 KV cache. ";
    } else if (info.vram_percent < 60.0f && info.n_layers_cpu > 0) {
      advice << "VRAM headroom available - try adding more GPU layers. ";
    }
  }
  if (info.kv_percent > 80.0f) {
    advice << "Context >80% full - consider calling clear_history(). ";
  }
  info.advice = advice.str();

  return info;
}
567+

llama/llama-sb.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,26 @@
1414

1515
using namespace std;
1616

17+
// Snapshot of memory usage returned by Llama::memory_info().
// Every scalar member defaults to zero so a default-constructed instance is
// fully defined even without `= {}`.
struct LlamaMemoryInfo {
  // KV cache
  int kv_used = 0;           // slots currently used
  int kv_total = 0;          // total slots (== n_ctx)
  float kv_percent = 0.0f;   // 100 * kv_used / kv_total

  // GPU VRAM (via ggml backend)
  size_t vram_used = 0;      // bytes
  size_t vram_total = 0;     // bytes
  float vram_percent = 0.0f; // 100 * vram_used / vram_total

  // Model layers
  int n_layers_total = 0;    // total model layers
  int n_layers_gpu = 0;      // layers offloaded to GPU
  int n_layers_cpu = 0;      // layers on CPU

  // Advice
  std::string advice;        // human-readable tuning hints, may be empty
};
36+
1737
struct Llama;
1838

1939
struct LlamaIter {
@@ -75,6 +95,9 @@ struct Llama {
7595
void set_log_level(int level) { _log_level = level; }
7696
void reset();
7797

98+
// memory info
99+
LlamaMemoryInfo memory_info();
100+
78101
private:
79102
bool ends_with_sentence_boundary(const string &out);
80103
bool configure_sampler();
@@ -102,6 +125,7 @@ struct Llama {
102125
int _top_k;
103126
int _max_tokens;
104127
int _log_level;
128+
int _n_gpu_layers;
105129
int _n_past;
106130
bool _is_gemma4;
107131
unsigned int _seed;

llama/main.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,35 @@ static int cmd_llama_add_message(var_s *self, int argc, slib_par_t *arg, var_s *
431431
return result;
432432
}
433433

434+
//
435+
// print llama.mem_info()
436+
//
437+
static int cmd_llama_mem_info(var_s *self, int argc, slib_par_t *arg, var_s *retval) {
438+
int result = 0;
439+
if (argc != 0) {
440+
error(retval, "llama.mem_info", 0, 0);
441+
} else {
442+
int id = get_llama_class_id(self, retval);
443+
if (id != -1) {
444+
Llama &llama = g_llama.at(id);
445+
auto mem_info = llama.memory_info();
446+
map_init(retval);
447+
v_setint(map_add_var(retval, "kv_used", 0), mem_info.kv_used);
448+
v_setint(map_add_var(retval, "kv_total", 0), mem_info.kv_total);
449+
v_setreal(map_add_var(retval, "kv_percent", 0), mem_info.kv_percent);
450+
v_setint(map_add_var(retval, "vram_used", 0), mem_info.vram_used);
451+
v_setint(map_add_var(retval, "vram_total", 0), mem_info.vram_total);
452+
v_setreal(map_add_var(retval, "vram_percent", 0), mem_info.vram_percent);
453+
v_setint(map_add_var(retval, "n_layers_cpu", 0), mem_info.n_layers_cpu);
454+
v_setint(map_add_var(retval, "n_layers_gpu", 0), mem_info.n_layers_gpu);
455+
v_setint(map_add_var(retval, "n_layers_total", 0), mem_info.n_layers_total);
456+
v_setstr(map_add_var(retval, "advice", 0), mem_info.advice.c_str());
457+
result = 1;
458+
}
459+
}
460+
return result;
461+
}
462+
434463
static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
435464
int result;
436465
auto model = expand_path(get_param_str(argc, params, 0, ""));
@@ -456,6 +485,7 @@ static int cmd_create_llama(int argc, slib_par_t *params, var_t *retval) {
456485
v_create_callback(retval, "set_top_p", cmd_llama_set_top_p);
457486
v_create_callback(retval, "set_grammar", cmd_llama_set_grammar);
458487
v_create_callback(retval, "set_seed", cmd_llama_set_seed);
488+
v_create_callback(retval, "mem_info", cmd_llama_mem_info);
459489
result = 1;
460490
} else {
461491
error(retval, llama.last_error());

llama/samples/nitro.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ Available commands:
5353
- TOOL:LIST `[directory_path. items enclosed in square brackets (`[...]`) represent directories within the file listing output]`
5454
- TOOL:READ `[file_path]`
5555
- TOOL:WRITE `[file_path]`
56+
- TOOL:EXISTS `[file_path]`
57+
- TOOL:PERMISSION `[Request user permission before overwriting a file]`
5658
- TOOL:DATE `[Returns the current date as string with format “DD/MM/YYYY”]`
5759
- TOOL:TIME `[Returns the time in “HH:MM:SS” format]`
5860
- TOOL:RND `[Returns a random number between 0 and 1]`

0 commit comments

Comments
 (0)