UbiquitousLearning · Sp0tless · May 30, 2026 · coderabbitai · Nov 12, 2025
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -5,9 +5,12 @@
 #add_subdirectory(llama)
 add_subdirectory(minicpm_o)
 add_subdirectory(minicpm4)
-#add_subdirectory(qwen3)
-#add_subdirectory(qwen3_service)
-#add_subdirectory(deepseek_ocr)
+add_subdirectory(qwen3)
+add_subdirectory(qwen3_service)
+add_subdirectory(deepseek_ocr)
+add_subdirectory(smollm3_3B)
+add_subdirectory(internlm2_5)
+
 if(MLLM_BUILD_QNN_BACKEND)
   add_subdirectory(qwen_npu)
 endif()

diff --git a/examples/internlm2_5/CMakeLists.txt b/examples/internlm2_5/CMakeLists.txt
@@ -0,0 +1,3 @@
+# InternLM2.5 chat example: a single executable built from main.cpp that links
+# the MllmRT and MllmCPUBackend targets and sees the headers under
+# MLLM_INCLUDE_DIR.
+add_executable(mllm-internlm2_5-chat-runner main.cpp)
+target_link_libraries(mllm-internlm2_5-chat-runner PRIVATE MllmRT MllmCPUBackend)
+target_include_directories(mllm-internlm2_5-chat-runner PRIVATE ${MLLM_INCLUDE_DIR})
diff --git a/examples/internlm2_5/config_1.8B.json b/examples/internlm2_5/config_1.8B.json
@@ -0,0 +1,29 @@
+{
+    "architectures": [
+        "InternLM2ForCausalLM"
+    ],
+    "bias": false,
+    "bos_token_id": 1,
+    "eos_token_id": 2,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 8192,
+    "max_position_embeddings": 32768,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 8,
+    "pad_token_id": 2,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": {
+        "factor": 2.0,
+        "type": "dynamic"
+    },
+    "rope_theta": 1000000,
+    "tie_word_embeddings": false,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.34.0",
+    "use_cache": true,
+    "vocab_size": 92544,
+    "linear_impl_type": "Default"
+}
diff --git a/examples/internlm2_5/main.cpp b/examples/internlm2_5/main.cpp
@@ -0,0 +1,66 @@
+#include <iostream>
+#include <fmt/core.h>
+#include <chrono>
+#include <mllm/mllm.hpp>
+#include <mllm/models/internlm2/modeling_internlm2.hpp>
+#include <mllm/models/internlm2/tokenization_internlm2.hpp>
+
+using mllm::Argparse;
+using Clock = std::chrono::high_resolution_clock;
+
+// Single-shot InternLM2.5 chat example.
+//
+// Parses the model/tokenizer/config paths from the command line, loads the
+// model, reads one prompt line from stdin, streams at most kMaxNewTokens
+// generated tokens to stdout, and finally prints prefill/decode timing followed
+// by mllm::memoryReport().
+//
+// Command-line flags (all required except --help):
+//   -m|--model_path      model parameter file
+//   -mv|--model_version  "v2" selects ModelFileVersion::kV2; anything else kV1
+//   -t|--tokenizer_path  tokenizer JSON file
+//   -c|--config_path     model config JSON file
+MLLM_MAIN({
+  auto& help = Argparse::add<bool>("-h|--help").help("Show help message");
+  auto& model_path = Argparse::add<std::string>("-m|--model_path").help("Model path").required(true);
+  auto& model_version = Argparse::add<std::string>("-mv|--model_version").help("Model version").required(true);
+  auto& tokenizer_path = Argparse::add<std::string>("-t|--tokenizer_path").help("Tokenizer JSON path").required(true);
+  auto& config_path = Argparse::add<std::string>("-c|--config_path").help("Config path").required(true);
+  Argparse::parse(argc, argv);
+
+  // Handle --help before doing any other work.
+  if (help.isSet()) {
+    Argparse::printHelp();
+    mllm::shutdownContext();
+    return 0;
+  }
+
+  // Upper bound on the number of tokens streamed for one prompt.
+  constexpr int kMaxNewTokens = 30;
+
+  // Any value other than "v2" falls back to the V1 parameter-file format.
+  const auto file_version =
+      (model_version.get() == "v2") ? mllm::ModelFileVersion::kV2 : mllm::ModelFileVersion::kV1;
+
+  auto cfg = mllm::models::internlm2::InternLM2Config(config_path.get());
+  auto tokenizer = mllm::models::internlm2::InternLM2Tokenizer(tokenizer_path.get());
+  auto model = mllm::models::internlm2::InternLM2ForCausalLM(cfg);
+  auto params = mllm::load(model_path.get(), file_version);
+  model.load(params);
+
+  // The example ships config_1.8B.json, so the banner names the 1.8B model.
+  fmt::print("\n{:*^60}\n", " InternLM2.5 1.8B CLI ");
+
+  std::string prompt_text;
+  fmt::print("💬 Prompt text (or 'exit/quit'): ");
+  std::getline(std::cin, prompt_text);
+
+  const bool wants_exit = (prompt_text == "exit" || prompt_text == "quit");
+  if (!wants_exit) {
+    fmt::print("🔄 Processing...\n");
+    mllm::models::internlm2::InternLM2Message prompt{prompt_text};
+    auto inputs = tokenizer.convertMessage(prompt);
+
+    const auto t0 = Clock::now();
+    fmt::print("\n🤖 Response: ");
+
+    int tok_count = 0;
+    // Stays equal to t0 when no token is produced, so prefill reports 0 ms.
+    auto t_first = t0;
+    for (auto& step : model.chat(inputs)) {
+      // Time until the first token arrives is reported as the prefill cost.
+      if (tok_count == 0) { t_first = Clock::now(); }
+      auto token = tokenizer.detokenize(step.cur_token_id);
+      std::wcout << token << std::flush;
+      ++tok_count;
+      if (tok_count >= kMaxNewTokens) { break; }
+    }
+
+    const auto t_end = Clock::now();
+    const auto prefill_ms = std::chrono::duration<double, std::milli>(t_first - t0).count();
+    const auto total_s = std::chrono::duration<double>(t_end - t0).count();
+
+    fmt::print("\n{}\n", std::string(60, '-'));
+    fmt::print("⏱ Prefill: {:.0f}ms | ", prefill_ms);
+    if (tok_count > 1) {
+      // Clamp to 1 ms so the throughput division never sees a zero interval.
+      const auto decode_s = std::max(0.001, total_s - prefill_ms / 1000.0);
+      // The first token belongs to prefill, hence tok_count - 1 decoded tokens.
+      fmt::print("Decode: {:.1f} tok/s ({} tokens in {:.1f}s)\n",
+                 (tok_count - 1) / decode_s, tok_count, total_s);
+    }
+  }
+  mllm::print("\n");
+  mllm::memoryReport();
+})
diff --git a/mllm/models/internlm2/configuration_internlm2.hpp b/mllm/models/internlm2/configuration_internlm2.hpp
@@ -0,0 +1,82 @@
+// Copyright (c) MLLM Team.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+
+#include "mllm/core/aops/LinearOp.hpp"
+#include "mllm/engine/ConfigFile.hpp"
+
+namespace mllm::models::internlm2 {
+
+/// Hyper-parameters of an InternLM2 model.
+///
+/// Every field carries an in-class default. The file constructor overrides only
+/// the keys that are present in the JSON document and then derives `head_dim`,
+/// `max_cache_length` and `end_of_text_token_id` from the parsed values.
+struct InternLM2Config : protected ConfigFile {
+  InternLM2Config() = default;
+
+  /// Loads the configuration from the JSON file at `file_path`.
+  ///
+  /// @param file_path Path handed to the ConfigFile base class.
+  /// @throws std::invalid_argument if `num_attention_heads` is not positive or
+  ///         does not evenly divide `hidden_size`.
+  explicit InternLM2Config(const std::string& file_path) : ConfigFile(file_path) {
+    auto& json = data();
+
+    if (json.contains("bias")) { bias = json["bias"].get<bool>(); }
+    if (json.contains("hidden_size")) { hidden_size = json["hidden_size"].get<int32_t>(); }
+    if (json.contains("intermediate_size")) { intermediate_size = json["intermediate_size"].get<int32_t>(); }
+    if (json.contains("num_hidden_layers")) { num_hidden_layers = json["num_hidden_layers"].get<int32_t>(); }
+    if (json.contains("num_attention_heads")) { num_attention_heads = json["num_attention_heads"].get<int32_t>(); }
+    if (json.contains("num_key_value_heads")) {
+      num_key_value_heads = json["num_key_value_heads"].get<int32_t>();
+    } else {
+      // Without an explicit KV head count, use one KV head per attention head.
+      num_key_value_heads = num_attention_heads;
+    }
+    if (json.contains("max_position_embeddings")) { max_position_embeddings = json["max_position_embeddings"].get<int32_t>(); }
+    if (json.contains("rms_norm_eps")) { rms_norm_eps = json["rms_norm_eps"].get<float>(); }
+    if (json.contains("vocab_size")) { vocab_size = json["vocab_size"].get<int32_t>(); }
+    if (json.contains("rope_theta")) { rope_theta = json["rope_theta"].get<float>(); }
+    if (json.contains("tie_word_embeddings")) { tie_word_embeddings = json["tie_word_embeddings"].get<bool>(); }
+    if (json.contains("use_cache")) { use_cache = json["use_cache"].get<bool>(); }
+    if (json.contains("pad_token_id")) { pad_token_id = json["pad_token_id"].get<int32_t>(); }
+    if (json.contains("bos_token_id")) { bos_token_id = json["bos_token_id"].get<int32_t>(); }
+    if (json.contains("eos_token_id")) { eos_token_id = json["eos_token_id"].get<int32_t>(); }
+    if (json.contains("initializer_range")) { initializer_range = json["initializer_range"].get<float>(); }
+
+    if (json.contains("rope_scaling")) {
+      const auto& scaling = json["rope_scaling"];
+      if (scaling.contains("type")) { rope_scaling_type = scaling["type"].get<std::string>(); }
+      if (scaling.contains("factor")) { rope_scaling_factor = scaling["factor"].get<float>(); }
+    }
+
+    if (json.contains("linear_impl_type")) {
+      linear_impl_type = aops::str2LinearImplTypes(json["linear_impl_type"].get<std::string>());
+    }
+
+    // Reject head counts that would divide by zero or truncate head_dim.
+    if (num_attention_heads <= 0 || hidden_size % num_attention_heads != 0) {
+      throw std::invalid_argument("hidden_size (" + std::to_string(hidden_size)
+                                  + ") must be divisible by a positive num_attention_heads ("
+                                  + std::to_string(num_attention_heads) + ")");
+    }
+    head_dim = hidden_size / num_attention_heads;
+
+    // An explicit "max_cache_length" wins; otherwise use max_position_embeddings.
+    if (json.contains("max_cache_length")) {
+      max_cache_length = json["max_cache_length"].get<int32_t>();
+    } else {
+      max_cache_length = max_position_embeddings;
+    }
+
+    end_of_text_token_id = eos_token_id;
+  }
+
+  // Defaults below apply when the JSON document omits the corresponding key.
+  bool bias = false;
+  int32_t hidden_size = 4096;
+  int32_t intermediate_size = 11008;
+  int32_t num_hidden_layers = 32;
+  int32_t num_attention_heads = 32;
+  int32_t num_key_value_heads = 32;
+  int32_t max_position_embeddings = 2048;
+  int32_t max_cache_length = 2048;  // "max_cache_length" or max_position_embeddings
+  int32_t head_dim = 128;           // hidden_size / num_attention_heads
+  int32_t vocab_size = 32000;
+  float rms_norm_eps = 1e-6f;
+  float rope_theta = 10000.0f;
+  float rope_scaling_factor = 1.0f;  // "rope_scaling.factor"
+  std::string rope_scaling_type;     // "rope_scaling.type"; empty when absent
+
+  float initializer_range = 0.02f;
+  bool tie_word_embeddings = false;
+  bool use_cache = true;
+
+  int32_t pad_token_id = 0;
+  int32_t bos_token_id = 1;
+  int32_t eos_token_id = 2;
+  int32_t end_of_text_token_id = 2;  // mirrors eos_token_id after loading
+
+  aops::LinearImplTypes linear_impl_type = aops::LinearImplTypes::kDefault;
+};
+
+}  // namespace mllm::models::internlm2