From 37684b5e3a1eb9dc7b8268a8fc8b02bfedddac09 Mon Sep 17 00:00:00 2001
From: Paulo Vitor <paulovitor_pe@hotmail.com>
Date: Mon, 6 Apr 2026 16:36:17 -0300
Subject: [PATCH 1/2] fix(vllm): Enhance VLLMModel context size handling for
 batch inputs

---
 src/lighteval/models/vllm/vllm_model.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 3100c56b7..0c0d313f7 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -355,7 +355,10 @@ def _greedy_until(
             # The choice we go for here is to avoid truncating the prompt if we can, since it
             # should have been managed by the prompt creator/few shot manager if requested by the user.
             inputs = tokenized["input_ids"]
-            context_size = len(inputs[0])
+            # Use the longest prompt in the batch (worst case) for truncation decisions,
+            # not only the first item — otherwise shorter first samples can skip truncation
+            # while longer later samples still exceed max_length.
+            context_size = max((len(input_ids) for input_ids in inputs), default=0)
 
             # left truncate the inputs to the maximum length
             if self.max_length is None:
@@ -365,7 +368,8 @@ def _greedy_until(
             elif max_new_tokens is not None:
                 if context_size + max_new_tokens > self.max_length:
                     logger.warning(
-                        f"{context_size + max_new_tokens=} which is greater than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens."
+                        f"max_prompt_len_in_batch={context_size}, {context_size + max_new_tokens=} which is greater than {self.max_length=}. "
+                        f"Truncating each context to {self.max_length - max_new_tokens} tokens (based on longest sample in batch)."
                     )
                     context_size = self.max_length - max_new_tokens
                     if context_size < 0:
@@ -377,7 +381,8 @@ def _greedy_until(
             else:
                 if context_size > self.max_length:
                     logger.warning(
-                        f"{context_size=} which is greater than {self.max_length=}. Truncating context to {self.max_length} tokens."
+                        f"max_prompt_len_in_batch={context_size} which is greater than {self.max_length=}. "
+                        f"Truncating each context to {self.max_length} tokens (based on longest sample in batch)."
                     )
                     context_size = self.max_length
                     inputs = [input[-context_size:] for input in inputs]

From af58b74cace69f1839663fc65f739e76f06667e9 Mon Sep 17 00:00:00 2001
From: Paulo Vitor <paulovitor_pe@hotmail.com>
Date: Thu, 7 May 2026 11:09:50 -0300
Subject: [PATCH 2/2] fix(utils): return empty string from remove_reasoning_tags when text is None

---
 src/lighteval/utils/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/lighteval/utils/utils.py b/src/lighteval/utils/utils.py
index 3ab5976d8..c7ded9b14 100644
--- a/src/lighteval/utils/utils.py
+++ b/src/lighteval/utils/utils.py
@@ -305,6 +305,9 @@ def remove_reasoning_tags(text: str, tag_pairs: list[tuple[str, str]]) -> str:
     """
     result = text
 
+    if result is None:
+        return ""
+
     for start_tag, end_tag in tag_pairs:
         while start_tag in result and end_tag in result:
             start = result.find(start_tag)