From 7a06dbf00889d8376af63df7225392c9d407fa8a Mon Sep 17 00:00:00 2001
From: Egor Timofeev <etimofeev@together.ai>
Date: Fri, 6 Mar 2026 13:52:44 +0100
Subject: [PATCH 1/2] Add position ids

---
 examples/tokenize_data.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/examples/tokenize_data.py b/examples/tokenize_data.py
index 327f9cd1..5c21b905 100644
--- a/examples/tokenize_data.py
+++ b/examples/tokenize_data.py
@@ -76,30 +76,40 @@ def pack_sequences(
         ['▁toys', '▁.', '</s>', '<s>', '▁but', '▁just', '▁one', '▁look']
     """
     packed_sequences = []
+    packed_position_ids = []
     buffer = []
+    position_buffer = []
 
     for input_ids in batch["input_ids"]:
-        # Add the current sequence to the buffer
+        # Position IDs reset to 0 at the start of each sub-sequence; EOS gets the next position.
+        seq_positions = list(range(len(input_ids) + 1))
         buffer.extend(input_ids)
         buffer.append(eos_token_id)  # Add EOS at the end of each sequence
+        position_buffer.extend(seq_positions)
 
         # Check if buffer needs to be split into chunks
         while len(buffer) > max_seq_len:
             # Take a full chunk from the buffer and append it to packed_sequences
             packed_sequences.append(buffer[:max_seq_len])
+            packed_position_ids.append(position_buffer[:max_seq_len])
             # Remove the processed chunk from the buffer
             buffer = buffer[max_seq_len:]
+            position_buffer = position_buffer[max_seq_len:]
 
     # Add the last buffer if it's exactly chunk_size
     if len(buffer) == max_seq_len:
         packed_sequences.append(buffer)
+        packed_position_ids.append(position_buffer)
     elif len(buffer) > cutoff_size:
         # if the buffer is larger than the cutoff size, pad it to the chunk_size
         # if not, we do not include in the packed_sequences
-        buffer.extend([pad_token_id] * (max_seq_len - len(buffer)))
+        padding_length = max_seq_len - len(buffer)
+        buffer.extend([pad_token_id] * padding_length)
+        position_buffer.extend([0] * padding_length)
         packed_sequences.append(buffer)
+        packed_position_ids.append(position_buffer)
 
-    output = {"input_ids": packed_sequences}
+    output = {"input_ids": packed_sequences, "position_ids": packed_position_ids}
     if add_labels:
         output["labels"] = [
             [
@@ -109,7 +119,6 @@ def pack_sequences(
             for example in output["input_ids"]
         ]
 
-    # mask attention for padding tokens, a better version would also mask cross-sequence dependencies
     output["attention_mask"] = [
         [0 if token_id == pad_token_id else 1 for token_id in example]
         for example in output["input_ids"]

From 98b692417f954e0aea677c1f93a7af3be506f9eb Mon Sep 17 00:00:00 2001
From: Egor Timofeev <etimofeev@together.ai>
Date: Mon, 9 Mar 2026 16:43:29 +0100
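Subject: [PATCH 2/2] Simplify packing: keep each sequence whole within a chunk

Instead of slicing the running buffer into max_seq_len chunks (which could
split one sequence across two chunks), flush the buffer, padded to
max_seq_len, before appending a sequence that would overflow it. Every
chunk therefore starts at a sequence boundary with position id 0.
Sequences that exceed max_seq_len once the EOS token is appended are
truncated to max_seq_len.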

---
 examples/tokenize_data.py | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/examples/tokenize_data.py b/examples/tokenize_data.py
index 5c21b905..314b7eaf 100644
--- a/examples/tokenize_data.py
+++ b/examples/tokenize_data.py
@@ -81,20 +81,29 @@ def pack_sequences(
     position_buffer = []
 
     for input_ids in batch["input_ids"]:
-        # Position IDs reset to 0 at the start of each sub-sequence; EOS gets the next position.
-        seq_positions = list(range(len(input_ids) + 1))
-        buffer.extend(input_ids)
-        buffer.append(eos_token_id)  # Add EOS at the end of each sequence
+        # Truncate sequences whose length with the EOS token exceeds max_seq_len (the EOS itself is cut off).
+        seq_with_eos = (input_ids + [eos_token_id])[:max_seq_len]
+        # Position IDs reset to 0 at the start of each sub-sequence.
+        seq_positions = list(range(len(seq_with_eos)))
+
+        # If adding this sequence would overflow, flush the current buffer first.
+        # This ensures every chunk starts at a sequence boundary (position_ids[0] == 0).
+        if buffer and len(buffer) + len(seq_with_eos) > max_seq_len:
+            padding_length = max_seq_len - len(buffer)
+            packed_sequences.append(buffer + [pad_token_id] * padding_length)
+            packed_position_ids.append(position_buffer + [0] * padding_length)
+            buffer = []
+            position_buffer = []
+
+        buffer.extend(seq_with_eos)
         position_buffer.extend(seq_positions)
 
-        # Check if buffer needs to be split into chunks
-        while len(buffer) > max_seq_len:
-            # Take a full chunk from the buffer and append it to packed_sequences
-            packed_sequences.append(buffer[:max_seq_len])
-            packed_position_ids.append(position_buffer[:max_seq_len])
-            # Remove the processed chunk from the buffer
-            buffer = buffer[max_seq_len:]
-            position_buffer = position_buffer[max_seq_len:]
+        # Flush immediately if exactly full (no padding needed).
+        if len(buffer) == max_seq_len:
+            packed_sequences.append(buffer)
+            packed_position_ids.append(position_buffer)
+            buffer = []
+            position_buffer = []
 
     # Add the last buffer if it's exactly chunk_size
     if len(buffer) == max_seq_len: