fix: enable addLeadingSpace for SentencePiece unigram models

dndungu · dndungu · commit 0d1b1028bdd2 · 2026-03-26T11:20:42.000-07:00
SetSentencePiece(true) now sets addLeadingSpace=true as a persistent
field on BPETokenizer, matching llama.cpp / SentencePiece default
behavior. Previously addLeadingSpace was only a parameter passed
through the call chain. Making it a field ensures the first word of
the first text segment gets the ▁ prefix by default, so tokens like
▁What are found by the Viterbi DP instead of falling back to
character-level tokens.

Also adds SetAddLeadingSpace() for GGUF models that override the
default via tokenizer.ggml.add_space_prefix metadata. Call it after
SetSentencePiece(), which resets the flag to its own argument.
diff --git a/bpe.go b/bpe.go
@@ -37,6 +37,10 @@ type BPETokenizer struct {
 	// sentencePiece enables SentencePiece-style pre-tokenization where spaces
 	// are replaced with ▁ (U+2581) and words are split at ▁ boundaries.
 	sentencePiece bool
+	// addLeadingSpace prepends ▁ to the first word of the first text segment
+	// during SentencePiece pre-tokenization. SetSentencePiece(true) turns it on,
+	// matching llama.cpp / SentencePiece behavior; SetAddLeadingSpace overrides it.
+	addLeadingSpace bool
 	// specialTokens maps special token strings to their IDs for exact matching
 	// during encoding (e.g., "<start_of_turn>" -> 105).
 	specialTokens map[string]int
@@ -92,16 +96,17 @@ func (t *BPETokenizer) Encode(text string) ([]int, error) {
 }
 
 // encodeSegment tokenizes a text segment that contains no special tokens.
-// addLeadingSpace controls whether SentencePiece mode prepends ▁ to the text.
-func (t *BPETokenizer) encodeSegment(text string, addLeadingSpace bool) ([]int, error) {
+// isFirstSegment indicates this is the first text segment (before any special
+// tokens); ▁ is prepended only when it and the addLeadingSpace field are both set.
+func (t *BPETokenizer) encodeSegment(text string, isFirstSegment bool) ([]int, error) {
 	if text == "" {
 		return nil, nil
 	}
 	var words []string
 	if t.byteLevelBPE {
 		words = t.byteLevelPreTokenize(text)
 	} else if t.sentencePiece {
-		words = t.sentencePiecePreTokenize(text, addLeadingSpace)
+		words = t.sentencePiecePreTokenize(text, isFirstSegment && t.addLeadingSpace)
 	} else {
 		words = strings.Fields(text)
 	}
@@ -276,6 +281,15 @@ func (t *BPETokenizer) SpecialTokens() SpecialTokens {
 // are replaced with ▁ (U+2581) and the text is split at ▁ boundaries.
 func (t *BPETokenizer) SetSentencePiece(enabled bool) {
 	t.sentencePiece = enabled
+	t.addLeadingSpace = enabled // leading ▁ follows SentencePiece mode; overwrites any earlier SetAddLeadingSpace call
+}
+
+// SetAddLeadingSpace controls whether SentencePiece mode prepends ▁ to the
+// first word of the first text segment. SetSentencePiece assigns this flag from
+// its own argument (true matches llama.cpp / SentencePiece behavior), so call
+// this afterwards to override it, e.g. from GGUF tokenizer.ggml.add_space_prefix.
+func (t *BPETokenizer) SetAddLeadingSpace(enabled bool) {
+	t.addLeadingSpace = enabled
 }
 
 // SetSpecialTokenStrings registers token strings that should be matched
diff --git a/bpe_test.go b/bpe_test.go
@@ -938,6 +938,79 @@ func TestSentencePieceUnigram_ByteFallbackStillWorksForUnknownChars(t *testing.T
 	}
 }
 
+func TestSentencePieceUnigram_AddLeadingSpaceDefault(t *testing.T) {
+	// Regression test: SetSentencePiece(true) must enable addLeadingSpace so
+	// the Viterbi receives "▁What" (7 bytes) as input rather than "What" (4 bytes).
+	// Without addLeadingSpace, the ▁ prefix is missing and the Viterbi produces
+	// byte-level or character-level fallback tokens instead of matching "▁What".
+	vocab := map[string]int{
+		"<unk>":       0,
+		"<s>":        1,
+		"</s>":       2,
+		"\u2581What":    3,
+		"\u2581is":      4,
+		"\u2581the":     5,
+		"\u2581capital": 6,
+		"\u2581of":      7,
+		"\u2581France":  8,
+		"?":          9,
+		"W":          10,
+		"h":          11,
+		"a":          12,
+		"t":          13,
+	}
+	// Add the 256 byte-fallback tokens <0x00>..<0xFF>, taking IDs 14..269.
+	nextID := 14
+	for b := 0; b < 256; b++ {
+		tok := fmt.Sprintf("<0x%02X>", b)
+		vocab[tok] = nextID
+		nextID++
+	}
+
+	scores := make([]float32, nextID) // one score per token ID, indexed by ID
+	scores[0] = -100
+	scores[1] = -100
+	scores[2] = -100
+	scores[3] = -2.0  // ▁What
+	scores[4] = -2.0  // ▁is
+	scores[5] = -2.0  // ▁the
+	scores[6] = -2.0  // ▁capital
+	scores[7] = -2.0  // ▁of
+	scores[8] = -2.0  // ▁France
+	scores[9] = -3.0  // ?
+	scores[10] = -5.0 // W
+	scores[11] = -5.0 // h
+	scores[12] = -5.0 // a
+	scores[13] = -5.0 // t
+	for i := 14; i < nextID; i++ { // byte-fallback tokens score below every character token
+		scores[i] = -10.0
+	}
+
+	special := SpecialTokens{BOS: 1, EOS: 2, PAD: 0, UNK: 0} // PAD and UNK share ID 0 (<unk>)
+	tok := NewBPETokenizer(vocab, nil, special, false)
+	tok.SetSentencePiece(true) // Must also set addLeadingSpace = true
+	tok.SetScores(scores)
+
+	ids, err := tok.Encode("What is the capital of France?")
+	if err != nil {
+		t.Fatalf("Encode error: %v", err)
+	}
+	// With addLeadingSpace=true, pre-tokenizer produces:
+	//   ["▁What", "▁is", "▁the", "▁capital", "▁of", "▁France?"]
+	// The Viterbi should match ▁What (ID 3) as a single token.
+	// Without addLeadingSpace, "What" has no ▁ prefix and falls back to
+	// character tokens [W, h, a, t] — this was the bug.
+	want := []int{3, 4, 5, 6, 7, 8, 9} // ▁What ▁is ▁the ▁capital ▁of ▁France ?
+	if len(ids) != len(want) {
+		t.Fatalf("Encode produced %d tokens %v, want %d tokens %v", len(ids), ids, len(want), want)
+	}
+	for i, id := range ids {
+		if id != want[i] {
+			t.Errorf("[%d] = %d, want %d", i, id, want[i])
+		}
+	}
+}
+
 func TestDecodeSentencePieceBytes(t *testing.T) {
 	tests := []struct {
 		name  string