Skip to content

Commit 0d1b102

Browse files
committed
fix: enable addLeadingSpace for SentencePiece unigram models
SetSentencePiece(true) now also sets addLeadingSpace=true, a new persistent field on BPETokenizer, matching the llama.cpp / SentencePiece default behavior. Previously addLeadingSpace existed only as a parameter passed through the call chain. Making it a field ensures that the first word of the first text segment gets the ▁ prefix prepended, so tokens like ▁What are found by the Viterbi DP instead of falling back to character-level tokens. This commit also adds SetAddLeadingSpace() for GGUF models that override the default via the tokenizer.ggml.add_space_prefix metadata key.
1 parent 45a5ae0 commit 0d1b102

2 files changed

Lines changed: 90 additions & 3 deletions

File tree

bpe.go

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ type BPETokenizer struct {
3737
// sentencePiece enables SentencePiece-style pre-tokenization where spaces
3838
// are replaced with ▁ (U+2581) and words are split at ▁ boundaries.
3939
sentencePiece bool
40+
// addLeadingSpace prepends ▁ to the first word during SentencePiece
41+
// pre-tokenization. This is true by default for SentencePiece models,
42+
// matching llama.cpp / SentencePiece behavior.
43+
addLeadingSpace bool
4044
// specialTokens maps special token strings to their IDs for exact matching
4145
// during encoding (e.g., "<start_of_turn>" -> 105).
4246
specialTokens map[string]int
@@ -92,16 +96,17 @@ func (t *BPETokenizer) Encode(text string) ([]int, error) {
9296
}
9397

9498
// encodeSegment tokenizes a text segment that contains no special tokens.
95-
// addLeadingSpace controls whether SentencePiece mode prepends ▁ to the text.
96-
func (t *BPETokenizer) encodeSegment(text string, addLeadingSpace bool) ([]int, error) {
99+
// isFirstSegment indicates this is the first text segment (before any special
100+
// tokens), which determines whether the addLeadingSpace field applies.
101+
func (t *BPETokenizer) encodeSegment(text string, isFirstSegment bool) ([]int, error) {
97102
if text == "" {
98103
return nil, nil
99104
}
100105
var words []string
101106
if t.byteLevelBPE {
102107
words = t.byteLevelPreTokenize(text)
103108
} else if t.sentencePiece {
104-
words = t.sentencePiecePreTokenize(text, addLeadingSpace)
109+
words = t.sentencePiecePreTokenize(text, isFirstSegment && t.addLeadingSpace)
105110
} else {
106111
words = strings.Fields(text)
107112
}
@@ -276,6 +281,15 @@ func (t *BPETokenizer) SpecialTokens() SpecialTokens {
276281
// are replaced with ▁ (U+2581) and the text is split at ▁ boundaries.
277282
func (t *BPETokenizer) SetSentencePiece(enabled bool) {
278283
t.sentencePiece = enabled
284+
t.addLeadingSpace = enabled
285+
}
286+
287+
// SetAddLeadingSpace controls whether SentencePiece mode prepends ▁ to the
288+
// first word. By default this is set to true when SetSentencePiece(true) is called,
289+
// matching llama.cpp / SentencePiece behavior. GGUF models may override this
290+
// via the tokenizer.ggml.add_space_prefix metadata key. Call this after SetSentencePiece, which overwrites the field.
291+
func (t *BPETokenizer) SetAddLeadingSpace(enabled bool) {
292+
t.addLeadingSpace = enabled
279293
}
280294

281295
// SetSpecialTokenStrings registers token strings that should be matched

bpe_test.go

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -938,6 +938,79 @@ func TestSentencePieceUnigram_ByteFallbackStillWorksForUnknownChars(t *testing.T
938938
}
939939
}
940940

941+
func TestSentencePieceUnigram_AddLeadingSpaceDefault(t *testing.T) {
942+
// Regression test: SetSentencePiece(true) must enable addLeadingSpace so
943+
// the Viterbi receives "▁What" (7 bytes) as input rather than "What" (4 bytes).
944+
// Without addLeadingSpace, the ▁ prefix is missing and the Viterbi produces
945+
// byte-level or character-level fallback tokens instead of matching "▁What".
946+
vocab := map[string]int{
947+
"<unk>": 0,
948+
"<s>": 1,
949+
"</s>": 2,
950+
"\u2581What": 3,
951+
"\u2581is": 4,
952+
"\u2581the": 5,
953+
"\u2581capital": 6,
954+
"\u2581of": 7,
955+
"\u2581France": 8,
956+
"?": 9,
957+
"W": 10,
958+
"h": 11,
959+
"a": 12,
960+
"t": 13,
961+
}
962+
// Add byte fallback tokens.
963+
nextID := 14
964+
for b := 0; b < 256; b++ {
965+
tok := fmt.Sprintf("<0x%02X>", b)
966+
vocab[tok] = nextID
967+
nextID++
968+
}
969+
970+
scores := make([]float32, nextID)
971+
scores[0] = -100
972+
scores[1] = -100
973+
scores[2] = -100
974+
scores[3] = -2.0 // ▁What
975+
scores[4] = -2.0 // ▁is
976+
scores[5] = -2.0 // ▁the
977+
scores[6] = -2.0 // ▁capital
978+
scores[7] = -2.0 // ▁of
979+
scores[8] = -2.0 // ▁France
980+
scores[9] = -3.0 // ?
981+
scores[10] = -5.0 // W
982+
scores[11] = -5.0 // h
983+
scores[12] = -5.0 // a
984+
scores[13] = -5.0 // t
985+
for i := 14; i < nextID; i++ {
986+
scores[i] = -10.0
987+
}
988+
989+
special := SpecialTokens{BOS: 1, EOS: 2, PAD: 0, UNK: 0}
990+
tok := NewBPETokenizer(vocab, nil, special, false)
991+
tok.SetSentencePiece(true) // Must also set addLeadingSpace = true
992+
tok.SetScores(scores)
993+
994+
ids, err := tok.Encode("What is the capital of France?")
995+
if err != nil {
996+
t.Fatalf("Encode error: %v", err)
997+
}
998+
// With addLeadingSpace=true, pre-tokenizer produces:
999+
// ["▁What", "▁is", "▁the", "▁capital", "▁of", "▁France?"]
1000+
// The Viterbi should match ▁What (ID 3) as a single token.
1001+
// Without addLeadingSpace, "What" has no ▁ prefix and falls back to
1002+
// character tokens [W, h, a, t] — this was the bug.
1003+
want := []int{3, 4, 5, 6, 7, 8, 9}
1004+
if len(ids) != len(want) {
1005+
t.Fatalf("Encode produced %d tokens %v, want %d tokens %v", len(ids), ids, len(want), want)
1006+
}
1007+
for i, id := range ids {
1008+
if id != want[i] {
1009+
t.Errorf("[%d] = %d, want %d", i, id, want[i])
1010+
}
1011+
}
1012+
}
1013+
9411014
func TestDecodeSentencePieceBytes(t *testing.T) {
9421015
tests := []struct {
9431016
name string

0 commit comments

Comments
 (0)