Skip to content

Commit 416a68d

Browse files
brody-0125 and claude authored
docs: add comments clarifying google/sentencepiece proto spec compliance for default token IDs (#21)
Closes #20

https://claude.ai/code/session_01SeqvRT7UqoHg9gCx2j3YV4

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 5efb881 commit 416a68d

4 files changed

Lines changed: 18 additions & 4 deletions

File tree

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,11 @@ final seqIdx = encoding.tokenToSequence(1);
177177

178178
### Vocabulary Access
179179

180+
> **Note:** Default special token IDs (`unkId=0, bosId=1, eosId=2, padId=-1`) follow the
181+
> [google/sentencepiece proto spec](https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto).
182+
> Some models use different values (e.g. Gemma: `pad=0, eos=1, bos=2`), which are
183+
> automatically parsed from the model file at load time.
184+
180185
```dart
181186
print(tokenizer.vocabSize); // 32000
182187
print(tokenizer.vocab.unkId); // 0

lib/src/sentencepiece/model/model_proto.dart

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ class SentencePiece {
6262
}
6363

6464
/// Training specification from the model.
65+
///
66+
/// Default token IDs (unkId=0, bosId=1, eosId=2, padId=-1) comply with the
67+
/// google/sentencepiece proto spec (sentencepiece_model.proto). Some models
68+
/// (e.g. Gemma: pad=0, eos=1, bos=2) use different values, which are
69+
/// correctly parsed from the model file at runtime.
6570
class TrainerSpec {
6671
final ModelType modelType;
6772
final int vocabSize;
@@ -78,6 +83,7 @@ class TrainerSpec {
7883
const TrainerSpec({
7984
this.modelType = ModelType.unigram,
8085
this.vocabSize = 8000,
86+
// Default token IDs per google/sentencepiece proto spec.
8187
this.unkId = 0,
8288
this.bosId = 1,
8389
this.eosId = 2,

lib/src/sentencepiece/model/sentencepiece_model.dart

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ class SentencePieceModelLoader {
120120
static TrainerSpec _parseTrainerSpec(ProtobufReader reader) {
121121
ModelType modelType = ModelType.unigram;
122122
int vocabSize = 8000;
123+
// Defaults per google/sentencepiece proto spec (sentencepiece_model.proto).
124+
// These are overridden by actual values parsed from the model file below.
123125
int unkId = 0;
124126
int bosId = 1;
125127
int eosId = 2;

test/test_utils.dart

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,13 +96,14 @@ Uint8List _createMinimalTestModel() {
9696
}
9797

9898
// Add trainer_spec (field 2)
99+
// Default token IDs follow the google/sentencepiece proto spec.
99100
final trainerSpec = _ProtobufBuilder();
100101
trainerSpec.writeVarint(1, 1); // model_type = unigram
101102
trainerSpec.writeVarint(3, testPieces.length); // vocab_size
102-
trainerSpec.writeVarint(40, 0); // unk_id
103-
trainerSpec.writeVarint(41, 1); // bos_id
104-
trainerSpec.writeVarint(42, 2); // eos_id
105-
trainerSpec.writeVarint(43, -1); // pad_id (not set)
103+
trainerSpec.writeVarint(40, 0); // unk_id (proto default)
104+
trainerSpec.writeVarint(41, 1); // bos_id (proto default)
105+
trainerSpec.writeVarint(42, 2); // eos_id (proto default)
106+
trainerSpec.writeVarint(43, -1); // pad_id (proto default, -1 = unused)
106107
builder.writeBytes(2, trainerSpec.toBytes());
107108

108109
// Add normalizer_spec (field 3)

0 commit comments

Comments
 (0)