
Commit 5efb881

brody-0125 and claude authored
Add CI workflow for Dart project testing and analysis (#19)
* feat: add GitHub Actions CI pipeline

  Add CI workflow with analyze (format + lint) and test jobs. SDK matrix covers stable and minimum supported version (3.10.7). Closes #9

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

* chore: add permissions and concurrency to CI workflow

  Add minimal permissions (contents: read) and concurrency group to cancel stale runs on the same PR branch.

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

* fix: remove dart format check from CI

  The existing codebase doesn't conform to dart format rules (23 files affected). The format check was not part of the original requirements in issue #9. Keep only dart analyze for lint checking.

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

* style: apply dart format to all files and restore format check in CI

  Run dart format on 23 files that had formatting violations. Re-add dart format --set-exit-if-changed check to the CI workflow.

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

* fix: resolve all dart analyze --fatal-infos issues

  - Remove deprecated avoid_returning_null_for_future lint rule
  - Add curly braces to if statement in sentencepiece_tokenizer.dart
  - Use const constructors where required by prefer_const_constructors
  - Use const for final variables initialized to constants
  - Add library directive for dangling doc comments
  - Use final for local variables per prefer_final_locals

  All 302 tests pass. dart format and dart analyze --fatal-infos clean.

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

---------

Co-authored-by: Claude <noreply@anthropic.com>
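The last bullet summarizes lint-driven cleanups that only partly appear in the diffs below (a subset of the 26 changed files is shown). A minimal, hypothetical Dart sketch of what those lints ask for — braces on single-statement ifs, const for compile-time-constant locals, and final for locals that are never reassigned; countWords is an illustration, not a function from this repository:

// Hypothetical example only: the kinds of edits dart analyze --fatal-infos
// required, not code from this repo.
int countWords(String text) {
  // Curly braces even for a single-statement if body.
  if (text.isEmpty) {
    return 0;
  }
  // const, not final, for a local initialized to a compile-time constant.
  const separator = ' ';
  // final for locals that are never reassigned (prefer_final_locals).
  final words = text.split(separator).where((w) => w.isNotEmpty);
  return words.length;
}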
1 parent 147bd0b commit 5efb881

26 files changed

Lines changed: 476 additions & 351 deletions

.github/workflows/ci.yml

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+name: CI
+
+on:
+  push:
+    branches: [main, develop]
+  pull_request:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dart-lang/setup-dart@v1
+        with:
+          sdk: stable
+      - run: dart pub get
+      - run: dart format --set-exit-if-changed .
+      - run: dart analyze --fatal-infos
+
+  test:
+    name: Test (Dart ${{ matrix.sdk }})
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        sdk: [stable, "3.10.7"]
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dart-lang/setup-dart@v1
+        with:
+          sdk: ${{ matrix.sdk }}
+      - run: dart pub get
+      - run: dart test

analysis_options.yaml

Lines changed: 0 additions & 1 deletion
@@ -15,7 +15,6 @@ linter:
   rules:
     # Error prevention
    - always_declare_return_types
-    - avoid_returning_null_for_future
    - cancel_subscriptions
    - close_sinks
    - throw_in_finally
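The rule dropped above, avoid_returning_null_for_future, is described in the commit message as deprecated: it warned about returning null from a function declared to return a Future, a pattern that sound null safety already rejects for a non-nullable Future<T>. A hypothetical Dart sketch of the compliant shape (maybeFetch is illustrative, not from this repository); returning null is only possible once the nullability is spelled out in the signature:

// Hypothetical example: with null safety, a "null future" must be declared.
Future<int>? maybeFetch({required bool enabled}) {
  if (!enabled) {
    return null; // allowed only because the return type is Future<int>?
  }
  return Future.value(42);
}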

benchmark/hf_compatibility_benchmark.dart

Lines changed: 13 additions & 15 deletions
@@ -179,12 +179,11 @@ Future<void> _runRoundTripTests(
       final normalizedDecoded = decoded.toLowerCase().trim();
 
       if (normalizedDecoded.contains(normalizedOriginal.substring(0, 5))) {
-        results.pass('Round-trip: "${text.substring(0, 20.clamp(0, text.length))}..."');
-      } else {
-        results.fail(
-          'Round-trip: "$text"',
-          'Decoded to "$decoded"',
+        results.pass(
+          'Round-trip: "${text.substring(0, 20.clamp(0, text.length))}..."',
         );
+      } else {
+        results.fail('Round-trip: "$text"', 'Decoded to "$decoded"');
       }
     }
     print('');
@@ -238,7 +237,9 @@ Future<void> _runPerformanceTest(
   if (tokensPerSec >= 500000) {
     results.pass('Throughput >= 500K tokens/sec');
   } else if (tokensPerSec >= 100000) {
-    results.info('Throughput ${_formatNumber(tokensPerSec.round())} (acceptable)');
+    results.info(
+      'Throughput ${_formatNumber(tokensPerSec.round())} (acceptable)',
+    );
     results.passed++;
   } else {
     results.fail(
@@ -302,20 +303,14 @@ class SingleEncodingTestCase {
   final String name;
   final String input;
 
-  const SingleEncodingTestCase({
-    required this.name,
-    required this.input,
-  });
+  const SingleEncodingTestCase({required this.name, required this.input});
 }
 
 class EdgeCaseTestCase {
   final String name;
   final String input;
 
-  const EdgeCaseTestCase({
-    required this.name,
-    required this.input,
-  });
+  const EdgeCaseTestCase({required this.name, required this.input});
 }
 
 const _singleEncodingTestCases = [
@@ -343,7 +338,10 @@ const _singleEncodingTestCases = [
   SingleEncodingTestCase(name: 'Mixed punctuation', input: 'Hello... World!'),
 
   // LLM related text
-  SingleEncodingTestCase(name: 'LLM prompt', input: 'What is machine learning?'),
+  SingleEncodingTestCase(
+    name: 'LLM prompt',
+    input: 'What is machine learning?',
+  ),
   SingleEncodingTestCase(
     name: 'AI sentence',
     input: 'Artificial intelligence is transforming the world.',

benchmark/performance_benchmark.dart

Lines changed: 18 additions & 9 deletions
@@ -120,7 +120,7 @@ Future<void> _runSingleEncodingBenchmark(
       'Machine learning is a subset of artificial intelligence that enables '
       'systems to learn and improve from experience without being '
       'explicitly programmed. Deep learning, a subset of machine learning, '
-      'uses neural networks with many layers.'
+      'uses neural networks with many layers.',
   ),
 ];
 
@@ -169,7 +169,9 @@ Future<void> _runBatchEncodingBenchmark(
     sw.stop();
 
     final totalTokens = results.fold<int>(0, (sum, e) => sum + e.length);
-    final tokensPerMs = (totalTokens / sw.elapsedMilliseconds).toStringAsFixed(0);
+    final tokensPerMs = (totalTokens / sw.elapsedMilliseconds).toStringAsFixed(
+      0,
+    );
     print(
       ' Batch $batchSize: ${sw.elapsedMilliseconds}ms ($tokensPerMs tokens/ms)',
     );
@@ -213,7 +215,7 @@ Future<void> _runParallelBenchmark(SentencePieceTokenizer tokenizer) async {
 
   final speedup = swPar.elapsedMilliseconds > 0
       ? (swSeq.elapsedMilliseconds / swPar.elapsedMilliseconds)
-          .toStringAsFixed(2)
+            .toStringAsFixed(2)
       : 'N/A';
   final seqMs = swSeq.elapsedMilliseconds;
   final parMs = swPar.elapsedMilliseconds;
@@ -316,7 +318,8 @@ Future<void> _runMemoryBenchmark(SentencePieceTokenizer tokenizer) async {
   print('');
 
   // Verify typed array types
-  final isOptimized = encoding.ids is Int32List &&
+  final isOptimized =
+      encoding.ids is Int32List &&
       encoding.typeIds is Uint8List &&
       encoding.attentionMask is Uint8List &&
       encoding.specialTokensMask is Uint8List;
@@ -385,7 +388,8 @@ Future<void> _runJsonSerializationBenchmark(
     tokenizer.toJson();
   }
   sw.stop();
-  final toJsonAvgMs = (sw.elapsedMicroseconds / iterations / 1000).toStringAsFixed(2);
+  final toJsonAvgMs = (sw.elapsedMicroseconds / iterations / 1000)
+      .toStringAsFixed(2);
   print(' toJson(): ${toJsonAvgMs}ms/call');
 
   // Get JSON once for fromJsonString benchmark
@@ -399,7 +403,8 @@ Future<void> _runJsonSerializationBenchmark(
     TokenizerJsonLoader.fromJsonString(json);
   }
   sw.stop();
-  final fromJsonAvgMs = (sw.elapsedMicroseconds / iterations / 1000).toStringAsFixed(2);
+  final fromJsonAvgMs = (sw.elapsedMicroseconds / iterations / 1000)
+      .toStringAsFixed(2);
   print(' fromJsonString(): ${fromJsonAvgMs}ms/call');
 
   // Verify round-trip correctness
@@ -430,7 +435,9 @@ Future<void> _runTokenAdditionBenchmark() async {
   print(' Vocab size before: $originalSize');
   print(' Vocab size after: ${tokenizer.vocabSize}');
   print(' Time to add 100: ${sw.elapsedMicroseconds}μs');
-  print(' Per token: ${(sw.elapsedMicroseconds / added).toStringAsFixed(1)}μs/token');
+  print(
+    ' Per token: ${(sw.elapsedMicroseconds / added).toStringAsFixed(1)}μs/token',
+  );
 
   // Verify encoding works with added tokens
   final encoding = tokenizer.encode('<token_0> test', addSpecialTokens: false);
@@ -443,7 +450,9 @@ Future<void> _runTokenAdditionBenchmark() async {
     tokenizer.getAddedVocab();
   }
   sw2.stop();
-  print(' getAddedVocab(): ${(sw2.elapsedMicroseconds / 1000).toStringAsFixed(2)}μs/call');
+  print(
+    ' getAddedVocab(): ${(sw2.elapsedMicroseconds / 1000).toStringAsFixed(2)}μs/call',
+  );
   print('');
 }
 
@@ -459,7 +468,7 @@ Future<void> _runTokenizeBenchmark(SentencePieceTokenizer tokenizer) async {
       'Long',
       'Machine learning is a subset of artificial intelligence that enables '
       'systems to learn and improve from experience without being '
-      'explicitly programmed. Deep learning uses neural networks.'
+      'explicitly programmed. Deep learning uses neural networks.',
   ),
 ];
 

benchmark/streaming_benchmark.dart

Lines changed: 20 additions & 11 deletions
@@ -16,9 +16,7 @@ void main() async {
   if (modelPath != null) {
     print('Loading model from: $modelPath');
     tokenizer = SentencePieceTokenizer.fromModelFileSync(modelPath);
-    print(
-      'Model loaded: ${tokenizer.vocabSize} tokens',
-    );
+    print('Model loaded: ${tokenizer.vocabSize} tokens');
   } else {
     print('No model file found. Using minimal test model.');
     tokenizer = _createMinimalTokenizer();
@@ -107,11 +105,11 @@ Future<void> _runTextStreamerBenchmark(SentencePieceTokenizer tokenizer) async {
     ('Short (10 tokens)', _generateTokens(tokenizer, 'hello world test', 10)),
     (
       'Medium (100 tokens)',
-      _generateTokens(tokenizer, 'hello world test sentence', 100)
+      _generateTokens(tokenizer, 'hello world test sentence', 100),
     ),
     (
       'Long (500 tokens)',
-      _generateTokens(tokenizer, 'hello world test sentence is', 500)
+      _generateTokens(tokenizer, 'hello world test sentence is', 500),
     ),
   ];
 
@@ -150,12 +148,17 @@ Future<void> _runTextStreamerBenchmark(SentencePieceTokenizer tokenizer) async {
 }
 
 Future<void> _runDecodeVsStreamingBenchmark(
-    SentencePieceTokenizer tokenizer) async {
+  SentencePieceTokenizer tokenizer,
+) async {
   print('-' * 70);
   print('2. BATCH decode() vs STREAMING decode');
   print('-' * 70);
 
-  final tokens = _generateTokens(tokenizer, 'hello world test sentence is', 100);
+  final tokens = _generateTokens(
+    tokenizer,
+    'hello world test sentence is',
+    100,
+  );
 
   // Warmup
   for (var i = 0; i < 10; i++) {
@@ -215,7 +218,8 @@ Future<void> _runDecodeVsStreamingBenchmark(
 }
 
 Future<void> _runCallbackOverheadBenchmark(
-    SentencePieceTokenizer tokenizer) async {
+  SentencePieceTokenizer tokenizer,
+) async {
   print('-' * 70);
   print('3. CALLBACK OVERHEAD');
   print('-' * 70);
@@ -275,7 +279,8 @@ Future<void> _runCallbackOverheadBenchmark(
 }
 
 Future<void> _runWordBoundaryHeuristicsBenchmark(
-    SentencePieceTokenizer tokenizer) async {
+  SentencePieceTokenizer tokenizer,
+) async {
   print('-' * 70);
   print('4. WORD BOUNDARY HEURISTICS');
   print('-' * 70);
@@ -300,7 +305,8 @@ Future<void> _runWordBoundaryHeuristicsBenchmark(
     streamer.end();
 
     print(
-        ' $name: ${encoding.length} tokens -> $emissionCount emissions (${(emissionCount / encoding.length * 100).toStringAsFixed(0)}% ratio)');
+      ' $name: ${encoding.length} tokens -> $emissionCount emissions (${(emissionCount / encoding.length * 100).toStringAsFixed(0)}% ratio)',
+    );
   }
   print('');
 }
@@ -333,7 +339,10 @@ Future<void> _runMemoryBenchmark(SentencePieceTokenizer tokenizer) async {
 
 /// Generate a token sequence by repeating encoding of given text.
 List<int> _generateTokens(
-    SentencePieceTokenizer tokenizer, String text, int targetLength) {
+  SentencePieceTokenizer tokenizer,
+  String text,
+  int targetLength,
+) {
   final encoding = tokenizer.encode(text, addSpecialTokens: false);
   final tokens = <int>[];
   while (tokens.length < targetLength) {

example/example.dart

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ void main() async {
   print('Padded length: ${paddedEncoding.length}'); // 32
 
   // Offset mapping
-  final text = 'Hello world';
+  const text = 'Hello world';
   final enc = tokenizer.encode(text, addSpecialTokens: false);
   for (var i = 0; i < enc.length; i++) {
     final offset = enc.offsets[i];

lib/dart_sentencepiece_tokenizer.dart

Lines changed: 1 addition & 4 deletions
@@ -18,10 +18,7 @@ export 'src/sentencepiece/sentencepiece_tokenizer.dart'
 export 'src/sentencepiece/serialization/huggingface_json.dart'
     show HuggingFaceTokenizerLoader;
 export 'src/sentencepiece/serialization/tokenizer_json.dart'
-    show
-        SentencePieceTokenizerJson,
-        TokenizerJsonLoader,
-        kTokenizerJsonVersion;
+    show SentencePieceTokenizerJson, TokenizerJsonLoader, kTokenizerJsonVersion;
 export 'src/sentencepiece/streaming/base_streamer.dart' show BaseStreamer;
 export 'src/sentencepiece/streaming/text_streamer.dart'
     show TextStreamer, OnFinalizedText;

lib/src/sentencepiece/algorithm/unigram_algorithm.dart

Lines changed: 2 additions & 6 deletions
@@ -117,9 +117,7 @@ class UnigramAlgorithm implements TokenizationAlgorithm {
 
     final codeUnit = text.codeUnitAt(start);
     // Check for high surrogate
-    if (codeUnit >= 0xD800 &&
-        codeUnit <= 0xDBFF &&
-        start + 1 < text.length) {
+    if (codeUnit >= 0xD800 && codeUnit <= 0xDBFF && start + 1 < text.length) {
       final low = text.codeUnitAt(start + 1);
       // Check for low surrogate
       if (low >= 0xDC00 && low <= 0xDFFF) {
@@ -183,9 +181,7 @@ class UnigramAlgorithm implements TokenizationAlgorithm {
 
       if (matches.isNotEmpty) {
         // Use the longest match
-        final best = matches.reduce(
-          (a, b) => a.end > b.end ? a : b,
-        );
+        final best = matches.reduce((a, b) => a.end > b.end ? a : b);
         tokens.add(best.tokenId);
         i = best.end;
       } else if (byteFallback && vocab.hasByteFallback) {

lib/src/sentencepiece/model/protobuf_reader.dart

Lines changed: 4 additions & 2 deletions
@@ -70,7 +70,8 @@ class ProtobufReader {
       throw StateError('Unexpected end of data while reading fixed32');
     }
 
-    final value = _data[_position] |
+    final value =
+        _data[_position] |
         (_data[_position + 1] << 8) |
         (_data[_position + 2] << 16) |
         (_data[_position + 3] << 24);
@@ -116,7 +117,8 @@ class ProtobufReader {
     final length = readVarint();
     if (_position + length > _data.length) {
       throw StateError(
-          'Unexpected end of data while reading bytes (need $length, have ${_data.length - _position})');
+        'Unexpected end of data while reading bytes (need $length, have ${_data.length - _position})',
+      );
     }
 
     final bytes = Uint8List.sublistView(_data, _position, _position + length);

lib/src/sentencepiece/normalizer/sp_normalizer.dart

Lines changed: 5 additions & 2 deletions
@@ -33,7 +33,9 @@ class SpNormalizer {
     }
 
     // Step 2: Add dummy prefix (space at beginning)
-    if (addDummyPrefix && result.isNotEmpty && !_isWhitespace(result.codeUnitAt(0))) {
+    if (addDummyPrefix &&
+        result.isNotEmpty &&
+        !_isWhitespace(result.codeUnitAt(0))) {
       result = ' $result';
     }
 
@@ -105,7 +107,8 @@ class SpNormalizer {
   }
 
   @override
-  String toString() => 'SpNormalizer('
+  String toString() =>
+      'SpNormalizer('
       'addDummyPrefix: $addDummyPrefix, '
       'removeExtraWhitespaces: $removeExtraWhitespaces, '
       'escapeWhitespaces: $escapeWhitespaces)';

0 commit comments
