
Commit 5efb881

brody-0125 and claude authored
Add CI workflow for Dart project testing and analysis (#19)
* feat: add GitHub Actions CI pipeline

  Add CI workflow with analyze (format + lint) and test jobs. SDK matrix covers stable and minimum supported version (3.10.7). Closes #9

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

* chore: add permissions and concurrency to CI workflow

  Add minimal permissions (contents: read) and concurrency group to cancel stale runs on the same PR branch.

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

* fix: remove dart format check from CI

  The existing codebase doesn't conform to dart format rules (23 files affected). The format check was not part of the original requirements in issue #9. Keep only dart analyze for lint checking.

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

* style: apply dart format to all files and restore format check in CI

  Run dart format on 23 files that had formatting violations. Re-add dart format --set-exit-if-changed check to the CI workflow.

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

* fix: resolve all dart analyze --fatal-infos issues

  - Remove deprecated avoid_returning_null_for_future lint rule
  - Add curly braces to if statement in sentencepiece_tokenizer.dart
  - Use const constructors where required by prefer_const_constructors
  - Use const for final variables initialized to constants
  - Add library directive for dangling doc comments
  - Use final for local variables per prefer_final_locals

  All 302 tests pass. dart format and dart analyze --fatal-infos clean.

  https://claude.ai/code/session_015mxeVgfv72ATnxeGNGag6z

---------

Co-authored-by: Claude <noreply@anthropic.com>
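The last bullet summarizes lint-driven cleanups that only partly appear in the diffs below (a subset of the 26 changed files is shown). A minimal, hypothetical Dart sketch of what those lints ask for — braces on single-statement ifs, const for compile-time-constant locals, and final for locals that are never reassigned; countWords is an illustration, not a function from this repository:

// Hypothetical example only: the kinds of edits dart analyze --fatal-infos
// required, not code from this repo.
int countWords(String text) {
  // Curly braces even for a single-statement if body.
  if (text.isEmpty) {
    return 0;
  }
  // const, not final, for a local initialized to a compile-time constant.
  const separator = ' ';
  // final for locals that are never reassigned (prefer_final_locals).
  final words = text.split(separator).where((w) => w.isNotEmpty);
  return words.length;
}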
1 parent 147bd0b commit 5efb881

26 files changed

Lines changed: 476 additions & 351 deletions

.github/workflows/ci.yml

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+name: CI
+
+on:
+  push:
+    branches: [main, develop]
+  pull_request:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dart-lang/setup-dart@v1
+        with:
+          sdk: stable
+      - run: dart pub get
+      - run: dart format --set-exit-if-changed .
+      - run: dart analyze --fatal-infos
+
+  test:
+    name: Test (Dart ${{ matrix.sdk }})
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        sdk: [stable, "3.10.7"]
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dart-lang/setup-dart@v1
+        with:
+          sdk: ${{ matrix.sdk }}
+      - run: dart pub get
+      - run: dart test

analysis_options.yaml

Lines changed: 0 additions & 1 deletion
@@ -15,7 +15,6 @@ linter:
   rules:
     # Error prevention
    - always_declare_return_types
-    - avoid_returning_null_for_future
    - cancel_subscriptions
    - close_sinks
    - throw_in_finally
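The rule dropped above, avoid_returning_null_for_future, is described in the commit message as deprecated: it warned about returning null from a function declared to return a Future, a pattern that sound null safety already rejects for a non-nullable Future<T>. A hypothetical Dart sketch of the compliant shape (maybeFetch is illustrative, not from this repository); returning null is only possible once the nullability is spelled out in the signature:

// Hypothetical example: with null safety, a "null future" must be declared.
Future<int>? maybeFetch({required bool enabled}) {
  if (!enabled) {
    return null; // allowed only because the return type is Future<int>?
  }
  return Future.value(42);
}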

benchmark/hf_compatibility_benchmark.dart

Lines changed: 13 additions & 15 deletions
@@ -179,12 +179,11 @@ Future<void> _runRoundTripTests(
       final normalizedDecoded = decoded.toLowerCase().trim();
 
       if (normalizedDecoded.contains(normalizedOriginal.substring(0, 5))) {
-        results.pass('Round-trip: "${text.substring(0, 20.clamp(0, text.length))}..."');
-      } else {
-        results.fail(
-          'Round-trip: "$text"',
-          'Decoded to "$decoded"',
+        results.pass(
+          'Round-trip: "${text.substring(0, 20.clamp(0, text.length))}..."',
         );
+      } else {
+        results.fail('Round-trip: "$text"', 'Decoded to "$decoded"');
       }
     }
     print('');
@@ -238,7 +237,9 @@ Future<void> _runPerformanceTest(
   if (tokensPerSec >= 500000) {
     results.pass('Throughput >= 500K tokens/sec');
   } else if (tokensPerSec >= 100000) {
-    results.info('Throughput ${_formatNumber(tokensPerSec.round())} (acceptable)');
+    results.info(
+      'Throughput ${_formatNumber(tokensPerSec.round())} (acceptable)',
+    );
     results.passed++;
   } else {
     results.fail(
@@ -302,20 +303,14 @@ class SingleEncodingTestCase {
   final String name;
   final String input;
 
-  const SingleEncodingTestCase({
-    required this.name,
-    required this.input,
-  });
+  const SingleEncodingTestCase({required this.name, required this.input});
 }
 
 class EdgeCaseTestCase {
   final String name;
   final String input;
 
-  const EdgeCaseTestCase({
-    required this.name,
-    required this.input,
-  });
+  const EdgeCaseTestCase({required this.name, required this.input});
 }
 
 const _singleEncodingTestCases = [
@@ -343,7 +338,10 @@ const _singleEncodingTestCases = [
   SingleEncodingTestCase(name: 'Mixed punctuation', input: 'Hello... World!'),
 
   // LLM related text
-  SingleEncodingTestCase(name: 'LLM prompt', input: 'What is machine learning?'),
+  SingleEncodingTestCase(
+    name: 'LLM prompt',
+    input: 'What is machine learning?',
+  ),
   SingleEncodingTestCase(
     name: 'AI sentence',
     input: 'Artificial intelligence is transforming the world.',

benchmark/performance_benchmark.dart

Lines changed: 18 additions & 9 deletions
@@ -120,7 +120,7 @@ Future<void> _runSingleEncodingBenchmark(
       'Machine learning is a subset of artificial intelligence that enables '
       'systems to learn and improve from experience without being '
       'explicitly programmed. Deep learning, a subset of machine learning, '
-      'uses neural networks with many layers.'
+      'uses neural networks with many layers.',
   ),
 ];
 
@@ -169,7 +169,9 @@ Future<void> _runBatchEncodingBenchmark(
     sw.stop();
 
     final totalTokens = results.fold<int>(0, (sum, e) => sum + e.length);
-    final tokensPerMs = (totalTokens / sw.elapsedMilliseconds).toStringAsFixed(0);
+    final tokensPerMs = (totalTokens / sw.elapsedMilliseconds).toStringAsFixed(
+      0,
+    );
     print(
       ' Batch $batchSize: ${sw.elapsedMilliseconds}ms ($tokensPerMs tokens/ms)',
     );
@@ -213,7 +215,7 @@ Future<void> _runParallelBenchmark(SentencePieceTokenizer tokenizer) async {
 
   final speedup = swPar.elapsedMilliseconds > 0
       ? (swSeq.elapsedMilliseconds / swPar.elapsedMilliseconds)
-          .toStringAsFixed(2)
+            .toStringAsFixed(2)
       : 'N/A';
   final seqMs = swSeq.elapsedMilliseconds;
   final parMs = swPar.elapsedMilliseconds;
@@ -316,7 +318,8 @@ Future<void> _runMemoryBenchmark(SentencePieceTokenizer tokenizer) async {
   print('');
 
   // Verify typed array types
-  final isOptimized = encoding.ids is Int32List &&
+  final isOptimized =
+      encoding.ids is Int32List &&
       encoding.typeIds is Uint8List &&
       encoding.attentionMask is Uint8List &&
       encoding.specialTokensMask is Uint8List;
@@ -385,7 +388,8 @@ Future<void> _runJsonSerializationBenchmark(
     tokenizer.toJson();
   }
   sw.stop();
-  final toJsonAvgMs = (sw.elapsedMicroseconds / iterations / 1000).toStringAsFixed(2);
+  final toJsonAvgMs = (sw.elapsedMicroseconds / iterations / 1000)
+      .toStringAsFixed(2);
   print(' toJson(): ${toJsonAvgMs}ms/call');
 
   // Get JSON once for fromJsonString benchmark
@@ -399,7 +403,8 @@ Future<void> _runJsonSerializationBenchmark(
     TokenizerJsonLoader.fromJsonString(json);
   }
   sw.stop();
-  final fromJsonAvgMs = (sw.elapsedMicroseconds / iterations / 1000).toStringAsFixed(2);
+  final fromJsonAvgMs = (sw.elapsedMicroseconds / iterations / 1000)
+      .toStringAsFixed(2);
   print(' fromJsonString(): ${fromJsonAvgMs}ms/call');
 
   // Verify round-trip correctness
@@ -430,7 +435,9 @@ Future<void> _runTokenAdditionBenchmark() async {
   print(' Vocab size before: $originalSize');
   print(' Vocab size after: ${tokenizer.vocabSize}');
   print(' Time to add 100: ${sw.elapsedMicroseconds}μs');
-  print(' Per token: ${(sw.elapsedMicroseconds / added).toStringAsFixed(1)}μs/token');
+  print(
+    ' Per token: ${(sw.elapsedMicroseconds / added).toStringAsFixed(1)}μs/token',
+  );
 
   // Verify encoding works with added tokens
   final encoding = tokenizer.encode('<token_0> test', addSpecialTokens: false);
@@ -443,7 +450,9 @@ Future<void> _runTokenAdditionBenchmark() async {
     tokenizer.getAddedVocab();
   }
   sw2.stop();
-  print(' getAddedVocab(): ${(sw2.elapsedMicroseconds / 1000).toStringAsFixed(2)}μs/call');
+  print(
+    ' getAddedVocab(): ${(sw2.elapsedMicroseconds / 1000).toStringAsFixed(2)}μs/call',
+  );
   print('');
 }
 
@@ -459,7 +468,7 @@ Future<void> _runTokenizeBenchmark(SentencePieceTokenizer tokenizer) async {
       'Long',
       'Machine learning is a subset of artificial intelligence that enables '
       'systems to learn and improve from experience without being '
-      'explicitly programmed. Deep learning uses neural networks.'
+      'explicitly programmed. Deep learning uses neural networks.',
   ),
 ];
 

benchmark/streaming_benchmark.dart

Lines changed: 20 additions & 11 deletions
@@ -16,9 +16,7 @@ void main() async {
   if (modelPath != null) {
     print('Loading model from: $modelPath');
     tokenizer = SentencePieceTokenizer.fromModelFileSync(modelPath);
-    print(
-      'Model loaded: ${tokenizer.vocabSize} tokens',
-    );
+    print('Model loaded: ${tokenizer.vocabSize} tokens');
   } else {
     print('No model file found. Using minimal test model.');
     tokenizer = _createMinimalTokenizer();
@@ -107,11 +105,11 @@ Future<void> _runTextStreamerBenchmark(SentencePieceTokenizer tokenizer) async {
     ('Short (10 tokens)', _generateTokens(tokenizer, 'hello world test', 10)),
     (
       'Medium (100 tokens)',
-      _generateTokens(tokenizer, 'hello world test sentence', 100)
+      _generateTokens(tokenizer, 'hello world test sentence', 100),
     ),
     (
       'Long (500 tokens)',
-      _generateTokens(tokenizer, 'hello world test sentence is', 500)
+      _generateTokens(tokenizer, 'hello world test sentence is', 500),
     ),
   ];
 
@@ -150,12 +148,17 @@ Future<void> _runTextStreamerBenchmark(SentencePieceTokenizer tokenizer) async {
 }
 
 Future<void> _runDecodeVsStreamingBenchmark(
-    SentencePieceTokenizer tokenizer) async {
+  SentencePieceTokenizer tokenizer,
+) async {
   print('-' * 70);
   print('2. BATCH decode() vs STREAMING decode');
   print('-' * 70);
 
-  final tokens = _generateTokens(tokenizer, 'hello world test sentence is', 100);
+  final tokens = _generateTokens(
+    tokenizer,
+    'hello world test sentence is',
+    100,
+  );
 
   // Warmup
   for (var i = 0; i < 10; i++) {
@@ -215,7 +218,8 @@ Future<void> _runDecodeVsStreamingBenchmark(
 }
 
 Future<void> _runCallbackOverheadBenchmark(
-    SentencePieceTokenizer tokenizer) async {
+  SentencePieceTokenizer tokenizer,
+) async {
   print('-' * 70);
   print('3. CALLBACK OVERHEAD');
   print('-' * 70);
@@ -275,7 +279,8 @@ Future<void> _runCallbackOverheadBenchmark(
 }
 
 Future<void> _runWordBoundaryHeuristicsBenchmark(
-    SentencePieceTokenizer tokenizer) async {
+  SentencePieceTokenizer tokenizer,
+) async {
   print('-' * 70);
   print('4. WORD BOUNDARY HEURISTICS');
   print('-' * 70);
@@ -300,7 +305,8 @@ Future<void> _runWordBoundaryHeuristicsBenchmark(
     streamer.end();
 
     print(
-        ' $name: ${encoding.length} tokens -> $emissionCount emissions (${(emissionCount / encoding.length * 100).toStringAsFixed(0)}% ratio)');
+      ' $name: ${encoding.length} tokens -> $emissionCount emissions (${(emissionCount / encoding.length * 100).toStringAsFixed(0)}% ratio)',
+    );
   }
   print('');
 }
@@ -333,7 +339,10 @@ Future<void> _runMemoryBenchmark(SentencePieceTokenizer tokenizer) async {
 
 /// Generate a token sequence by repeating encoding of given text.
 List<int> _generateTokens(
-    SentencePieceTokenizer tokenizer, String text, int targetLength) {
+  SentencePieceTokenizer tokenizer,
+  String text,
+  int targetLength,
+) {
   final encoding = tokenizer.encode(text, addSpecialTokens: false);
   final tokens = <int>[];
   while (tokens.length < targetLength) {

example/example.dart

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ void main() async {
   print('Padded length: ${paddedEncoding.length}'); // 32
 
   // Offset mapping
-  final text = 'Hello world';
+  const text = 'Hello world';
   final enc = tokenizer.encode(text, addSpecialTokens: false);
   for (var i = 0; i < enc.length; i++) {
     final offset = enc.offsets[i];

lib/dart_sentencepiece_tokenizer.dart

Lines changed: 1 addition & 4 deletions
@@ -18,10 +18,7 @@ export 'src/sentencepiece/sentencepiece_tokenizer.dart'
 export 'src/sentencepiece/serialization/huggingface_json.dart'
     show HuggingFaceTokenizerLoader;
 export 'src/sentencepiece/serialization/tokenizer_json.dart'
-    show
-        SentencePieceTokenizerJson,
-        TokenizerJsonLoader,
-        kTokenizerJsonVersion;
+    show SentencePieceTokenizerJson, TokenizerJsonLoader, kTokenizerJsonVersion;
 export 'src/sentencepiece/streaming/base_streamer.dart' show BaseStreamer;
 export 'src/sentencepiece/streaming/text_streamer.dart'
     show TextStreamer, OnFinalizedText;

lib/src/sentencepiece/algorithm/unigram_algorithm.dart

Lines changed: 2 additions & 6 deletions
@@ -117,9 +117,7 @@ class UnigramAlgorithm implements TokenizationAlgorithm {
 
     final codeUnit = text.codeUnitAt(start);
     // Check for high surrogate
-    if (codeUnit >= 0xD800 &&
-        codeUnit <= 0xDBFF &&
-        start + 1 < text.length) {
+    if (codeUnit >= 0xD800 && codeUnit <= 0xDBFF && start + 1 < text.length) {
       final low = text.codeUnitAt(start + 1);
       // Check for low surrogate
       if (low >= 0xDC00 && low <= 0xDFFF) {
@@ -183,9 +181,7 @@ class UnigramAlgorithm implements TokenizationAlgorithm {
 
       if (matches.isNotEmpty) {
         // Use the longest match
-        final best = matches.reduce(
-          (a, b) => a.end > b.end ? a : b,
-        );
+        final best = matches.reduce((a, b) => a.end > b.end ? a : b);
         tokens.add(best.tokenId);
         i = best.end;
       } else if (byteFallback && vocab.hasByteFallback) {

lib/src/sentencepiece/model/protobuf_reader.dart

Lines changed: 4 additions & 2 deletions
@@ -70,7 +70,8 @@ class ProtobufReader {
       throw StateError('Unexpected end of data while reading fixed32');
     }
 
-    final value = _data[_position] |
+    final value =
+        _data[_position] |
         (_data[_position + 1] << 8) |
         (_data[_position + 2] << 16) |
         (_data[_position + 3] << 24);
@@ -116,7 +117,8 @@ class ProtobufReader {
     final length = readVarint();
     if (_position + length > _data.length) {
       throw StateError(
-          'Unexpected end of data while reading bytes (need $length, have ${_data.length - _position})');
+        'Unexpected end of data while reading bytes (need $length, have ${_data.length - _position})',
+      );
     }
 
     final bytes = Uint8List.sublistView(_data, _position, _position + length);

lib/src/sentencepiece/normalizer/sp_normalizer.dart

Lines changed: 5 additions & 2 deletions
@@ -33,7 +33,9 @@ class SpNormalizer {
     }
 
     // Step 2: Add dummy prefix (space at beginning)
-    if (addDummyPrefix && result.isNotEmpty && !_isWhitespace(result.codeUnitAt(0))) {
+    if (addDummyPrefix &&
+        result.isNotEmpty &&
+        !_isWhitespace(result.codeUnitAt(0))) {
       result = ' $result';
     }
 
@@ -105,7 +107,8 @@ class SpNormalizer {
   }
 
   @override
-  String toString() => 'SpNormalizer('
+  String toString() =>
+      'SpNormalizer('
       'addDummyPrefix: $addDummyPrefix, '
       'removeExtraWhitespaces: $removeExtraWhitespaces, '
       'escapeWhitespaces: $escapeWhitespaces)';

0 commit comments
