Skip to content

Commit 5a387df

Browse files
committed
Optimized HtmlTokenizer.ReadAttributeValueQuoted() and ReadPlainText()
The ReadAttributeValueQuoted() optimization drastically improves performance for the xamarin3.xhtml benchmark.

BenchmarkDotNet v0.15.8, Windows 11 (10.0.26200.7392/25H2/2025Update/HudsonValley2)
Intel Core Ultra 7 268V 2.20GHz, 1 CPU, 8 logical and 8 physical cores
.NET SDK 10.0.101
  [Host]     : .NET 10.0.1 (10.0.1, 10.0.125.57005), X64 RyuJIT x86-64-v3
  DefaultJob : .NET 10.0.1 (10.0.1, 10.0.125.57005), X64 RyuJIT x86-64-v3

Before:

| Method | Mean | Error | StdDev |
|---------------------------- |----------:|---------:|---------:|
| HtmlKit_TextReader_Xamarin3 | 104.38 us | 1.157 us | 1.025 us |
| HtmlKit_Stream_Xamarin3 | 96.45 us | 0.654 us | 0.612 us |

After:

| Method | Mean | Error | StdDev |
|---------------------------- |---------:|---------:|---------:|
| HtmlKit_TextReader_Xamarin3 | 93.79 us | 0.885 us | 0.828 us |
| HtmlKit_Stream_Xamarin3 | 85.62 us | 0.574 us | 0.537 us |
1 parent d37c480 commit 5a387df

File tree

2 files changed

+113
-22
lines changed

2 files changed

+113
-22
lines changed

HtmlKit/CharBuffer.cs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,14 @@ public void Append (char c)
7171
buffer[Length++] = c;
7272
}
7373

74+
/// <summary>
/// Appends a range of characters from an array to the end of the buffer.
/// </summary>
/// <param name="chars">The array to copy characters from.</param>
/// <param name="index">The index of the first character in <paramref name="chars"/> to copy.</param>
/// <param name="count">The number of characters to copy.</param>
[MethodImpl (MethodImplOptions.AggressiveInlining)]
public void Append (char[] chars, int index, int count)
{
	// Request room for the incoming characters before slicing the destination
	// (presumably EnsureCapacity grows 'buffer' as needed -- confirm against its definition).
	EnsureCapacity (Length + count);

	var source = chars.AsSpan (index, count);
	var destination = buffer.AsSpan (Length);

	source.CopyTo (destination);
	Length += count;
}
81+
7482
[MethodImpl (MethodImplOptions.AggressiveInlining)]
7583
public void Append (string str)
7684
{

HtmlKit/HtmlTokenizer.cs

Lines changed: 105 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,86 @@ bool TryRead (out char c)
534534
return false;
535535
}
536536

537+
/// <summary>
/// Consumes buffered input up to and including the first character found in
/// <paramref name="specials"/>, appending the preceding run of characters to <c>data</c>.
/// </summary>
/// <remarks>
/// The terminating special character is consumed and returned via <paramref name="c"/>,
/// but it is NOT appended to <c>data</c>; the caller decides what to do with it.
/// </remarks>
/// <param name="specials">The characters that stop the scan. This set MUST contain '\n' so that line numbers are tracked.</param>
/// <param name="c">The special character that stopped the scan, or '\0' if no more input was available.</param>
/// <returns><c>true</c> if a special character was consumed; otherwise, <c>false</c>.</returns>
bool TryReadDataUntil (ReadOnlySpan<char> specials, out char c)
{
	FillBuffer ();

	while (bufferIndex < bufferEnd) {
		int left = bufferEnd - bufferIndex;

		// Note: 'specials' MUST contain '\n' for proper line number tracking...
		var span = new ReadOnlySpan<char> (buffer, bufferIndex, left);
		int count = span.IndexOfAny (specials);

		if (count == -1) {
			// No special character in the buffered input: consume all of it and try to refill.
			data.Append (buffer, bufferIndex, left);

			// None of these characters is '\n' (it is one of the specials), so each one advances the line position.
			linePosition += left;
			bufferIndex += left;
			FillBuffer ();
			continue;
		}

		if (count > 0) {
			// Consume the run of ordinary characters that precedes the special character.
			data.Append (buffer, bufferIndex, count);
			linePosition += count;
			bufferIndex += count;
		}

		c = buffer[bufferIndex++];

		if (c == '\n') {
			IncrementLineNumber ();
		} else {
			linePosition++;
		}

		return true;
	}

	c = '\0';

	return false;
}
575+
576+
/// <summary>
/// Consumes buffered input up to and including the first character found in
/// <paramref name="specials"/>, appending the preceding run of characters to both
/// <c>data</c> and <c>name</c>.
/// </summary>
/// <remarks>
/// The terminating special character is consumed and returned via <paramref name="c"/>,
/// but it is NOT appended to either buffer; the caller decides what to do with it.
/// </remarks>
/// <param name="specials">The characters that stop the scan. This set MUST contain '\n' so that line numbers are tracked.</param>
/// <param name="c">The special character that stopped the scan, or '\0' if no more input was available.</param>
/// <returns><c>true</c> if a special character was consumed; otherwise, <c>false</c>.</returns>
bool TryReadNameUntil (ReadOnlySpan<char> specials, out char c)
{
	FillBuffer ();

	while (bufferIndex < bufferEnd) {
		int left = bufferEnd - bufferIndex;

		// Note: 'specials' MUST contain '\n' for proper line number tracking...
		var span = new ReadOnlySpan<char> (buffer, bufferIndex, left);
		int count = span.IndexOfAny (specials);

		if (count == -1) {
			// No special character in the buffered input: consume all of it and try to refill.
			data.Append (buffer, bufferIndex, left);
			name.Append (buffer, bufferIndex, left);

			// None of these characters is '\n' (it is one of the specials), so each one advances the line position.
			linePosition += left;
			bufferIndex += left;
			FillBuffer ();
			continue;
		}

		if (count > 0) {
			// Consume the run of ordinary characters that precedes the special character.
			data.Append (buffer, bufferIndex, count);
			name.Append (buffer, bufferIndex, count);
			linePosition += count;
			bufferIndex += count;
		}

		c = buffer[bufferIndex++];

		if (c == '\n') {
			IncrementLineNumber ();
		} else {
			linePosition++;
		}

		return true;
	}

	c = '\0';

	return false;
}
616+
537617
bool NameIs (string value)
538618
{
539619
if (name.Length != value.Length)
@@ -802,6 +882,10 @@ HtmlToken EmitTagToken ()
802882
// 8.2.4.1 Data state
803883
HtmlToken? ReadData ()
804884
{
885+
//ReadOnlySpan<char> specials = DecodeCharacterReferences ?
886+
// stackalloc char[] { '\n', '&', '<' } :
887+
// stackalloc char[] { '\n', '<' };
888+
805889
do {
806890
if (!TryRead (out char c)) {
807891
TokenizerState = HtmlTokenizerState.EndOfFile;
@@ -838,6 +922,10 @@ HtmlToken EmitTagToken ()
838922
// 8.2.4.3 RCDATA state
839923
HtmlToken? ReadRcData ()
840924
{
925+
//ReadOnlySpan<char> specials = DecodeCharacterReferences ?
926+
// stackalloc char[] { '\0', '\n', '&', '<' } :
927+
// stackalloc char[] { '\0', '\n', '<' };
928+
841929
do {
842930
if (!TryRead (out char c)) {
843931
TokenizerState = HtmlTokenizerState.EndOfFile;
@@ -873,6 +961,8 @@ HtmlToken EmitTagToken ()
873961
// 8.2.4.5 RAWTEXT state
874962
HtmlToken? ReadRawText ()
875963
{
964+
//ReadOnlySpan<char> specials = stackalloc char[] { '\0', '\n', '<' };
965+
876966
do {
877967
if (!TryRead (out char c)) {
878968
TokenizerState = HtmlTokenizerState.EndOfFile;
@@ -895,6 +985,8 @@ HtmlToken EmitTagToken ()
895985
// 8.2.4.6 Script data state
896986
HtmlToken? ReadScriptData ()
897987
{
988+
//ReadOnlySpan<char> specials = stackalloc char[] { '\0', '\n', '<' };
989+
898990
do {
899991
if (!TryRead (out char c)) {
900992
TokenizerState = HtmlTokenizerState.EndOfFile;
@@ -917,29 +1009,16 @@ HtmlToken EmitTagToken ()
9171009
// 8.2.4.7 PLAINTEXT state
9181010
HtmlToken? ReadPlainText ()
9191011
{
920-
do {
921-
while (bufferIndex < bufferEnd) {
922-
char c = buffer[bufferIndex++];
923-
924-
linePosition++;
1012+
ReadOnlySpan<char> specials = stackalloc char[] { '\0', '\n' };
9251013

926-
switch (c) {
927-
case '\0':
928-
data.Append ('\uFFFD');
929-
break;
930-
case '\n':
931-
IncrementLineNumber ();
932-
goto default;
933-
default:
934-
data.Append (c);
935-
break;
936-
}
1014+
do {
1015+
if (!TryReadDataUntil (specials, out char c)) {
1016+
TokenizerState = HtmlTokenizerState.EndOfFile;
1017+
break;
9371018
}
9381019

939-
FillBuffer ();
940-
} while (!eof);
941-
942-
TokenizerState = HtmlTokenizerState.EndOfFile;
1020+
data.Append (c == '\0' ? '\uFFFD' : c);
1021+
} while (true);
9431022

9441023
return EmitDataToken (false, false);
9451024
}
@@ -1018,6 +1097,8 @@ HtmlToken EmitTagToken ()
10181097
// 8.2.4.10 Tag name state
10191098
HtmlToken? ReadTagName ()
10201099
{
1100+
//ReadOnlySpan<char> specials = stackalloc char[] { '\0', '\t', '\r', '\n', '\f', ' ', '/', '>' };
1101+
10211102
do {
10221103
if (!TryRead (out char c)) {
10231104
TokenizerState = HtmlTokenizerState.EndOfFile;
@@ -1719,8 +1800,10 @@ HtmlToken EmitTagToken ()
17191800
// 8.2.4.38 Attribute value (double-quoted) state
17201801
HtmlToken? ReadAttributeValueQuoted ()
17211802
{
1803+
ReadOnlySpan<char> specials = stackalloc char[] { '\0', '\n', '&', quote };
1804+
17221805
do {
1723-
if (!TryRead (out char c)) {
1806+
if (!TryReadNameUntil (specials, out char c)) {
17241807
TokenizerState = HtmlTokenizerState.EndOfFile;
17251808
name.Length = 0;
17261809

@@ -1737,7 +1820,7 @@ HtmlToken EmitTagToken ()
17371820
case '\0':
17381821
name.Append ('\uFFFD');
17391822
break;
1740-
default:
1823+
default: // quote or '\n'
17411824
if (c == quote) {
17421825
TokenizerState = HtmlTokenizerState.AfterAttributeValueQuoted;
17431826
quote = '\0';

0 commit comments

Comments
 (0)