Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 10be1c6

Browse files
authoredOct 24, 2024··
VSCODE-639: Replace regex fragment matching with streaming KMP (#837)
* Replace regex fragment matching with streaming KMP * Simplify content capturing * Add an explanation for FragmentMatcher * Handle multiple code blocks per fragment
1 parent cace4df commit 10be1c6

File tree

2 files changed

+246
-70
lines changed

2 files changed

+246
-70
lines changed
 

‎src/participant/streamParsing.ts

Lines changed: 170 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,166 @@
1-
function escapeRegex(str: string): string {
2-
return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
1+
// This is a stateful streaming implementation of the Knuth-Morris-Pratt algorithm
2+
// for substring search. It supports being invoked with multiple fragments of the
3+
// haystack and is capable of finding matches spanning multiple fragments.
4+
class StreamingKMP {
5+
public needle: string;
6+
private _lookupVector: number[];
7+
8+
// In cases where we are fed a string that has a suffix that matches a prefix
9+
// of the needle, we're storing the index in the needle which we last matched.
10+
// Then when we get a new haystack, we start matching from that needle.
11+
private _lastMatchingIndex = 0;
12+
13+
constructor(needle: string) {
14+
this.needle = needle;
15+
this._lookupVector = this._createLookupVector();
16+
}
17+
18+
private _createLookupVector(): number[] {
19+
const vector = new Array<number>(this.needle.length);
20+
let j = 0;
21+
vector[0] = 0;
22+
23+
for (let i = 1; i < this.needle.length; i++) {
24+
while (j > 0 && this.needle[i] !== this.needle[j]) {
25+
j = vector[j - 1];
26+
}
27+
28+
if (this.needle[i] === this.needle[j]) {
29+
j++;
30+
}
31+
32+
vector[i] = j;
33+
}
34+
35+
return vector;
36+
}
37+
38+
// Returns the index in the haystackFragment **after** the needle.
39+
// This is done because the match may have occurred over multiple fragments,
40+
// so the index of the needle start would be negative.
41+
public match(haystackFragment: string): number {
42+
let j = this._lastMatchingIndex; // index in needle
43+
let i = 0; // index in haystack
44+
45+
while (i < haystackFragment.length) {
46+
if (haystackFragment[i] === this.needle[j]) {
47+
i++;
48+
j++;
49+
}
50+
51+
if (j === this.needle.length) {
52+
this._lastMatchingIndex = 0;
53+
return i;
54+
}
55+
56+
if (
57+
i < haystackFragment.length &&
58+
haystackFragment[i] !== this.needle[j]
59+
) {
60+
if (j !== 0) {
61+
j = this._lookupVector[j - 1];
62+
} else {
63+
i++;
64+
}
65+
}
66+
}
67+
68+
this._lastMatchingIndex = j;
69+
return -1;
70+
}
71+
72+
public reset(): void {
73+
this._lastMatchingIndex = 0;
74+
}
75+
}
76+
77+
// This class is essentially a state machine that processes a stream of text fragments
78+
// and emitting a callback with the content between each start and end identifier. The
79+
// two states we have are:
80+
// 1. "waiting for start identifier" - `_matchedContent === undefined`
81+
// 2. "waiting for end identifier" - `_matchedContent !== undefined`
82+
// with the state transitioning from one to the other when the corresponding identifier
83+
// is matched in the fragment stream.
84+
class FragmentMatcher {
85+
private _startMatcher: StreamingKMP;
86+
private _endMatcher: StreamingKMP;
87+
private _matchedContent?: string;
88+
private _onContentMatched: (content: string) => void;
89+
private _onFragmentProcessed: (content: string) => void;
90+
91+
constructor({
92+
identifier,
93+
onContentMatched,
94+
onFragmentProcessed,
95+
}: {
96+
identifier: {
97+
start: string;
98+
end: string;
99+
};
100+
onContentMatched: (content: string) => void;
101+
onFragmentProcessed: (content: string) => void;
102+
}) {
103+
this._startMatcher = new StreamingKMP(identifier.start);
104+
this._endMatcher = new StreamingKMP(identifier.end);
105+
this._onContentMatched = onContentMatched;
106+
this._onFragmentProcessed = onFragmentProcessed;
107+
}
108+
109+
private _contentMatched(): void {
110+
const content = this._matchedContent;
111+
if (content !== undefined) {
112+
// Strip the trailing end identifier from the matched content
113+
this._onContentMatched(
114+
content.slice(0, content.length - this._endMatcher.needle.length)
115+
);
116+
}
117+
118+
this._matchedContent = undefined;
119+
this._startMatcher.reset();
120+
this._endMatcher.reset();
121+
}
122+
123+
// This needs to be invoked every time before we call `process` recursively or when `process`
124+
// completes processing the fragment. It will emit a notification to subscribers with the partial
125+
// fragment we've processed, regardless of whether there's a match or not.
126+
private _partialFragmentProcessed(
127+
fragment: string,
128+
index: number | undefined = undefined
129+
): void {
130+
this._onFragmentProcessed(
131+
index === undefined ? fragment : fragment.slice(0, index)
132+
);
133+
}
134+
135+
public process(fragment: string): void {
136+
if (this._matchedContent === undefined) {
137+
// We haven't matched the start identifier yet, so try and do that
138+
const startIndex = this._startMatcher.match(fragment);
139+
if (startIndex !== -1) {
140+
// We found a match for the start identifier - update `_matchedContent` to an empty string
141+
// and recursively call `process` with the remainder of the fragment.
142+
this._matchedContent = '';
143+
this._partialFragmentProcessed(fragment, startIndex);
144+
this.process(fragment.slice(startIndex));
145+
} else {
146+
this._partialFragmentProcessed(fragment);
147+
}
148+
} else {
149+
const endIndex = this._endMatcher.match(fragment);
150+
if (endIndex !== -1) {
151+
// We've matched the end - emit the matched content and continue processing the partial fragment
152+
this._matchedContent += fragment.slice(0, endIndex);
153+
this._partialFragmentProcessed(fragment, endIndex);
154+
this._contentMatched();
155+
this.process(fragment.slice(endIndex));
156+
} else {
157+
// We haven't matched the end yet - append the fragment to the matched content and wait
158+
// for a future fragment to contain the end identifier.
159+
this._matchedContent += fragment;
160+
this._partialFragmentProcessed(fragment);
161+
}
162+
}
163+
}
3164
}
4165

5166
/**
@@ -22,74 +183,13 @@ export async function processStreamWithIdentifiers({
22183
end: string;
23184
};
24185
}): Promise<void> {
25-
const escapedIdentifierStart = escapeRegex(identifier.start);
26-
const escapedIdentifierEnd = escapeRegex(identifier.end);
27-
const regex = new RegExp(
28-
`${escapedIdentifierStart}([\\s\\S]*?)${escapedIdentifierEnd}`,
29-
'g'
30-
);
31-
32-
let contentSinceLastIdentifier = '';
33-
for await (const fragment of inputIterable) {
34-
contentSinceLastIdentifier += fragment;
186+
const fragmentMatcher = new FragmentMatcher({
187+
identifier,
188+
onContentMatched: onStreamIdentifier,
189+
onFragmentProcessed: processStreamFragment,
190+
});
35191

36-
let lastIndex = 0;
37-
let match: RegExpExecArray | null;
38-
while ((match = regex.exec(contentSinceLastIdentifier)) !== null) {
39-
const endIndex = regex.lastIndex;
40-
41-
// Stream content up to the end of the identifier.
42-
const contentToStream = contentSinceLastIdentifier.slice(
43-
lastIndex,
44-
endIndex
45-
);
46-
processStreamFragment(contentToStream);
47-
48-
const identifierContent = match[1];
49-
onStreamIdentifier(identifierContent);
50-
51-
lastIndex = endIndex;
52-
}
53-
54-
if (lastIndex > 0) {
55-
// Remove all of the processed content.
56-
contentSinceLastIdentifier = contentSinceLastIdentifier.slice(lastIndex);
57-
// Reset the regex.
58-
regex.lastIndex = 0;
59-
} else {
60-
// Clear as much of the content as we can safely.
61-
const maxUnprocessedLength = identifier.start.length - 1;
62-
if (contentSinceLastIdentifier.length > maxUnprocessedLength) {
63-
const identifierIndex = contentSinceLastIdentifier.indexOf(
64-
identifier.start
65-
);
66-
if (identifierIndex > -1) {
67-
// We have an identifier, so clear up until the identifier.
68-
const contentToStream = contentSinceLastIdentifier.slice(
69-
0,
70-
identifierIndex
71-
);
72-
processStreamFragment(contentToStream);
73-
contentSinceLastIdentifier =
74-
contentSinceLastIdentifier.slice(identifierIndex);
75-
} else {
76-
// No identifier, so clear up until the last maxUnprocessedLength.
77-
const processUpTo =
78-
contentSinceLastIdentifier.length - maxUnprocessedLength;
79-
const contentToStream = contentSinceLastIdentifier.slice(
80-
0,
81-
processUpTo
82-
);
83-
processStreamFragment(contentToStream);
84-
contentSinceLastIdentifier =
85-
contentSinceLastIdentifier.slice(processUpTo);
86-
}
87-
}
88-
}
89-
}
90-
91-
// Finish up anything not streamed yet.
92-
if (contentSinceLastIdentifier.length > 0) {
93-
processStreamFragment(contentSinceLastIdentifier);
192+
for await (const fragment of inputIterable) {
193+
fragmentMatcher.process(fragment);
94194
}
95195
}

‎src/test/suite/participant/streamParsing.test.ts

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,4 +216,80 @@ suite('processStreamWithIdentifiers', () => {
216216
expect(fragmentsProcessed.join('')).to.deep.equal(inputFragments.join(''));
217217
expect(identifiersStreamed).to.deep.equal(['\ncode1\n', '\ncode2\n']);
218218
});
219+
220+
test('one fragment containing multiple code blocks emits event in correct order', async () => {
221+
// In case we have one fragment containing multiple code blocks, we want to make sure that
222+
// fragment notifications and identifier notifications arrive in the right order so that we're
223+
// adding code actions after the correct subfragment.
224+
// For example:
225+
// 'Text before code.\n```js\ncode1\n```\nText between code.\n```js\ncode2\n```\nText after code.'
226+
//
227+
// should emit:
228+
//
229+
// processStreamFragment: 'Text before code.\n```js\ncode1\n```'
230+
// onStreamIdentifier: '\ncode1\n'
231+
// processStreamFragment: '\nText between code.\n```js\ncode2\n```'
232+
// onStreamIdentifier: '\ncode2\n'
233+
// processStreamFragment: '\nText after code.'
234+
//
235+
// in that order to ensure we add each code action immediately after the code block
236+
// rather than add both at the end.
237+
238+
const inputFragments = [
239+
'Text before code.\n```js\ncode1\n```\nText between code.\n```js\ncode2\n```\nText after code.',
240+
];
241+
242+
const inputIterable = asyncIterableFromArray<string>(inputFragments);
243+
const identifier = { start: '```js', end: '```' };
244+
245+
const fragmentsEmitted: {
246+
source: 'processStreamFragment' | 'onStreamIdentifier';
247+
content: string;
248+
}[] = [];
249+
250+
const getFragmentHandler = (
251+
source: 'processStreamFragment' | 'onStreamIdentifier'
252+
): ((fragment: string) => void) => {
253+
return (fragment: string): void => {
254+
// It's an implementation detail, but the way the code is structured today, we're splitting the emitted fragments
255+
// whenever we find either a start or end identifier. This is irrelevant as long as we're emitting the entirety of
256+
// the text until the end of the code block in `processStreamFragment` and then the code block itself in `onStreamIdentifier`.
257+
// With the code below, we're combining all subfragments with the same source to make the test verify the desired
258+
// behavior rather than the actual implementation.
259+
const lastFragment = fragmentsEmitted[fragmentsEmitted.length - 1];
260+
if (lastFragment?.source === source) {
261+
lastFragment.content += fragment;
262+
} else {
263+
fragmentsEmitted.push({ source, content: fragment });
264+
}
265+
};
266+
};
267+
268+
await processStreamWithIdentifiers({
269+
processStreamFragment: getFragmentHandler('processStreamFragment'),
270+
onStreamIdentifier: getFragmentHandler('onStreamIdentifier'),
271+
inputIterable,
272+
identifier,
273+
});
274+
275+
expect(fragmentsEmitted).to.have.length(5);
276+
expect(fragmentsEmitted[0].source).to.equal('processStreamFragment');
277+
expect(fragmentsEmitted[0].content).to.equal(
278+
'Text before code.\n```js\ncode1\n```'
279+
);
280+
281+
expect(fragmentsEmitted[1].source).to.equal('onStreamIdentifier');
282+
expect(fragmentsEmitted[1].content).to.equal('\ncode1\n');
283+
284+
expect(fragmentsEmitted[2].source).to.equal('processStreamFragment');
285+
expect(fragmentsEmitted[2].content).to.equal(
286+
'\nText between code.\n```js\ncode2\n```'
287+
);
288+
289+
expect(fragmentsEmitted[3].source).to.equal('onStreamIdentifier');
290+
expect(fragmentsEmitted[3].content).to.equal('\ncode2\n');
291+
292+
expect(fragmentsEmitted[4].source).to.equal('processStreamFragment');
293+
expect(fragmentsEmitted[4].content).to.equal('\nText after code.');
294+
});
219295
});

0 commit comments

Comments
 (0)
Please sign in to comment.