-
Notifications
You must be signed in to change notification settings - Fork 13.4k
feat(message-parser): PoC O(N) block splitter + benchmark baseline #39380
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Harshit2405-2004
wants to merge
18
commits into
RocketChat:develop
Choose a base branch
from
Harshit2405-2004:feat/message-parser-poc
base: develop
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+309
−0
Open
Changes from 13 commits
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
d79d30b
test(message-parser): add benchmark suite to baseline Peggy performance
Harshit2405-2004 ffd0278
feat(message-parser): implement BlockSplitter PoC (Layer 1)
Harshit2405-2004 0c7daa2
test(message-parser): add skip-flag regression tests to document comp…
Harshit2405-2004 8ae0719
test(message-parser): expand benchmark suite with mixed and pathologi…
Harshit2405-2004 04c8fcd
refactor(message-parser): fix list segmentation and preserve syntax i…
Harshit2405-2004 25a81ec
refactor(message-parser): address refined review comments for BlockSp…
Harshit2405-2004 796b681
Merge branch 'develop' into refactor/message-parser-block-splitter
Harshit2405-2004 08d1dc4
refactor(message-parser): final verified fixes for BlockSplitter revi…
Harshit2405-2004 7874fff
refactor(message-parser): address all review comments for BlockSplitt…
Harshit2405-2004 2f80b11
Update packages/message-parser/benchmarks/parser.bench.ts
Harshit2405-2004 4bee6db
Merge branch 'develop' into test/message-parser-benchmark-suite
Harshit2405-2004 1bccb50
refactor(message-parser): handle mixed list ordering and restore regr…
Harshit2405-2004 3b3722b
chore: consolidate parser PoC
Harshit2405-2004 a76d0f5
Fix: Address cubic and coderabbit parser observations
Harshit2405-2004 1ae425c
Update packages/message-parser/tests/skip-flags-regression.spec.ts
Harshit2405-2004 1923a6a
Update packages/message-parser/tests/skip-flags-regression.spec.ts
Harshit2405-2004 81745c2
Merge branch 'develop' into feat/message-parser-poc
Harshit2405-2004 e753deb
Merge branch 'develop' into feat/message-parser-poc
Harshit2405-2004 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| --- | ||
| "@rocket.chat/message-parser": patch | ||
| --- | ||
|
|
||
| feat(message-parser): implement BlockSplitter PoC (Layer 1) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
/** Kinds of top-level block that `BlockSplitter.split` can emit. */
export enum BlockType {
  PARAGRAPH = 'PARAGRAPH',
  HEADING = 'HEADING',
  CODE = 'CODE',
  LIST = 'LIST',
  QUOTE = 'QUOTE',
}

/**
 * A contiguous run of input lines classified as a single block.
 *
 * - `content`: for headings, the text after the `#` markers; for code blocks,
 *   the lines between the fences; for lists, quotes and paragraphs, the raw
 *   lines (markers included) joined with `\n`.
 * - `level`: headings only; the number of leading `#` characters (1-6).
 * - `language`: code blocks only; the trimmed text following the opening
 *   fence (an empty string when none was given).
 * - `ordered`: lists only; `true`/`false` while every item agrees, and
 *   `undefined` once ordered and unordered items have been mixed.
 */
export type Block = {
  type: BlockType;
  content: string;
  level?: number;
  language?: string;
  ordered?: boolean;
};

/**
 * Line-oriented splitter that groups raw message text into coarse blocks
 * (heading, fenced code, list, quote, paragraph) in a single forward pass.
 */
export class BlockSplitter {
  // Patterns are hoisted so they are built once rather than on every line.
  private static readonly FENCE = '```';

  private static readonly LINE_BREAK = /\r?\n/;

  private static readonly HEADING = /^(#{1,6})\s+(.+)$/;

  private static readonly LIST_ITEM = /^(\s*)([-*+]|\d+\.)\s+(.+)$/;

  private static readonly ORDERED_MARKER = /^\d+\./;

  private static readonly LEADING_WHITESPACE = /^\s+/;

  /**
   * Splits `input` into blocks.
   *
   * Lines are classified in this priority order: heading, fenced code,
   * list item, indented continuation of a list, quote, blank line
   * (terminates the open block), paragraph text.
   *
   * A line starting with a fence only opens a code block when a later line
   * also starts with a fence. An unclosed fence is treated as ordinary text so
   * that it cannot swallow the rest of the message.
   *
   * @param input Raw text; both `\n` and `\r\n` line endings are accepted.
   * @returns Blocks in source order. Empty or blank-only input yields `[]`.
   */
  public static split(input: string): Block[] {
    const lines = input.split(BlockSplitter.LINE_BREAK);
    const blocks: Block[] = [];
    let currentBlock: Block | null = null;

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];

      const headingMatch = BlockSplitter.HEADING.exec(line);
      if (headingMatch) {
        // Headings are always single-line: close whatever was open and emit.
        BlockSplitter.flush(blocks, currentBlock);
        currentBlock = null;
        blocks.push({
          type: BlockType.HEADING,
          content: headingMatch[2],
          level: headingMatch[1].length,
        });
        continue;
      }

      if (line.startsWith(BlockSplitter.FENCE)) {
        const closingIndex = BlockSplitter.findClosingFence(lines, i + 1);
        if (closingIndex !== -1) {
          BlockSplitter.flush(blocks, currentBlock);
          currentBlock = null;
          blocks.push({
            type: BlockType.CODE,
            content: lines.slice(i + 1, closingIndex).join('\n'),
            language: line.slice(BlockSplitter.FENCE.length).trim(),
          });
          // Resume after the closing fence (the loop increment steps past it).
          i = closingIndex;
          continue;
        }
        // No closing fence: fall through and classify the line as plain text.
      }

      const listMatch = BlockSplitter.LIST_ITEM.exec(line);
      if (listMatch) {
        const isOrdered = BlockSplitter.ORDERED_MARKER.test(listMatch[2]);
        if (currentBlock?.type !== BlockType.LIST) {
          BlockSplitter.flush(blocks, currentBlock);
          currentBlock = {
            type: BlockType.LIST,
            content: line,
            ordered: isOrdered,
          };
        } else {
          // Mixed ordered/unordered items: the flag no longer has one answer.
          if (currentBlock.ordered !== undefined && currentBlock.ordered !== isOrdered) {
            currentBlock.ordered = undefined;
          }
          currentBlock.content += `\n${line}`;
        }
        continue;
      }

      // Indented (including whitespace-only) lines continue an open list.
      if (currentBlock?.type === BlockType.LIST && BlockSplitter.LEADING_WHITESPACE.test(line)) {
        currentBlock.content += `\n${line}`;
        continue;
      }

      if (line.startsWith('>')) {
        if (currentBlock?.type !== BlockType.QUOTE) {
          BlockSplitter.flush(blocks, currentBlock);
          currentBlock = {
            type: BlockType.QUOTE,
            content: line,
          };
        } else {
          currentBlock.content += `\n${line}`;
        }
        continue;
      }

      if (line.trim() === '') {
        // A blank line ends the open block and is not emitted itself.
        BlockSplitter.flush(blocks, currentBlock);
        currentBlock = null;
        continue;
      }

      if (currentBlock?.type !== BlockType.PARAGRAPH) {
        BlockSplitter.flush(blocks, currentBlock);
        currentBlock = {
          type: BlockType.PARAGRAPH,
          content: line,
        };
      } else {
        currentBlock.content += `\n${line}`;
      }
    }

    BlockSplitter.flush(blocks, currentBlock);
    return blocks;
  }

  /**
   * Returns the index of the first line at or after `from` that starts with a
   * fence, or `-1` when there is none.
   *
   * A failed scan proves no later line starts with a fence, so it can happen
   * at most once per `split` call; successful scans only cover lines that are
   * then skipped. The overall pass therefore stays linear in the line count.
   */
  private static findClosingFence(lines: readonly string[], from: number): number {
    for (let i = from; i < lines.length; i++) {
      if (lines[i].startsWith(BlockSplitter.FENCE)) {
        return i;
      }
    }
    return -1;
  }

  /** Appends `block` to `blocks` when there is an open block to emit. */
  private static flush(blocks: Block[], block: Block | null): void {
    if (block) {
      blocks.push(block);
    }
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| import { BlockSplitter, BlockType } from '../src/BlockSplitter'; | ||
|
|
||
describe('BlockSplitter', () => {
  // Thin wrapper so each case reads as "input text -> resulting blocks".
  const splitText = (text: string) => BlockSplitter.split(text);

  it('should split simple paragraphs', () => {
    // Adjacent non-blank lines belong to one paragraph.
    const result = splitText('Hello\nWorld');
    expect(result).toHaveLength(1);
    expect(result[0]).toMatchObject({ type: BlockType.PARAGRAPH, content: 'Hello\nWorld' });
  });

  it('should identify headings', () => {
    // Each heading line is its own block; the trailing text is the third.
    const result = splitText('# Heading 1\n## Heading 2\nContent');
    expect(result).toHaveLength(3);
    const [first, second] = result;
    expect(first.type).toBe(BlockType.HEADING);
    expect(first.level).toBe(1);
    expect(second.type).toBe(BlockType.HEADING);
    expect(second.level).toBe(2);
  });

  it('should identify code blocks', () => {
    // Fence lines are dropped; only the inner lines become the content.
    const result = splitText('Pre\n```javascript\nconst a = 1;\n```\nPost');
    expect(result).toHaveLength(3);
    const fenced = result[1];
    expect(fenced.type).toBe(BlockType.CODE);
    expect(fenced.language).toBe('javascript');
    expect(fenced.content).toBe('const a = 1;');
  });

  it('should handle list splitting and preserve full syntax', () => {
    // Different markers still form one list, with markers kept verbatim.
    const source = '- item 1\n* item 2\n1. item 3';
    const result = splitText(source);
    expect(result).toHaveLength(1);
    expect(result[0].type).toBe(BlockType.LIST);
    expect(result[0].content).toBe('- item 1\n* item 2\n1. item 3');
  });

  it('should handle nested lists via indentation', () => {
    const result = splitText('- Level 1\n - Level 2\n - Level 3');
    expect(result).toHaveLength(1);
    expect(result[0].content).toBe('- Level 1\n - Level 2\n - Level 3');
  });

  it('should allow indented blank lines to continue a list', () => {
    // A whitespace-only line counts as indented, so the list stays open.
    const result = splitText('- item 1\n \n- item 2');
    expect(result).toHaveLength(1);
    expect(result[0].content).toBe('- item 1\n \n- item 2');
  });

  it('should correctly detect boundaries: list followed by heading', () => {
    const result = splitText('- list item\n\n# Heading');
    expect(result).toHaveLength(2);
    expect(result.map((block) => block.type)).toEqual([BlockType.LIST, BlockType.HEADING]);
  });

  it('should identify blockquotes and preserve markers', () => {
    const result = splitText('> quote line 1\n> quote line 2');
    expect(result).toHaveLength(1);
    expect(result[0].type).toBe(BlockType.QUOTE);
    expect(result[0].content).toBe('> quote line 1\n> quote line 2');
  });

  it('should support nested blockquotes', () => {
    const result = splitText('> outer\n>> inner');
    expect(result).toHaveLength(1);
    expect(result[0].type).toBe(BlockType.QUOTE);
    expect(result[0].content).toBe('> outer\n>> inner');
  });

  it('should set ordered to undefined for mixed ordered and unordered list items', () => {
    const result = splitText('- unordered\n1. ordered');
    expect(result).toHaveLength(1);
    expect(result[0].type).toBe(BlockType.LIST);
    expect(result[0].ordered).toBeUndefined();
  });

  it('should keep ordered=true for fully ordered lists', () => {
    const result = splitText('1. first\n2. second');
    expect(result).toHaveLength(1);
    expect(result[0].type).toBe(BlockType.LIST);
    expect(result[0].ordered).toBe(true);
  });

  it('should keep ordered=false for fully unordered lists', () => {
    const result = splitText('- first\n* second');
    expect(result).toHaveLength(1);
    expect(result[0].type).toBe(BlockType.LIST);
    expect(result[0].ordered).toBe(false);
  });

  it('should create a new paragraph block after a list block', () => {
    // The blank line closes the list before the paragraph starts.
    const result = splitText('- list item\n\nParagraph text');
    expect(result).toHaveLength(2);
    const [listBlock, paragraphBlock] = result;
    expect(listBlock.type).toBe(BlockType.LIST);
    expect(paragraphBlock.type).toBe(BlockType.PARAGRAPH);
    expect(paragraphBlock.content).toBe('Paragraph text');
  });

  it('should create a new paragraph block after a quote block', () => {
    // The blank line closes the quote before the paragraph starts.
    const result = splitText('> blockquote\n\nParagraph text');
    expect(result).toHaveLength(2);
    const [quoteBlock, paragraphBlock] = result;
    expect(quoteBlock.type).toBe(BlockType.QUOTE);
    expect(paragraphBlock.type).toBe(BlockType.PARAGRAPH);
    expect(paragraphBlock.content).toBe('Paragraph text');
  });
});
32 changes: 32 additions & 0 deletions
32
packages/message-parser/tests/skip-flags-regression.spec.ts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| import { parse } from '../src'; | ||
|
|
||
describe('Skip Flags Regression (Complexity Audit)', () => {
  /**
   * Parses `text` wrapped in `depth` asterisks on each side and returns the
   * elapsed wall-clock time in milliseconds.
   */
  const measureDepth = (depth: number) => {
    const markers = '*'.repeat(depth);
    const nested = `${markers}text${markers}`;
    const startedAt = performance.now();
    parse(nested);
    return performance.now() - startedAt;
  };

  it('should demonstrate non-linear growth with nested formatting', () => {
    const times: Record<number, number> = {};
    let depth = 1;
    while (depth <= 7) {
      times[depth] = measureDepth(depth);
      depth += 1;
    }

    // Print one row per depth so the growth curve is visible in the test log.
    const rows = Object.entries(times).map(([level, elapsed]) => ({
      'depth': level,
      'time (ms)': elapsed.toFixed(4),
    }));
    console.table(rows);

    // If d=7 takes significantly longer than linear growth from d=1
    // we have confirmed the problem.
    // The assertion only checks that the measurement ran; it is not a threshold.
    expect(times[7]).toBeDefined();
  });

  it('should handle pathological unmatched markers without crashing', () => {
    const unmatched = '*_~*_~*_~*_~*_~ hello'.repeat(5);
    const startedAt = performance.now();
    parse(unmatched);
    const elapsed = performance.now() - startedAt;
    console.log(`Pathological unmatched markers (5x): ${elapsed.toFixed(2)}ms`);
    // Should still finish within 1s
    expect(elapsed).toBeLessThan(1000);
  });
});
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.