Skip to content

Commit 3057be5

Browse files
jitsedesmetclaude
andauthored
Update static 1.2 spec - rdf-tests 348 (#150)
* init new tests * fix(sparql-1-2): single-pass UCHAR+ECHAR decoding in string literals Replace the two-pass approach (decode UCHAR then ECHAR separately) with a single-pass regex that handles both in one scan. The SPARQL 1.2 spec states "the character resulting from the codepoint escape sequence is not further interpreted", so a backslash produced by \u005C must not be re-processed as an ECHAR prefix. Remove codepoint-esc-05-bad and codepoint-esc-06-bad negative tests that were testing the incorrect two-pass behavior. Update AST snapshots and generated SPARQL files for codepoint-esc-05 through codepoint-esc-09. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * blep * cleanup * no breaking changes --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent a091227 commit 3057be5

25 files changed

Lines changed: 478 additions & 193 deletions

packages/rules-sparql-1-2/lib/grammar.ts

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import { traqulaIndentation } from '@traqula/core';
99
import { CommonIRIs, funcExpr1, funcExpr3, gram as S11, lex as l11 } from '@traqula/rules-sparql-1-1';
1010
import type * as T11 from '@traqula/rules-sparql-1-1';
1111
import * as l12 from './lexer.js';
12+
import { decodeUchar } from './parserUtils.js';
1213
import type { SparqlGeneratorRule, SparqlGrammarRule, SparqlRule } from './sparql12HelperTypes.js';
1314
import type {
1415
Annotation,
@@ -739,9 +740,9 @@ export const rdfLiteral: SparqlGrammarRule<'rdfLiteral', RuleDefReturn<typeof S1
739740
* [[155]](https://www.w3.org/TR/sparql12-query/#rString)
740741
*
741742
* Uses the SPARQL 1.2 string tokens (which include UCHAR in their patterns).
742-
* Applies two-pass decoding per the SPARQL 1.2 spec: UCHAR first (via
743-
* {@link SparqlContext.codepointEscape}), then ECHAR. This order ensures that
744-
* `\\u0041` yields `\A` (→ invalid ECHAR) rather than `\u0041` (single-pass result).
743+
* Applies single-pass decoding of UCHAR and ECHAR sequences so that a backslash
744+
* produced by a UCHAR (e.g. \u005C → \) is never re-interpreted as an ECHAR prefix.
745+
* Per the SPARQL 1.2 spec examples: `\u005Cn` yields the two characters `\n` (backslash, n), not a newline.
745746
*/
746747
export const string: SparqlGrammarRule<'string', T11.TermLiteralStr> = {
747748
name: 'string',
@@ -767,9 +768,8 @@ export const string: SparqlGrammarRule<'string', T11.TermLiteralStr> = {
767768
return ACTION(() => {
768769
const [ token, raw ] = tuple;
769770
const F = C.astFactory;
770-
// Pass 1: Decode all UCHAR escape sequences (and reject surrogate code points).
771-
const afterUchar = C.codepointEscape(raw);
772-
// Pass 2: Validate and decode ECHAR sequences in the UCHAR-decoded string.
771+
// Single-pass: decode UCHAR and ECHAR together so a backslash from a UCHAR
772+
// is never re-processed as an ECHAR prefix.
773773
const ecmap: Record<string, string> = {
774774
t: '\t',
775775
n: '\n',
@@ -780,15 +780,15 @@ export const string: SparqlGrammarRule<'string', T11.TermLiteralStr> = {
780780
'\'': '\'',
781781
'\\': '\\',
782782
};
783-
const value = afterUchar.replaceAll(/\\(.?)/gsu, (_, char: string) => {
784-
if (!char) {
785-
throw new Error(`String literal ends with an unpaired backslash`);
786-
}
787-
if (!(char in ecmap)) {
788-
throw new Error(`Invalid escape sequence \\${char} in string literal`);
789-
}
790-
return ecmap[char];
791-
});
783+
const value = raw.replaceAll(
784+
/\\u([\dA-Fa-f]{4})|\\U([\dA-Fa-f]{8})|\\(.)/gsu,
785+
(_, u4: string | undefined, u8: string | undefined, echar: string | undefined) => {
786+
if (u4 !== undefined || u8 !== undefined) {
787+
return decodeUchar((u4 ?? u8)!);
788+
}
789+
return ecmap[echar!];
790+
},
791+
);
792792
// Catch literal surrogate code units embedded directly in the query (not via \uXXXX).
793793
if (/[\uD800-\uDBFF](?:[^\uDC00-\uDFFF]|$)/u.test(value)) {
794794
throw new Error(`Invalid unicode codepoint of surrogate pair without corresponding codepoint`);
@@ -811,7 +811,12 @@ export const iriFull: SparqlGrammarRule<'iriFull', T11.TermIriFull> = {
811811
const iriToken = CONSUME(l12.iriRef);
812812
return ACTION(() => {
813813
const raw = iriToken.image.slice(1, -1);
814-
return C.astFactory.termNamed(C.astFactory.sourceLocation(iriToken), C.codepointEscape(raw));
814+
return C.astFactory.termNamed(
815+
C.astFactory.sourceLocation(iriToken),
816+
// TODO: in the next major release, replace this call with the implementation of codepointEscape.
817+
// The function no longer serves the intended purpose since it is not reusable for `string`.
818+
C.codepointEscape(raw),
819+
);
815820
});
816821
},
817822
};

packages/rules-sparql-1-2/lib/parserUtils.ts

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,24 @@ import type { Sparql12Nodes } from './sparql12Types.js';
99
*
1010
* Unlike the SPARQL 1.1 variant, this function rejects surrogate code points (U+D800–U+DFFF)
1111
* even when they would form a valid surrogate pair.
12+
* @deprecated Will be removed in the next major release in favor of the less use-case-dependent {@link decodeUchar}.
1213
*/
1314
export function sparql12CodepointEscape(input: string): string {
1415
return input.replaceAll(
1516
/\\u([0-9a-fA-F]{4})|\\U([0-9a-fA-F]{8})/gu,
16-
(_, unicode4: string | undefined, unicode8: string | undefined) => {
17-
const hex = (unicode4 ?? unicode8)!;
18-
const codePoint = Number.parseInt(hex, 16);
19-
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
20-
throw new Error(`Illegal codepoint escape: surrogate code point U+${hex.toUpperCase()}`);
21-
}
22-
return String.fromCodePoint(codePoint);
23-
},
17+
(_, unicode4: string | undefined, unicode8: string | undefined) =>
18+
decodeUchar((unicode4 ?? unicode8)!),
2419
);
2520
}
2621

22+
export function decodeUchar(hex: string): string {
23+
const codePoint = Number.parseInt(hex, 16);
24+
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
25+
throw new Error(`Illegal codepoint escape: surrogate code point U+${hex.toUpperCase()}`);
26+
}
27+
return String.fromCodePoint(codePoint);
28+
}
29+
2730
export function completeParseContext(
2831
context: Partial<SparqlContext>,
2932
): SparqlContext {
@@ -33,6 +36,9 @@ export function completeParseContext(
3336
prefixes: Object.assign(Object.create(null), context.prefixes),
3437
parseMode: context.parseMode ? new Set(context.parseMode) : new Set([ 'canParseVars', 'canCreateBlankNodes' ]),
3538
skipValidation: context.skipValidation ?? false,
39+
/**
40+
* @deprecated since it cannot be used for string decoding.
41+
*/
3642
codepointEscape: context.codepointEscape ?? sparql12CodepointEscape,
3743
};
3844
}

packages/rules-sparql-1-2/lib/sparql12HelperTypes.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ export type SparqlContext = T11.SparqlContext & {
5757
/**
5858
* Function that decodes UCHAR codepoint escapes (\\uXXXX / \\UXXXXXXXX) within a string.
5959
* In SPARQL 1.2 this is applied per-rule rather than as a query pre-processor.
60+
* @deprecated No longer used for string literals, since it did not properly implement their decoding.
6061
*/
6162
codepointEscape: (input: string) => string;
6263
};

packages/test-utils/statics/ast/ast-source-tracked/sparql-1-2/codepoint-esc-05.json

Lines changed: 32 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -7,59 +7,44 @@
77
"patterns": [
88
{
99
"type": "pattern",
10-
"subType": "bgp",
11-
"triples": [
10+
"subType": "values",
11+
"variables": [
1212
{
13-
"type": "triple",
14-
"subject": {
13+
"type": "term",
14+
"subType": "variable",
15+
"value": "v",
16+
"loc": {
17+
"sourceLocationType": "source",
18+
"start": 21,
19+
"end": 23
20+
}
21+
}
22+
],
23+
"values": [
24+
{
25+
"v": {
1526
"type": "term",
1627
"subType": "namedNode",
17-
"value": "http://example/abc",
18-
"loc": {
19-
"sourceLocationType": "source",
20-
"start": 53,
21-
"end": 92
22-
}
23-
},
24-
"predicate": {
25-
"type": "term",
26-
"subType": "variable",
27-
"value": "p",
28-
"loc": {
29-
"sourceLocationType": "source",
30-
"start": 93,
31-
"end": 95
32-
}
33-
},
34-
"object": {
35-
"type": "term",
36-
"subType": "variable",
37-
"value": "o",
28+
"value": "http://example/abc👾",
3829
"loc": {
3930
"sourceLocationType": "source",
40-
"start": 96,
41-
"end": 98
31+
"start": 42,
32+
"end": 91
4233
}
43-
},
44-
"annotations": [],
45-
"loc": {
46-
"sourceLocationType": "source",
47-
"start": 53,
48-
"end": 98
4934
}
5035
}
5136
],
5237
"loc": {
5338
"sourceLocationType": "source",
54-
"start": 53,
55-
"end": 100
39+
"start": 14,
40+
"end": 95
5641
}
5742
}
5843
],
5944
"loc": {
6045
"sourceLocationType": "source",
61-
"start": 39,
62-
"end": 102
46+
"start": 10,
47+
"end": 97
6348
}
6449
},
6550
"solutionModifiers": {},
@@ -72,26 +57,28 @@
7257
},
7358
"variables": [
7459
{
75-
"type": "wildcard",
60+
"type": "term",
61+
"subType": "variable",
62+
"value": "v",
7663
"loc": {
7764
"sourceLocationType": "source",
78-
"start": 37,
79-
"end": 38
65+
"start": 7,
66+
"end": 9
8067
}
8168
}
8269
],
8370
"loc": {
8471
"sourceLocationType": "inlinedSource",
85-
"newSource": "# Codepoint escape in an IRI\n\nSELECT * {\n ## /abc\n <http://example/\\u0061\\U00000062\\u0063> ?p ?o .\n}\n",
72+
"newSource": "SELECT ?v {\n VALUES ?v {\n # abc👾\n <http://example/\\u0061\\U00000062\\u0063\\U0001F47E>\n }\n}\n",
8673
"start": 0,
8774
"end": 9007199254740991,
8875
"loc": {
8976
"sourceLocationType": "source",
90-
"start": 30,
91-
"end": 102
77+
"start": 0,
78+
"end": 97
9279
},
93-
"startOnNew": 30,
94-
"endOnNew": 102
80+
"startOnNew": 0,
81+
"endOnNew": 97
9582
},
9683
"type": "query"
9784
}

packages/test-utils/statics/ast/ast-source-tracked/sparql-1-2/codepoint-esc-06.json

Lines changed: 32 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -7,59 +7,44 @@
77
"patterns": [
88
{
99
"type": "pattern",
10-
"subType": "bgp",
11-
"triples": [
10+
"subType": "values",
11+
"variables": [
1212
{
13-
"type": "triple",
14-
"subject": {
15-
"type": "term",
16-
"subType": "namedNode",
17-
"value": "http://example/abc",
18-
"loc": {
19-
"sourceLocationType": "source",
20-
"start": 56,
21-
"end": 76
22-
}
23-
},
24-
"predicate": {
25-
"type": "term",
26-
"subType": "variable",
27-
"value": "p",
28-
"loc": {
29-
"sourceLocationType": "source",
30-
"start": 77,
31-
"end": 79
32-
}
33-
},
34-
"object": {
13+
"type": "term",
14+
"subType": "variable",
15+
"value": "v",
16+
"loc": {
17+
"sourceLocationType": "source",
18+
"start": 21,
19+
"end": 23
20+
}
21+
}
22+
],
23+
"values": [
24+
{
25+
"v": {
3526
"type": "term",
3627
"subType": "literal",
37-
"value": "abc",
28+
"value": "abc👾",
3829
"loc": {
3930
"sourceLocationType": "source",
40-
"start": 80,
41-
"end": 108
31+
"start": 44,
32+
"end": 82
4233
}
43-
},
44-
"annotations": [],
45-
"loc": {
46-
"sourceLocationType": "source",
47-
"start": 56,
48-
"end": 108
4934
}
5035
}
5136
],
5237
"loc": {
5338
"sourceLocationType": "source",
54-
"start": 56,
55-
"end": 110
39+
"start": 14,
40+
"end": 86
5641
}
5742
}
5843
],
5944
"loc": {
6045
"sourceLocationType": "source",
61-
"start": 41,
62-
"end": 112
46+
"start": 10,
47+
"end": 88
6348
}
6449
},
6550
"solutionModifiers": {},
@@ -72,26 +57,28 @@
7257
},
7358
"variables": [
7459
{
75-
"type": "wildcard",
60+
"type": "term",
61+
"subType": "variable",
62+
"value": "v",
7663
"loc": {
7764
"sourceLocationType": "source",
78-
"start": 39,
79-
"end": 40
65+
"start": 7,
66+
"end": 9
8067
}
8168
}
8269
],
8370
"loc": {
8471
"sourceLocationType": "inlinedSource",
85-
"newSource": "# Codepoint escape in a string\n\nSELECT * {\n ## \"abc\"\n <http://example/abc> ?p \"\\U00000061\\u0062\\U00000063\" .\n}\n",
72+
"newSource": "SELECT ?v {\n VALUES ?v {\n # \"abc👾\"\n \"\\U00000061\\u0062\\U00000063\\U0001F47E\"\n }\n}\n",
8673
"start": 0,
8774
"end": 9007199254740991,
8875
"loc": {
8976
"sourceLocationType": "source",
90-
"start": 32,
91-
"end": 112
77+
"start": 0,
78+
"end": 88
9279
},
93-
"startOnNew": 32,
94-
"endOnNew": 112
80+
"startOnNew": 0,
81+
"endOnNew": 88
9582
},
9683
"type": "query"
9784
}

0 commit comments

Comments
 (0)