Skip to content

Commit 0b3bcfc

Browse files
committed
Simplify rule_schema_v2.atd, use less variant
This better matches the syntax. The goal is not to redo Rule.ml and have clean variant types! The goal is to specify the syntax. test plan: see related PR in semgrep
1 parent ad4eb72 commit 0b3bcfc

File tree

3 files changed

+88
-135
lines changed

3 files changed

+88
-135
lines changed

rule_schema_v1.yaml

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
1-
# Specification of the Semgrep rule YAML syntax, using JSON schema to specify the syntax.
1+
# Specification of the Semgrep rule YAML syntax using JSON schema.
2+
#
3+
# Note that even if most Semgrep users use YAML to write a rule, and not JSON,
4+
# we still use a JSON tool (jsonschema) to specify the rule schema because
5+
# YAML is a superset of JSON and can be mechanically translated into JSON;
6+
# there is no yamlschema (see https://json-schema-everywhere.github.io/yaml).
7+
# To add even more confusion, a jsonschema can actually be specified using
8+
# YAML (like in this file), and so one can use YAML syntax to specify the
9+
# JSON schema of files actually written in YAML (hmmm).
10+
211
$id: https://raw.githubusercontent.com/returntocorp/semgrep-interfaces/main/rule_schema_v1.yaml
312
$schema: http://json-schema.org/draft-07/schema#
4-
#!!If you modify this file, you need to update the submodule in returntocorp/semgrep
5-
# and returntocorp/semgrep-app!!
6-
#!!If you add new syntax to this file, you probably need to add some EXPERIMENTAL
7-
# comment before!!
13+
#!!If you modify this file, update the submodules in semgrep and semgrep-app!!
14+
#!!If you add new syntax to this file, you probably need to add some
15+
# EXPERIMENTAL comment before!!
816
$defs:
917
# EXPERIMENTAL
1018
validator:

rule_schema_v2.atd

Lines changed: 63 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,7 @@
1414
* we still use a JSON tool (here ATD, but also jsonschema) to specify
1515
* the rule schema because YAML is a superset of JSON and can be
1616
* mechanically translated into JSON; there is no yamlschema
17-
* (see https://json-schema-everywhere.github.io/yaml). To add even more
18-
* confusion, a jsonschema can actually be specified using YAML (like in
19-
* rule_shema_v1.yml), and so one can use YAML syntax to specify the
20-
* JSON schema of files actually written in YAML (hmmm).
17+
* (see https://json-schema-everywhere.github.io/yaml).
2118
*
2219
* Jsonschema is powerful but also arguably complicated and so it
2320
* might be simpler for many Semgrep developers (and also some Semgrep
@@ -30,12 +27,18 @@
3027
* Note that this file does not replace Parse_rule.ml nor Rule.ml. We still
3128
* want to accept the old syntax in Parse_rule.ml and also parse with
3229
* position information and error recovery which ATD does not provide.
30+
* This file does not replace (yet) rule_schema_v1.yaml either, which is
31+
* more complete.
3332
*
3433
* TODO:
34+
* - taint
35+
* - extract
3536
* - r2c-internal-project-depends-on-content
3637
* - secrets
37-
* - generalized taint
3838
* - steps (and join?)
39+
* - generalized taint
40+
* - new metavariable types
41+
* - new 'anywhere:'
3942
*)
4043

4144
(*****************************************************************************)
@@ -51,7 +54,6 @@ type glob = string
5154
(* ex: "[a-zA-Z_]*\\.c" *)
5255
type regex = string
5356

54-
5557
(*****************************************************************************)
5658
(* The rule *)
5759
(*****************************************************************************)
@@ -65,14 +67,13 @@ type rule = {
6567
(* TODO: selector vs analyzer *)
6668
languages: language list;
6769

68-
(* at least one of those must be set *)
70+
(* CHECK: exactly one of those fields must be set *)
6971
?match_ <json name="match">: formula option;
7072
?taint: taint_spec option;
7173
?extract: extract option;
7274
(* TODO: join, steps, secrets, sca *)
7375

74-
~mode <ocaml default="`Search">: mode;
75-
(* TODO: product: product *)
76+
(* TODO? product: product; *)
7677

7778
(* TODO? could be replaced by a pattern-filename: *)
7879
?paths: paths option;
@@ -81,7 +82,7 @@ type rule = {
8182
?fix_regex: fix_regex option;
8283

8384
?metadata: raw_json option;
84-
?options: options option;
85+
?options: rule_options option;
8586

8687
?version: version option;
8788
?min_version: version option;
@@ -96,15 +97,6 @@ type rule_id = string wrap <ocaml module="Rule_ID">
9697
(* Version_info.t *)
9798
type version = string (* TODO wrap <ocaml module="ATDStringWrap.Version"> *)
9899

99-
type mode = [
100-
| Search <json name="search">
101-
| Taint <json name="taint">
102-
| Join <json name="join">
103-
| Extract <json name="extract">
104-
| SemgrepInternalPostprocessor <json name="semgrep_internal_postprocessor">
105-
(* TODO: Steps, SCA? *)
106-
]
107-
108100
(*****************************************************************************)
109101
(* Types of rule fields *)
110102
(*****************************************************************************)
@@ -189,87 +181,49 @@ type fix_regex = {
189181
?count: int option;
190182
}
191183

192-
type options <ocaml from="Rule_options" t="t"> = abstract
184+
type rule_options <ocaml from="Rule_options" t="t"> = abstract
193185

194186
(*****************************************************************************)
195-
(* Search mode (default) and formula *)
187+
(* Formula *)
196188
(*****************************************************************************)
197189

198190
(* 'formula' below is handled by a <json adapter.ocaml=...> because there is no
199191
* way to encode directly using ATD the way we chose to represent formulas
200-
* in YAML/JSON. Indeed, because Yaml/JSON does not support Algebraic data
201-
* types (ADTs), we used a weird encoding abusing objects to represent
202-
* formulas, e.g.,
203-
*
204-
* any:
205-
* - and:
206-
* - "foo"
207-
* - pattern: "bar"
208-
* where:
209-
* - bla
210-
*
211-
* which when turned into JSON gives:
212-
*
213-
* { any: [
214-
* { and: [
215-
* "foo",
216-
* {pattern: "bar" }
217-
* ],
218-
* where: [ bla ],
219-
* }
220-
* ]
221-
* }
222-
*
223-
* The ATD way would be to encode a formula as
192+
* in YAML/JSON.
224193
*
225-
* {f: ["Any", [
226-
* {f: ["And", [
227-
* {f: ["Pattern", "foo"]},
228-
* {f: ["Pattern", "bar"]},
229-
* ]],
230-
* where: [bla]
231-
* }
232-
* ]]
233-
* }
194+
* alt: instead of using those ?all, ?regex, and CHECK:, we could use a
195+
* proper variant, but that would require a more complex adapter and the
196+
* distance between the spec and the actual syntax would be even longer.
234197
*
235-
* So we need rule_schema_v2_adapter.ml used below to transform the first JSON
236-
* in the second dynamically at parsing time, so then ATD can parse it
237-
* using the formula type specified below.
238-
*)
239-
198+
* old: this type was called new-pattern in rule_schema_v1.yaml
199+
*)
240200
type formula = {
241-
(* alt: have ?all: ... ?any: ... ?regex: ... ?pattern: ... with a check
242-
* at parsing time that only one of those fields is given.
243-
*)
244-
f: formula_bis;
245-
(* alt: we could instead do 'All of formula list * condition list' below
246-
* but syntactically we also allow 'where' with pattern:, regex:, etc.
247-
* as in
248-
* { pattern: ..., where: ..., }
249-
* Even though internally in Rule.ml a { pattern: X, where: Y}
250-
* is transformed in an All [pattern: X, Y], in the syntax
251-
* we allow it in more places.
201+
(* CHECK: exactly one of those fields must be set *)
202+
(* either directly a string or pattern: string in the JSON *)
203+
?pattern: string option;
204+
?regex: regex option;
205+
?all: formula list option;
206+
?any: formula list option;
207+
(* CHECK: not/inside/anywhere can appear only inside an all: *)
208+
?not: formula option;
209+
?inside: formula option;
210+
?anywhere: formula option;
211+
(* TODO? ?taint: taint_spec *)
212+
213+
(* alt: we could instead do '?all: formula list option * condition list'
214+
* above, but syntactically we also allow 'where' with pattern:, regex:,
215+
* etc. as in:
216+
*
217+
* - pattern: "foo($X)"
218+
* where: ...
219+
*
220+
* In fact that's the main reason we sometimes have to use pattern: string
221+
* instead of a string because where: could not be attached to a string.
252222
*)
253223
~where: condition list;
254224
}
255225
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
256226

257-
(* old: this type was called new-pattern in rule_schema_v1.yaml but formula in
258-
* Rule.ml
259-
*)
260-
type formula_bis = [
261-
(* either directly a string or pattern: string in the JSON *)
262-
| Pattern <json name="pattern"> of string
263-
| Regex <json name="regex"> of regex
264-
(* 'And of conjunction' in Rule.ml *)
265-
| All <json name="all"> of formula list
266-
| Any <json name="any"> of formula list
267-
(* Not and Inside can appear only inside an All *)
268-
| Not <json name="not"> of formula
269-
| Inside <json name="inside"> of formula
270-
(* TODO? Taint of taint_spec *)
271-
]
272-
273227
(* Just like for formula, we're using an adapter to transform
274228
* conditions in YAML like:
275229
*
@@ -288,7 +242,7 @@ type formula_bis = [
288242
* which we must transform into an ATD-compliant:
289243
*
290244
* [ ["M", [{ metavariable: $X,
291-
* c: ["regex", $Z]
245+
* regex: $Z
292246
* }]
293247
* ]]
294248
*)
@@ -299,10 +253,10 @@ type condition = [
299253
]
300254
<json adapter.ocaml="Rule_schema_v2_adapter.Condition">
301255

302-
(* either a single string or an array in JSON, that is
303-
* {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}
304-
*)
305256
type focus = {
257+
(* either a single string or an array in JSON, that is
258+
* {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}
259+
*)
306260
focus: mvar list;
307261
}
308262

@@ -316,39 +270,43 @@ type comparison = {
316270

317271
type metavariable_cond = {
318272
metavariable: mvar;
319-
(* alt: have ?type: ... ?types:... ?regex: ... *)
320-
c: metavariable_cond_bis;
321-
}
322273

323-
type metavariable_cond_bis = [
324-
| Type <json name="type"> of string
325-
| Types <json name="types"> of string list
326-
(* alt: we could remove Regex as Formula itself as a Regex
274+
(* CHECK: exactly one of those fields must be set *)
275+
?type_ <json name="type">: string option;
276+
?types: string list option;
277+
(* this covers regex:, pattern:, but also any formula.
327278
* TODO: for metavariable-regex, can also enable constant_propagation
279+
* TODO: we should also accept language: string
328280
*)
329-
| Regex <json name="regex"> of regex
330-
(* TODO: accept also language: string *)
331-
| Formula <json name="F"> of formula
332-
| Analyzer <json name="analyzer"> of analyzer
333-
]
281+
inherit formula;
282+
?analyzer: analyzer option;
283+
}
334284

335285
type analyzer = [
336286
| Entropy <json name="entropy">
337287
| Redos <json name="redos">
338288
]
339289

340290
(*****************************************************************************)
341-
(* Taint mode *)
291+
(* TODO: Tainting *)
342292
(*****************************************************************************)
343293

344294
type taint_spec = raw_json
345295

346296
(*****************************************************************************)
347-
(* Extract mode *)
297+
(* TODO: SSC *)
298+
(*****************************************************************************)
299+
300+
(*****************************************************************************)
301+
(* TODO: Extract mode *)
348302
(*****************************************************************************)
349303

350304
type extract = raw_json
351305

306+
(*****************************************************************************)
307+
(* TODO: Secrets *)
308+
(*****************************************************************************)
309+
352310
(*****************************************************************************)
353311
(* Toplevel *)
354312
(*****************************************************************************)
@@ -357,7 +315,7 @@ type rules = {
357315
rules: rule list;
358316

359317
(* Missed count of pro rules when not logged-in.
360-
* Sent by the registry to the CLI since 1.48.
318+
* Sent by the registry to the CLI since 1.49.
361319
* See https://github.com/semgrep/semgrep-app/pull/11142
362320
*)
363321
?missed: int option;

rule_schema_v2_adapter.ml

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,8 @@ module Formula = struct
88
let normalize (orig : Yojson.Safe.t ) : Yojson.Safe.t =
99
match orig with
1010
| `String str ->
11-
`Assoc ["f", `List [`String "pattern"; `String str]]
12-
| `Assoc [(key , elt)] ->
13-
`Assoc ["f", `List [`String key; elt]]
14-
| `Assoc [(key , elt); ("where", stuff)] ->
15-
`Assoc [
16-
("f", `List [`String key; elt]);
17-
("where", stuff)]
11+
`Assoc ["pattern", `String str]
12+
(* TODO: check at least one of any/all/... is specified *)
1813
| x -> x
1914

2015
(** Convert from ATD-compatible json to original json *)
@@ -31,25 +26,17 @@ module Condition = struct
3126
| `Assoc (("comparison", cmp)::rest) ->
3227
`List [`String "C";
3328
`Assoc (("comparison", cmp)::rest)]
34-
| `Assoc [("metavariable", mvar); ("regex", reg)] ->
29+
| `Assoc (("metavariable", mvar)::rest) ->
30+
(* TODO: check at least one of type/types/... is specified *)
3531
`List [`String "M";
36-
`Assoc [("metavariable", mvar);
37-
("c", `List [`String "regex"; reg])]]
38-
| `Assoc [("metavariable", mvar); ("type", ty)] ->
39-
`List [`String "M";
40-
`Assoc [("metavariable", mvar);
41-
("c", `List [`String "type"; ty])]]
42-
| `Assoc [("metavariable", mvar); ("types", tys)] ->
43-
`List [`String "M";
44-
`Assoc [("metavariable", mvar);
45-
("c", `List [`String "types"; tys])]]
46-
| `Assoc [("metavariable", mvar); ("pattern", p)] ->
47-
`List [`String "M";
48-
`Assoc [("metavariable", mvar);
49-
("c", `List [`String "F"; `Assoc [("pattern", p)]])]]
50-
| x ->
51-
x
52-
32+
`Assoc (("metavariable", mvar)::rest)]
33+
| `Assoc [("focus", `String x)] ->
34+
`List [`String "F";
35+
`Assoc [("focus", `List [`String x])]]
36+
| `Assoc [("focus", `List x)] ->
37+
`List [`String "F";
38+
`Assoc [("focus", `List x)]]
39+
| x -> x
5340

5441
(** Convert from ATD-compatible json to original json *)
5542
let restore (_atd : Yojson.Safe.t) : Yojson.Safe.t =

0 commit comments

Comments
 (0)