Skip to content

Commit dd89d3c

Browse files
authored
Specify taint syntax in rule_syntax_v2.atd (#188)
This is actually more precise than rule_schema_v1.yaml! test plan: see related PR in semgrep - [x] I ran `make setup && make` to update the generated code after editing a `.atd` file (TODO: have a CI check) - [x] I made sure we're still backward compatible with old versions of the CLI. For example, the Semgrep backend needs to still be able to *consume* data generated by Semgrep 1.17.0. See https://atd.readthedocs.io/en/latest/atdgen-tutorial.html#smooth-protocol-upgrades
1 parent e655eef commit dd89d3c

File tree

2 files changed

+159
-38
lines changed

2 files changed

+159
-38
lines changed

rule_schema_v2.atd

Lines changed: 144 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,21 @@
2727
* Note that this file does not replace Parse_rule.ml nor Rule.ml. We still
2828
* want to accept the old syntax in Parse_rule.ml and also parse with
2929
* position information and error recovery which ATD does not provide.
30-
* This files does not replace either (yet) rule_schema_v1.yml which is
31-
* more complete.
30+
* This file does not (yet) replace rule_schema_v1.yaml either, which covers
31+
* also the old syntax.
3232
*
3333
* TODO:
34-
* - taint
3534
* - extract
3635
* - r2c-internal-project-depends-on-content
3736
* - secrets
38-
* - steps (and join?)
39-
* - generalized taint
37+
* - steps (but not join)
4038
* - new metavariable types
41-
* - new 'anywhere:'
39+
* - generalized taint?
40+
*
41+
* related documents:
42+
* - rule_schema_v1.yaml (actually less complete for the new syntax now)
43+
* - Parse_rule.ml (the final source of truth, except for stuff currently
44+
* handled only in pysemgrep such as join-mode or ssc)
4245
*)
4346

4447
(*****************************************************************************)
@@ -69,13 +72,12 @@ type rule = {
6972

7073
(* CHECK: exactly one of those fields must be set *)
7174
?match_ <json name="match">: formula option;
72-
?taint: taint_spec option;
75+
?taint: taint option;
7376
?extract: extract option;
74-
(* TODO: join, steps, secrets, sca *)
75-
77+
(* TODO: steps, secrets, sca *)
7678
(* TODO? product: product; *)
7779

78-
(* TODO? could be replaced by a pattern-filename: *)
80+
(* alt: later: could be replaced by a pattern-filename: *)
7981
?paths: paths option;
8082

8183
?fix: string option;
@@ -95,20 +97,19 @@ type rule = {
9597
type rule_id = string wrap <ocaml module="Rule_ID">
9698

9799
(* Version_info.t *)
98-
type version = string (* TODO wrap <ocaml module="ATDStringWrap.Version"> *)
100+
type version = string
99101

100102
(*****************************************************************************)
101103
(* Types of rule fields *)
102104
(*****************************************************************************)
103105

104-
(* coupling: semgrep_output_v1.atd with match_severity *)
106+
(* coupling: semgrep_output_v1.atd with match_severity
107+
* I've removed EXPERIMENT and INVENTORY which should not be used.
108+
*)
105109
type severity = [
106110
| Error <json name="ERROR">
107111
| Warning <json name="WARNING">
108112
| Info <json name="INFO">
109-
(* should not be used *)
110-
| Experiment <json name="EXPERIMENT">
111-
| Inventory <json name="INVENTORY">
112113
]
113114

114115
(* coupling: language.ml *)
@@ -171,8 +172,9 @@ type language = [
171172
]
172173

173174
type paths = {
174-
~include_ <json name="include">: glob list;
175-
~exclude: glob list;
175+
(* CHECK: at least one of these fields must be set *)
176+
?include_ <json name="include">: glob list option;
177+
?exclude_ <json name="exclude">: glob list option;
176178
}
177179

178180
type fix_regex = {
@@ -195,35 +197,46 @@ type rule_options <ocaml from="Rule_options" t="t"> = abstract
195197
* proper variant, but that would require a more complex adapter and the
196198
* distance between the spec and the actual syntax would be even longer.
197199
*
200+
* alt: we could instead do '?all: formula list option * condition list'
201+
* below, but syntactically we also allow 'where' with pattern:, regex:,
202+
* etc. as in:
203+
* - pattern: "foo($X)"
204+
* where: ...
205+
* In fact that's the main reason we sometimes have to use pattern: string
206+
* instead of a string because where: could not be attached to a string.
207+
*
198208
* old: this type was called new-pattern in rule_schema_v1.yaml
199-
*)
209+
*
210+
* CHECK: exactly one of pattern/regex/all/any/not/inside/anywhere field
211+
* must be set
212+
* CHECK: not/inside/anywhere can appear only inside an all:
213+
*)
200214
type formula = {
201-
(* CHECK: exactly one of those fields must be set *)
202215
(* either directly a string or pattern: string in the JSON *)
203-
?pattern: string option;
216+
?pattern: pattern option;
217+
(* regex can also be entered with pattern: xxx when languages: [regex] *)
204218
?regex: regex option;
219+
220+
(* Boolean operators. alt: we could have chosen and: and or: *)
205221
?all: formula list option;
206222
?any: formula list option;
207-
(* CHECK: not/inside/anywhere can appear only inside an all: *)
208223
?not: formula option;
224+
225+
(* later: we should remove this once we have a better range logic *)
209226
?inside: formula option;
227+
(* NEW: since 1.49 *)
210228
?anywhere: formula option;
211-
(* TODO? ?taint: taint_spec *)
212-
213-
(* alt: we could instead do '?all: formula list option * condition list'
214-
* above, but syntactically we also allow 'where' with pattern:, regex:,
215-
* etc. as in:
216-
*
217-
* - pattern: "foo($X)"
218-
* where: ...
219-
*
220-
* In fact that's the main reason we sometimes have to use pattern: string
221-
* instead of a string because where: could not be attached to a string.
222-
*)
223-
~where: condition list;
229+
(* TODO? ?taint: taint *)
230+
231+
?where: condition list option;
224232
}
225233
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
226234

235+
(* This string must be a valid Semgrep pattern for the first language
236+
* specified in the languages: list in the rule.
237+
*)
238+
type pattern = string
239+
227240
(* Just like for formula, we're using an adapter to transform
228241
* conditions in YAML like:
229242
*
@@ -262,10 +275,15 @@ type focus = {
262275

263276
type mvar = string
264277

278+
(* comparison expression with metavariables (currently using a Python-like
279+
* syntax), ex: $X > 100
280+
*)
281+
type comparison_expr = string
282+
265283
type comparison = {
266-
comparison: string; (* expr *)
284+
comparison: comparison_expr;
267285
?base: int option;
268-
~strip: bool;
286+
?strip: bool option;
269287
}
270288

271289
type metavariable_cond = {
@@ -288,10 +306,98 @@ type analyzer = [
288306
]
289307

290308
(*****************************************************************************)
291-
(* TODO: Tainting *)
309+
(* Tainting *)
292310
(*****************************************************************************)
293311

294-
type taint_spec = raw_json
312+
(* STRICTER: actually rule_schema_v1.yaml has very loose definitions for
313+
* tainting stuff. Even requires: and label: are not defined for the
314+
* old syntax, and for the new syntax many fields are still missing
315+
* in rule_schema_v1.yaml
316+
*)
317+
type taint = {
318+
sources: source list;
319+
sinks: sink list;
320+
?sanitizers: sanitizer list option;
321+
?propagators: propagator list option;
322+
}
323+
324+
(* --------------------------- *)
325+
(* Some taint options *)
326+
(* --------------------------- *)
327+
328+
type label_options = {
329+
?label: label option;
330+
?requires: requires_expr option;
331+
}
332+
333+
type label = string
334+
335+
(* a boolean expression with labels, ex: "A and B" *)
336+
type requires_expr = string
337+
338+
(* STRICTER: not even specified in rule_schema_v1.yaml *)
339+
type taint_options = {
340+
?by_side_effect <json name="by-side-effect">: by_side_effect option;
341+
?exact: bool option;
342+
}
343+
344+
(* we need an adapter here because we allow either a boolean or the string "only" *)
345+
type by_side_effect = [
346+
| True <json name="true">
347+
| False <json name="false">
348+
| Only <json name="only">
349+
]
350+
<json adapter.ocaml="Rule_schema_v2_adapter.BySideEffect">
351+
352+
353+
(* --------------------------- *)
354+
(* Source *)
355+
(* --------------------------- *)
356+
357+
(* need to repeat the adapter below for the str -> pattern: str adaptation *)
358+
type source = {
359+
inherit formula;
360+
inherit label_options;
361+
inherit taint_options;
362+
?control: bool option;
363+
}
364+
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
365+
366+
(* --------------------------- *)
367+
(* Sink *)
368+
(* --------------------------- *)
369+
type sink = {
370+
inherit formula;
371+
(* just requires: here, no label: *)
372+
?requires: string option; (* expr with labels? *)
373+
}
374+
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
375+
376+
(* --------------------------- *)
377+
(* Sanitizer *)
378+
(* --------------------------- *)
379+
380+
type sanitizer = {
381+
inherit formula;
382+
inherit taint_options;
383+
(* TODO: not-conflicting: *)
384+
}
385+
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
386+
387+
(* --------------------------- *)
388+
(* Propagator *)
389+
(* --------------------------- *)
390+
391+
type propagator = {
392+
inherit formula;
393+
from_ <json name="from">: mvar;
394+
to_ <json name="to">: mvar;
395+
inherit label_options;
396+
(* no exact: here, just by-side-effect: *)
397+
?by_side_effect <json name="by-side-effect">: by_side_effect option;
398+
(* TODO? replace-labels? *)
399+
}
400+
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
295401

296402
(*****************************************************************************)
297403
(* TODO: SSC *)

rule_schema_v2_adapter.ml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,18 @@ module Condition = struct
4343
(* not needed for now; we care just about parsing *)
4444
failwith "Rule_schema_v2_adapter.Condition.restore not implemented"
4545
end
46+
47+
module BySideEffect = struct
48+
49+
(** Convert from original json to ATD-compatible json *)
50+
let normalize (orig : Yojson.Safe.t ) : Yojson.Safe.t =
51+
match orig with
52+
| `Bool true -> `String "true"
53+
| `Bool false -> `String "false"
54+
| x -> x
55+
56+
(** Convert from ATD-compatible json to original json *)
57+
let restore (_atd : Yojson.Safe.t) : Yojson.Safe.t =
58+
(* not needed for now; we care just about parsing *)
59+
failwith "Rule_schema_v2_adapter.BySideEffect.restore not implemented"
60+
end

0 commit comments

Comments
 (0)