Skip to content

Commit 473e211

Browse files
authored
Specification of secrets v1 syntax in rule_schema_v2.atd (#194)
test plan: see test in related semgrep PR - [x] I ran `make setup && make` to update the generated code after editing a `.atd` file (TODO: have a CI check) - [x] I made sure we're still backward compatible with old versions of the CLI. For example, the Semgrep backend still needs to be able to *consume* data generated by Semgrep 1.17.0. See https://atd.readthedocs.io/en/latest/atdgen-tutorial.html#smooth-protocol-upgrades
1 parent 9e95398 commit 473e211

File tree

1 file changed

+143
-55
lines changed

1 file changed

+143
-55
lines changed

rule_schema_v2.atd

+143-55
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,8 @@
11
(* New Semgrep syntax (hence the v2) specified using ATD instead of jsonschema.
22
*
3-
* For more information on the new syntax, see:
4-
* - Brandon's community Slack post announcing the new syntax
5-
* https://semgrep.slack.com/archives/C018NJRRCJ0/p1698430726062769?thread_ts=1698350734.415849&cid=C018NJRRCJ0
6-
* - Brandon's slides
7-
* https://docs.google.com/presentation/d/1zzmyFbfNlJqweyzuuFlo4zpSs3Gqhfi6FiNRONSEQ0E/edit#slide=id.g1eee710cdbf_0_26
8-
* - Pieter's video
9-
* https://www.youtube.com/watch?v=dZUPjFvknnI
10-
* - Parsia's blog post
11-
* https://parsiya.net/blog/2023-10-28-semgreps-experimental-rule-syntax/
12-
*
13-
* Note that even if most Semgrep users use YAML to write a rule, and not JSON,
3+
* Note that even if most Semgrep users use YAML to write rules, and not JSON,
144
* we still use a JSON tool (here ATD, but also jsonschema) to specify
15-
* the rule schema because YAML is a superset of JSON and can be
5+
* the rule schema because YAML is a superset of JSON that can be
166
* mechanically translated into JSON; there is no yamlschema
177
* (see https://json-schema-everywhere.github.io/yaml).
188
*
@@ -31,9 +21,7 @@
3121
* also the old syntax.
3222
*
3323
* TODO:
34-
* - secrets
3524
* - steps (but not join)
36-
* - new metavariable types
3725
* - generalized taint?
3826
*
3927
* related documents:
@@ -55,6 +43,9 @@ type glob = string
5543
(* ex: "[a-zA-Z_]*\\.c" *)
5644
type regex = string
5745

46+
(* ex: https://www.google.com *)
47+
type url = string
48+
5849
(*****************************************************************************)
5950
(* The rule *)
6051
(*****************************************************************************)
@@ -65,26 +56,28 @@ type rule = {
6556
message: string;
6657
severity: severity;
6758

68-
(* later: selector vs analyzer of Martin *)
59+
(* later: selectors vs analyzer of Martin *)
6960
languages: language list;
7061

7162
(* CHECK: exactly one of those fields must be set *)
7263
?match_ <json name="match">: formula option;
7364
?taint: taint option;
74-
(* TODO: steps:, secrets: *)
65+
(* TODO: steps: *)
7566

76-
(* work with match: (and in theory also with taint: ) *)
67+
(* CHECK: those fields work with match: (in theory also with taint: ) *)
68+
(* supply chain rules *)
7769
?project_depends_on <json name="r2c-internal-project-depends-on">:
7870
project_depends_on option;
79-
(* work with match: (and in theory also with taint: )
80-
*
71+
(* extract rules, a.k.a. preprocessor rules
8172
* alt: message:/severity: could be made optional when extract: is set,
8273
* but it's annoying to change those types just for extract. Moreover,
83-
* users can easily put severity: INFO and a fake message:,
74+
* users can easily put 'severity: INFO' and a fake message:,
8475
* and at least they can easily test the matching part of the rule
8576
* by removing the extract and running it like a regular rule.
8677
*)
8778
?extract: extract option;
79+
(* secrets, a.k.a. postprocessor rules *)
80+
?validators: validator list option;
8881

8982
(* alt: later: could be replaced by a 'filename:' in formula *)
9083
?paths: paths option;
@@ -96,7 +89,7 @@ type rule = {
9689
?options: rule_options option;
9790

9891
(* TODO? impose more constraints on metadata? standard fields?
99-
* TODO? add also a product: product; ?
92+
* confidence? product?
10093
*)
10194
?metadata: raw_json option;
10295

@@ -111,7 +104,7 @@ type rule_id = string wrap <ocaml module="Rule_ID">
111104
type version = string
112105

113106
(*****************************************************************************)
114-
(* Types of rule fields *)
107+
(* Severity, language, paths, fix_regex, rule_options *)
115108
(*****************************************************************************)
116109

117110
(* coupling: semgrep_output_v1.atd with match_severity
@@ -183,7 +176,7 @@ type language = [
183176
]
184177

185178
type paths = {
186-
(* CHECK: at least one of this field is set *)
179+
(* CHECK: at least one of those fields must be set *)
187180
?include_ <json name="include">: glob list option;
188181
?exclude_ <json name="exclude">: glob list option;
189182
}
@@ -196,7 +189,7 @@ type fix_regex = {
196189

197190
(* coupling: Rule_options.atd
198191
* alt: <ocaml from="Rule_options" t="t"> but I prefer to repeat
199-
* its content here so one can fully see the syntax for a rule in one file.
192+
* its content here so one can fully see the syntax of a rule in one file.
200193
*)
201194
type rule_options = {
202195
?constant_propagation: bool option;
@@ -256,7 +249,19 @@ type generic_comment_style = [
256249
(* Formula *)
257250
(*****************************************************************************)
258251

259-
(* 'formula' below is handled by a <json adapter.ocaml=...> because there is no
252+
(* For more information on the new syntax for patterns, see:
253+
* - Brandon's community Slack post announcing the new syntax
254+
* https://semgrep.slack.com/archives/C018NJRRCJ0/p1698430726062769?thread_ts=1698350734.415849&cid=C018NJRRCJ0
255+
* https://www.notion.so/semgrep/New-Rule-Syntax-Summary-f0bc252585f944a7b430294a88ae83a2
256+
* https://www.notion.so/semgrep/Rule-Syntax-2-0-cf8fdaf20992472881b64b6db188a78b
257+
* - Brandon's slides
258+
* https://docs.google.com/presentation/d/1zzmyFbfNlJqweyzuuFlo4zpSs3Gqhfi6FiNRONSEQ0E/edit#slide=id.g1eee710cdbf_0_26
259+
* - Pieter's video
260+
* https://www.youtube.com/watch?v=dZUPjFvknnI
261+
* - Parsia's blog post
262+
* https://parsiya.net/blog/2023-10-28-semgreps-experimental-rule-syntax/
263+
*
264+
* 'formula' below is handled by a <json adapter.ocaml=...> because there is no
260265
* way to encode directly using ATD the way we chose to represent formulas
261266
* in YAML/JSON.
262267
*
@@ -279,52 +284,42 @@ type generic_comment_style = [
279284
* CHECK: not/inside/anywhere can appear only inside an all:
280285
*)
281286
type formula = {
282-
(* either directly a string or pattern: string in the JSON *)
287+
(* either directly a string or 'pattern: string' in the JSON *)
283288
?pattern: pattern option;
284-
(* regex can also be entered with pattern: xxx when languages: [regex] *)
289+
(* regex can also be entered with 'pattern: xxx' when languages: [regex] *)
285290
?regex: regex option;
286291

287-
(* Boolean opeators. alt: we could have chosen and: and or: *)
292+
(* Boolean operators. alt: we could have chosen and/or instead of all/any *)
288293
?all: formula list option;
289294
?any: formula list option;
290295
?not: formula option;
291296

292297
(* later: we should remove this once we have a better range logic *)
293298
?inside: formula option;
294-
(* NEW: since 1.49. alt: in condition instead as in 'where: - also: ...' *)
299+
(* NEW: since 1.49. alt: in condition instead like 'where: - also: ...' *)
295300
?anywhere: formula option;
296-
(* TODO? ?taint: taint *)
301+
(* TODO? ?taint: taint option; and ?steps: ? *)
297302

298303
?where: condition list option;
299304
}
300305
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
301306

302307
(* This string must be a valid Semgrep pattern for the first language
303-
* specified in the languages: list in the rule.
308+
* specified in the 'languages:' list in the rule.
304309
*)
305310
type pattern = string
306311

307312
(* Just like for formula, we're using an adapter to transform
308313
* conditions in YAML like:
309-
*
310314
* where:
311315
* - metavariable: $X
312316
* regex: $Z
313-
*
314317
* which when turned into JSON gives:
315-
*
316-
* { where: [
317-
* { metavariable: $X,
318-
* regex: $Z
319-
* }
320-
* ] }
321-
*
318+
* { where:
319+
* [ { metavariable: $X, regex: $Z } ]
320+
* }
322321
* which we must transform into an ATD-compliant form:
323-
*
324-
* [ ["M", [{ metavariable: $X,
325-
* regex: $Z
326-
* }]
327-
* ]]
322+
* ["M", [{ metavariable: $X, regex: $Z }]]
328323
*)
329324
type condition = [
330325
| Focus <json name="F"> of focus
@@ -333,26 +328,36 @@ type condition = [
333328
]
334329
<json adapter.ocaml="Rule_schema_v2_adapter.Condition">
335330

331+
(* --------------------------- *)
332+
(* Focus condition *)
333+
(* --------------------------- *)
334+
336335
type focus = {
337-
(* either a single string or an array in JSON, that is
338-
* {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}
339-
*)
336+
(* either directly a string or a list of strings in the JSON *)
340337
focus: mvar list;
341338
}
342339

343340
type mvar = string
344341

345-
(* comparison expression with metavariables (currently using a Python-like
346-
* syntax), ex: $X > 100
347-
*)
348-
type comparison_expr = string
342+
(* --------------------------- *)
343+
(* Comparison condition *)
344+
(* --------------------------- *)
349345

350346
type comparison = {
351347
comparison: comparison_expr;
352348
?base: int option;
353349
?strip: bool option;
354350
}
355351

352+
(* comparison expression with metavariables, ex: $X > 100
353+
* (currently using a Python-like syntax)
354+
*)
355+
type comparison_expr = string
356+
357+
(* --------------------------- *)
358+
(* Metavariable condition *)
359+
(* --------------------------- *)
360+
356361
type metavariable_cond = {
357362
metavariable: mvar;
358363

@@ -422,7 +427,6 @@ type by_side_effect = [
422427
]
423428
<json adapter.ocaml="Rule_schema_v2_adapter.BySideEffect">
424429

425-
426430
(* --------------------------- *)
427431
(* Source *)
428432
(* --------------------------- *)
@@ -514,8 +518,10 @@ type semver_range = string
514518
(* Extract *)
515519
(*****************************************************************************)
516520

521+
(* TODO: this syntax is actually not even supported yet in Parse_rule.ml *)
517522
type extract = {
518523
metavariable: mvar;
524+
519525
?dest_language <json name="dest-language">: language option;
520526
?dest_rules <json name="dest-rules">: dest_rules option;
521527
(* map-reduce! *)
@@ -524,7 +530,7 @@ type extract = {
524530
}
525531

526532
type dest_rules = {
527-
(* CHECK: at least one of those options is set *)
533+
(* CHECK: at least one of those options must be set *)
528534
?exclude_ <json name="exclude">: rule_id list option;
529535
?include_ <json name="include">: rule_id list option;
530536
}
@@ -540,9 +546,91 @@ type extract_transform = [
540546
| ConcatJsonStringArray <json name="concat_json_string_array">
541547
]
542548
(*****************************************************************************)
543-
(* TODO: Secrets *)
549+
(* Secrets *)
544550
(*****************************************************************************)
545551

552+
(* See https://www.notion.so/semgrep/Postprocessor-Syntax-v1-0-b1481ce32ab8454a8066a1e767cd870a *)
553+
type validator = {
554+
http: http_validator;
555+
(* LATER: ?ftp:, ?imap:, ... *)
556+
}
557+
558+
type http_validator = {
559+
request: http_request;
560+
response: http_response_matcher list;
561+
}
562+
563+
type headers = (string * header_pattern) list
564+
<json repr="object">
565+
566+
(* can contain metavariables, ex: 'Bearer $X' *)
567+
type header_pattern = string
568+
569+
(* --------------------------- *)
570+
(* Request *)
571+
(* --------------------------- *)
572+
573+
type http_request = {
574+
url: url;
575+
method_ <json name="method">: http_method;
576+
headers: headers;
577+
?auth: auth option;
578+
?body: string option;
579+
}
580+
581+
type http_method = [
582+
| GET
583+
| POST
584+
| DELETE
585+
| HEAD
586+
| PUT
587+
]
588+
589+
(* TODO? type_ <json name="type">: auth_kind; *)
590+
type auth = raw_json
591+
592+
(* --------------------------- *)
593+
(* Response *)
594+
(* --------------------------- *)
595+
596+
(* alt: could have shortcuts like: 'Valid(status=[200, 403])', like we do
597+
* in our jsonnet secret rules, instead of the currently more verbose
598+
* { match: [{ status-code: 200 }, { status-code: 403 }], result: { validity: valid } }
599+
* but how would we get a valid JSON syntax for 'Valid(status=[200, 403])'?
600+
*)
601+
type http_response_matcher = {
602+
match_ <json name="match">: match_ list;
603+
result: result;
604+
}
605+
606+
type match_ = {
607+
(* CHECK: at least one of status-code:/headers:/content: must be set *)
608+
?status_code <json name="status-code">: int option;
609+
(* note that this time it's a list of headers! *)
610+
?headers: headers list option;
611+
?content: content option;
612+
}
613+
614+
type content = {
615+
inherit formula;
616+
?language: language option;
617+
}
618+
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
619+
620+
(* STRICTER: note that we are more complete than rule_schema_v1.yml here *)
621+
type result = {
622+
validity: validity;
623+
(* overriding the rule fields *)
624+
?severity: severity option;
625+
?metadata: raw_json option;
626+
?message: string option;
627+
}
628+
629+
type validity = [
630+
| Valid <json name="valid">
631+
| Invalid <json name="invalid">
632+
]
633+
546634
(*****************************************************************************)
547635
(* Toplevel *)
548636
(*****************************************************************************)

0 commit comments

Comments
 (0)