1
1
(* New Semgrep syntax (hence the v2) specified using ATD instead of jsonschema.
2
2
*
3
- * For more information on the new syntax, see:
4
- * - Brandon's community Slack post announcing the new syntax
5
- * https://semgrep.slack.com/archives/C018NJRRCJ0/p1698430726062769?thread_ts=1698350734.415849&cid=C018NJRRCJ0
6
- * - Brandon's slides
7
- * https://docs.google.com/presentation/d/1zzmyFbfNlJqweyzuuFlo4zpSs3Gqhfi6FiNRONSEQ0E/edit#slide=id.g1eee710cdbf_0_26
8
- * - Pieter's video
9
- * https://www.youtube.com/watch?v=dZUPjFvknnI
10
- * - Parsia's blog post
11
- * https://parsiya.net/blog/2023-10-28-semgreps-experimental-rule-syntax/
12
- *
13
- * Note that even if most Semgrep users use YAML to write a rule, and not JSON,
3
+ * Note that even if most Semgrep users use YAML to write rules, and not JSON,
14
4
* we still use a JSON tool (here ATD, but also jsonschema) to specify
15
- * the rule schema because YAML is a superset of JSON and can be
5
+ * the rule schema because YAML is a superset of JSON that can be
16
6
* mechanically translated into JSON; there is no yamlschema
17
7
* (see https://json-schema-everywhere.github.io/yaml).
18
8
*
31
21
* also the old syntax.
32
22
*
33
23
* TODO:
34
- * - secrets
35
24
* - steps (but not join)
36
- * - new metavariable types
37
25
* - generalized taint?
38
26
*
39
27
* related documents:
@@ -55,6 +43,9 @@ type glob = string
55
43
(* ex: "[a-zA-Z_]*\\.c" *)
56
44
type regex = string
57
45
46
+ (* ex: https://www.google.com *)
47
+ type url = string
48
+
58
49
(*****************************************************************************)
59
50
(* The rule *)
60
51
(*****************************************************************************)
@@ -65,26 +56,28 @@ type rule = {
65
56
message: string;
66
57
severity: severity;
67
58
68
- (* later: selector vs analyzer of Martin *)
59
+ (* later: selectors vs analyzer of Martin *)
69
60
languages: language list;
70
61
71
62
(* CHECK: exactly one of those fields must be set *)
72
63
?match_ <json name="match">: formula option;
73
64
?taint: taint option;
74
- (* TODO: steps:, secrets: *)
65
+ (* TODO: steps: *)
75
66
76
- (* work with match: (and in theory also with taint: ) *)
67
+ (* CHECK: those fields work with match: (in theory also with taint: ) *)
68
+ (* supply chain rules *)
77
69
?project_depends_on <json name="r2c-internal-project-depends-on">:
78
70
project_depends_on option;
79
- (* work with match: (and in theory also with taint: )
80
- *
71
+ (* extract rules, a.k.a. preprocessor rules
81
72
* alt: message:/severity: could be made optional when extract: is set,
82
73
* but it's annoying to change those types just for extract. Moreover,
83
- * users can easily put severity: INFO and a fake message:,
74
+ * users can easily put ' severity: INFO' and a fake message:,
84
75
* and at least they can easily test the matching part of the rule
85
76
* by removing the extract and running it like a regular rule.
86
77
*)
87
78
?extract: extract option;
79
+ (* secrets, a.k.a. postprocessor rules *)
80
+ ?validators: validator list option;
88
81
89
82
(* alt: later: could be replaced by a 'filename:' in formula *)
90
83
?paths: paths option;
@@ -96,7 +89,7 @@ type rule = {
96
89
?options: rule_options option;
97
90
98
91
(* TODO? impose more constraints on metadata? standard fields?
99
- * TODO? add also a product: product; ?
92
+ * confidence? product?
100
93
*)
101
94
?metadata: raw_json option;
102
95
@@ -111,7 +104,7 @@ type rule_id = string wrap <ocaml module="Rule_ID">
111
104
type version = string
112
105
113
106
(*****************************************************************************)
114
- (* Types of rule fields *)
107
+ (* Severity, language, paths, fix_regex, rule_options *)
115
108
(*****************************************************************************)
116
109
117
110
(* coupling: semgrep_output_v1.atd with match_severity
@@ -183,7 +176,7 @@ type language = [
183
176
]
184
177
185
178
type paths = {
186
- (* CHECK: at least one of this field is set *)
179
+ (* CHECK: at least one of those fields must be set *)
187
180
?include_ <json name="include">: glob list option;
188
181
?exclude_ <json name="exclude">: glob list option;
189
182
}
@@ -196,7 +189,7 @@ type fix_regex = {
196
189
197
190
(* coupling: Rule_options.atd
198
191
* alt: <ocaml from="Rule_options" t="t"> but I prefer to repeat
199
- * its content here so one can fully see the syntax for a rule in one file.
192
+ * its content here so one can fully see the syntax of a rule in one file.
200
193
*)
201
194
type rule_options = {
202
195
?constant_propagation: bool option;
@@ -256,7 +249,19 @@ type generic_comment_style = [
256
249
(* Formula *)
257
250
(*****************************************************************************)
258
251
259
- (* 'formula' below is handled by a <json adapter.ocaml=...> because there is no
252
+ (* For more information on the new syntax for patterns, see:
253
+ * - Brandon's community Slack post announcing the new syntax
254
+ * https://semgrep.slack.com/archives/C018NJRRCJ0/p1698430726062769?thread_ts=1698350734.415849&cid=C018NJRRCJ0
255
+ * https://www.notion.so/semgrep/New-Rule-Syntax-Summary-f0bc252585f944a7b430294a88ae83a2
256
+ * https://www.notion.so/semgrep/Rule-Syntax-2-0-cf8fdaf20992472881b64b6db188a78b
257
+ * - Brandon's slides
258
+ * https://docs.google.com/presentation/d/1zzmyFbfNlJqweyzuuFlo4zpSs3Gqhfi6FiNRONSEQ0E/edit#slide=id.g1eee710cdbf_0_26
259
+ * - Pieter's video
260
+ * https://www.youtube.com/watch?v=dZUPjFvknnI
261
+ * - Parsia's blog post
262
+ * https://parsiya.net/blog/2023-10-28-semgreps-experimental-rule-syntax/
263
+ *
264
+ * 'formula' below is handled by a <json adapter.ocaml=...> because there is no
260
265
* way to encode directly using ATD the way we chose to represent formulas
261
266
* in YAML/JSON.
262
267
*
@@ -279,52 +284,42 @@ type generic_comment_style = [
279
284
* CHECK: not/inside/anywhere can appear only inside an all:
280
285
*)
281
286
type formula = {
282
- (* either directly a string or pattern: string in the JSON *)
287
+ (* either directly a string or ' pattern: string' in the JSON *)
283
288
?pattern: pattern option;
284
- (* regex can also be entered with pattern: xxx when languages: [regex] *)
289
+ (* regex can also be entered with ' pattern: xxx' when languages: [regex] *)
285
290
?regex: regex option;
286
291
287
- (* Boolean opeators. alt: we could have chosen and: and or: *)
292
+ (* Boolean operators. alt: we could have chosen and/or instead of all/any *)
288
293
?all: formula list option;
289
294
?any: formula list option;
290
295
?not: formula option;
291
296
292
297
(* later: we should remove with a better range logic *)
293
298
?inside: formula option;
294
- (* NEW: since 1.49. alt: in condition instead as in 'where: - also: ...' *)
299
+ (* NEW: since 1.49. alt: in condition instead like 'where: - also: ...' *)
295
300
?anywhere: formula option;
296
- (* TODO? ?taint: taint *)
301
+ (* TODO? ?taint: taint option; and ?steps: ? *)
297
302
298
303
?where: condition list option;
299
304
}
300
305
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">
301
306
302
307
(* This string must be a valid Semgrep pattern for the first language
303
- * specified in the languages: list in the rule.
308
+ * specified in the ' languages:' list in the rule.
304
309
*)
305
310
type pattern = string
306
311
307
312
(* Just like for formula, we're using an adapter to transform
308
313
* conditions in YAML like:
309
- *
310
314
* where:
311
315
* - metavariable: $X
312
316
* regex: $Z
313
- *
314
317
* which when turned into JSON gives:
315
- *
316
- * { where: [
317
- * { metavariable: $X,
318
- * regex: $Z
319
- * }
320
- * ] }
321
- *
318
+ * { where:
319
+ * [ { metavariable: $X, regex: $Z } ]
320
+ * }
322
321
* which we must transform into an ATD-compliant form:
323
- *
324
- * [ ["M", [{ metavariable: $X,
325
- * regex: $Z
326
- * }]
327
- * ]]
322
+ * ["M", [{ metavariable: $X, regex: $Z }]]
328
323
*)
329
324
type condition = [
330
325
| Focus <json name="F"> of focus
@@ -333,26 +328,36 @@ type condition = [
333
328
]
334
329
<json adapter.ocaml="Rule_schema_v2_adapter.Condition">
335
330
331
+ (* --------------------------- *)
332
+ (* Focus condition *)
333
+ (* --------------------------- *)
334
+
336
335
type focus = {
337
- (* either a single string or an array in JSON, that is
338
- * {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}
339
- *)
336
+ (* either directly a string or a list of strings in the JSON *)
340
337
focus: mvar list;
341
338
}
342
339
343
340
type mvar = string
344
341
345
- (* comparison expression with metavariables (currently using a Python-like
346
- * syntax), ex: $X > 100
347
- *)
348
- type comparison_expr = string
342
+ (* --------------------------- *)
343
+ (* Comparison condition *)
344
+ (* --------------------------- *)
349
345
350
346
type comparison = {
351
347
comparison: comparison_expr;
352
348
?base: int option;
353
349
?strip: bool option;
354
350
}
355
351
352
+ (* comparison expression with metavariables, ex: $X > 100
353
+ * (currently using a Python-like syntax)
354
+ *)
355
+ type comparison_expr = string
356
+
357
+ (* --------------------------- *)
358
+ (* Metavariable condition *)
359
+ (* --------------------------- *)
360
+
356
361
type metavariable_cond = {
357
362
metavariable: mvar;
358
363
@@ -422,7 +427,6 @@ type by_side_effect = [
422
427
]
423
428
<json adapter.ocaml="Rule_schema_v2_adapter.BySideEffect">
424
429
425
-
426
430
(* --------------------------- *)
427
431
(* Source *)
428
432
(* --------------------------- *)
@@ -514,8 +518,10 @@ type semver_range = string
514
518
(* Extract *)
515
519
(*****************************************************************************)
516
520
521
+ (* TODO: this syntax is actually not even supported yet in Parse_rule.ml *)
517
522
type extract = {
518
523
metavariable: mvar;
524
+
519
525
?dest_language <json name="dest-language">: language option;
520
526
?dest_rules <json name="dest-rules">: dest_rules option;
521
527
(* map-reduce! *)
@@ -524,7 +530,7 @@ type extract = {
524
530
}
525
531
526
532
type dest_rules = {
527
- (* CHECK: at least one of those options is set *)
533
+ (* CHECK: at least one of those options must be set *)
528
534
?exclude_ <json name="exclude">: rule_id list option;
529
535
?include_ <json name="include">: rule_id list option;
530
536
}
@@ -540,9 +546,91 @@ type extract_transform = [
540
546
| ConcatJsonStringArray <json name="concat_json_string_array">
541
547
]
542
548
(*****************************************************************************)
543
- (* TODO: Secrets *)
549
+ (* Secrets *)
544
550
(*****************************************************************************)
545
551
552
+ (* See https://www.notion.so/semgrep/Postprocessor-Syntax-v1-0-b1481ce32ab8454a8066a1e767cd870a *)
553
+ type validator = {
554
+ http: http_validator;
555
+ (* LATER: ?ftp:, ?imap:, ... *)
556
+ }
557
+
558
+ type http_validator = {
559
+ request: http_request;
560
+ response: http_response_matcher list;
561
+ }
562
+
563
+ type headers = (string * header_pattern) list
564
+ <json repr="object">
565
+
566
+ (* can contain metavariables, ex: 'Bearer $X' *)
567
+ type header_pattern = string
568
+
569
+ (* --------------------------- *)
570
+ (* Request *)
571
+ (* --------------------------- *)
572
+
573
+ type http_request = {
574
+ url: url;
575
+ method_ <json name="method">: http_method;
576
+ headers: headers;
577
+ ?auth: auth option;
578
+ ?body: string option;
579
+ }
580
+
581
+ type http_method = [
582
+ | GET
583
+ | POST
584
+ | DELETE
585
+ | HEAD
586
+ | PUT
587
+ ]
588
+
589
+ (* TODO? type_ <json name="type">: auth_kind; *)
590
+ type auth = raw_json
591
+
592
+ (* --------------------------- *)
593
+ (* Response *)
594
+ (* --------------------------- *)
595
+
596
+ (* alt: could have shortcuts like: 'Valid(status=[200, 403])', like we do
597
+ * in our jsonnet secret rules, instead of the currently more verbose
598
+ * { match: [{ status_code: 200}, {status_code: 403}], result: valid }
599
+ * but how would we get a valid JSON syntax for 'Valid(status=[200, 403])'?
600
+ *)
601
+ type http_response_matcher = {
602
+ match_ <json name="match">: match_ list;
603
+ result: result;
604
+ }
605
+
606
+ type match_ = {
607
+ (* CHECK: at least one of status-code:/headers:/content: must be set *)
608
+ ?status_code <json name="status-code">: int option;
609
+ (* note that this time it's a list of headers! *)
610
+ ?headers: headers list option;
611
+ ?content: content option;
612
+ }
613
+
614
+ type content = {
615
+ inherit formula;
616
+ ?language: language option;
617
+ }
618
+ <json adapter.ocaml="Rule_schema_v2_adapter.Formula">
619
+
620
+ (* STRICTER: note that we are more complete than rule_schema_v1.yml here *)
621
+ type result = {
622
+ validity: validity;
623
+ (* overriding the rule fields *)
624
+ ?severity: severity option;
625
+ ?metadata: raw_json option;
626
+ ?message: string option;
627
+ }
628
+
629
+ type validity = [
630
+ | Valid <json name="valid">
631
+ | Invalid <json name="invalid">
632
+ ]
633
+
546
634
(*****************************************************************************)
547
635
(* Toplevel *)
548
636
(*****************************************************************************)
0 commit comments