Skip to content

Commit

Permalink
Simplify rule_schema_v2.atd, use less variant (#187)
Browse files Browse the repository at this point in the history
This better matches the syntax.
The goal is not to redo Rule.ml and have clean variant types!
The goal is to specify the syntax.

test plan:
see related PR in semgrep


- [x] I ran `make setup && make` to update the generated code after
editing a `.atd` file (TODO: have a CI check)
- [x] I made sure we're still backward compatible with old versions of
the CLI.
For example, the Semgrep backend still needs to be able to *consume* data
generated
	  by Semgrep 1.17.0.
See
https://atd.readthedocs.io/en/latest/atdgen-tutorial.html#smooth-protocol-upgrades
  • Loading branch information
aryx authored Nov 9, 2023
1 parent ad4eb72 commit e655eef
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 135 deletions.
18 changes: 13 additions & 5 deletions rule_schema_v1.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
# Specification of the Semgrep rule YAML syntax, using JSON schema to specify the syntax.
# Specification of the Semgrep rule YAML syntax using JSON schema.
#
# Note that even if most Semgrep users use YAML to write a rule, and not JSON,
# we still use a JSON tool (jsonschema) to specify the rule schema because
# YAML is a superset of JSON and can be mechanically translated into JSON;
# there is no yamlschema (see https://json-schema-everywhere.github.io/yaml).
# To add even more confusion, a jsonschema can actually be specified using
# YAML (like in this file), and so one can use YAML syntax to specify the
# JSON schema of files actually written in YAML (hmmm).

$id: https://raw.githubusercontent.com/returntocorp/semgrep-interfaces/main/rule_schema_v1.yaml
$schema: http://json-schema.org/draft-07/schema#
#!!If you modify this file, you need to update the submodule in returntocorp/semgrep
# and returntocorp/semgrep-app!!
#!!If you add new syntax to this file, you probably need to add some EXPERIMENTAL
# comment before!!
#!!If you modify this file, update the submodules in semgrep and semgrep-app!!
#!!If you add new syntax to this file, you probably need to add some
# EXPERIMENTAL comment before!!
$defs:
# EXPERIMENTAL
validator:
Expand Down
168 changes: 63 additions & 105 deletions rule_schema_v2.atd
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@
* we still use a JSON tool (here ATD, but also jsonschema) to specify
* the rule schema because YAML is a superset of JSON and can be
* mechanically translated into JSON; there is no yamlschema
* (see https://json-schema-everywhere.github.io/yaml). To add even more
* confusion, a jsonschema can actually be specified using YAML (like in
* rule_shema_v1.yml), and so one can use YAML syntax to specify the
* JSON schema of files actually written in YAML (hmmm).
* (see https://json-schema-everywhere.github.io/yaml).
*
* Jsonschema is powerful but also arguably complicated and so it
* might be simpler for many Semgrep developers (and also some Semgrep
Expand All @@ -30,12 +27,18 @@
* Note that this file does not replace Parse_rule.ml nor Rule.ml. We still
* want to accept the old syntax in Parse_rule.ml and also parse with
* position information and error recovery which ATD does not provide.
* This file does not (yet) replace rule_schema_v1.yaml either, which is
* more complete.
*
* TODO:
* - taint
* - extract
* - r2c-internal-project-depends-on-content
* - secrets
* - generalized taint
* - steps (and join?)
* - generalized taint
* - new metavariable types
* - new 'anywhere:'
*)

(*****************************************************************************)
Expand All @@ -51,7 +54,6 @@ type glob = string
(* ex: "[a-zA-Z_]*\\.c" *)
type regex = string


(*****************************************************************************)
(* The rule *)
(*****************************************************************************)
Expand All @@ -65,14 +67,13 @@ type rule = {
(* TODO: selector vs analyzer *)
languages: language list;

(* at least one of those must be set *)
(* CHECK: exactly one of those fields must be set *)
?match_ <json name="match">: formula option;
?taint: taint_spec option;
?extract: extract option;
(* TODO: join, steps, secrets, sca *)

~mode <ocaml default="`Search">: mode;
(* TODO: product: product *)
(* TODO? product: product; *)

(* TODO? could be replaced by a pattern-filename: *)
?paths: paths option;
Expand All @@ -81,7 +82,7 @@ type rule = {
?fix_regex: fix_regex option;

?metadata: raw_json option;
?options: options option;
?options: rule_options option;

?version: version option;
?min_version: version option;
Expand All @@ -96,15 +97,6 @@ type rule_id = string wrap <ocaml module="Rule_ID">
(* Version_info.t *)
type version = string (* TODO wrap <ocaml module="ATDStringWrap.Version"> *)

type mode = [
| Search <json name="search">
| Taint <json name="taint">
| Join <json name="join">
| Extract <json name="extract">
| SemgrepInternalPostprocessor <json name="semgrep_internal_postprocessor">
(* TODO: Steps, SCA? *)
]

(*****************************************************************************)
(* Types of rule fields *)
(*****************************************************************************)
Expand Down Expand Up @@ -189,87 +181,49 @@ type fix_regex = {
?count: int option;
}

type options <ocaml from="Rule_options" t="t"> = abstract
type rule_options <ocaml from="Rule_options" t="t"> = abstract

(*****************************************************************************)
(* Search mode (default) and formula *)
(* Formula *)
(*****************************************************************************)

(* 'formula' below is handled by a <json adapter.ocaml=...> because there is no
* way to encode directly using ATD the way we chose to represent formulas
* in YAML/JSON. Indeed, because Yaml/JSON does not support Algebraic data
* types (ADTs), we used a weird encoding abusing objects to represent
* formulas, e.g.,
*
* any:
* - and:
* - "foo"
* - pattern: "bar"
* where:
* - bla
*
* which when turned into JSON gives:
*
* { any: [
* { and: [
* "foo",
* {pattern: "bar" }
* ],
* where: [ bla ],
* }
* ]
* }
*
* The ATD way would be to encode a formula as
* in YAML/JSON.
*
* {f: ["Any", [
* {f: ["And", [
* {f: ["Pattern", "foo"]},
* {f: ["Pattern", "bar"]},
* ]],
* where: [bla]
* }
* ]]
* }
* alt: instead of using those ?all, ?regex, and CHECK:, we could use a
* proper variant, but that would require a more complex adapter and the
* distance between the spec and the actual syntax would be even longer.
*
* So we need rule_schema_v2_adapter.ml used below to transform the first JSON
* in the second dynamically at parsing time, so then ATD can parse it
* using the formula type specified below.
*)

* old: this type was called new-pattern in rule_schema_v1.yaml
*)
type formula = {
(* alt: have ?all: ... ?any: ... ?regex: ... ?pattern: ... with a check
* at parsing time that only one of those fields is given.
*)
f: formula_bis;
(* alt: we could instead do 'All of formula list * condition list' below
* but syntactically we also allow 'where' with pattern:, regex:, etc.
* as in
* { pattern: ..., where: ..., }
* Even though internally in Rule.ml a { pattern: X, where: Y}
* is transformed in an All [pattern: X, Y], in the syntax
* we allow it in more places.
(* CHECK: exactly one of those fields must be set *)
(* either directly a string or pattern: string in the JSON *)
?pattern: string option;
?regex: regex option;
?all: formula list option;
?any: formula list option;
(* CHECK: not/inside/anywhere can appear only inside an all: *)
?not: formula option;
?inside: formula option;
?anywhere: formula option;
(* TODO? ?taint: taint_spec *)

(* alt: we could instead do '?all: formula list option * condition list'
* above, but syntactically we also allow 'where' with pattern:, regex:,
* etc. as in:
*
* - pattern: "foo($X)"
* where: ...
*
* In fact that's the main reason we sometimes have to use pattern: string
* instead of a string because where: could not be attached to a string.
*)
~where: condition list;
}
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">

(* old: this type was called new-pattern in rule_schema_v1.yaml but formula in
* Rule.ml
*)
type formula_bis = [
(* either directly a string or pattern: string in the JSON *)
| Pattern <json name="pattern"> of string
| Regex <json name="regex"> of regex
(* 'And of conjunction' in Rule.ml *)
| All <json name="all"> of formula list
| Any <json name="any"> of formula list
(* Not and Inside can appear only inside an All *)
| Not <json name="not"> of formula
| Inside <json name="inside"> of formula
(* TODO? Taint of taint_spec *)
]

(* Just like for formula, we're using an adapter to transform
* conditions in YAML like:
*
Expand All @@ -288,7 +242,7 @@ type formula_bis = [
* which we must transform in an ATD-compliant:
*
* [ ["M", [{ metavariable: $X,
* c: ["regex", $Z]
* regex: $Z
* }]
* ]]
*)
Expand All @@ -299,10 +253,10 @@ type condition = [
]
<json adapter.ocaml="Rule_schema_v2_adapter.Condition">

(* either a single string or an array in JSON, that is
* {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}
*)
type focus = {
(* either a single string or an array in JSON, that is
* {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}
*)
focus: mvar list;
}

Expand All @@ -316,39 +270,43 @@ type comparison = {

type metavariable_cond = {
metavariable: mvar;
(* alt: have ?type: ... ?types:... ?regex: ... *)
c: metavariable_cond_bis;
}

type metavariable_cond_bis = [
| Type <json name="type"> of string
| Types <json name="types"> of string list
(* alt: we could remove Regex as Formula itself as a Regex
(* CHECK: exactly one of those fields must be set *)
?type_ <json name="type">: string option;
?types: string list option;
(* this covers regex:, pattern:, but also any formula.
* TODO: for metavariable-regex, can also enable constant_propagation
* TODO: we should also accept language: string
*)
| Regex <json name="regex"> of regex
(* TODO: accept also language: string *)
| Formula <json name="F"> of formula
| Analyzer <json name="analyzer"> of analyzer
]
inherit formula;
?analyzer: analyzer option;
}

(* Analyzers that can be named in the 'analyzer' entry of a metavariable
 * condition. The JSON spelling of each case is the one given by its
 * <json name=...> annotation.
 *)
type analyzer = [
| Entropy <json name="entropy">
| Redos <json name="redos">
]

(*****************************************************************************)
(* Taint mode *)
(* TODO: Tainting *)
(*****************************************************************************)

type taint_spec = raw_json

(*****************************************************************************)
(* Extract mode *)
(* TODO: SSC *)
(*****************************************************************************)

(*****************************************************************************)
(* TODO: Extract mode *)
(*****************************************************************************)

type extract = raw_json

(*****************************************************************************)
(* TODO: Secrets *)
(*****************************************************************************)

(*****************************************************************************)
(* Toplevel *)
(*****************************************************************************)
Expand All @@ -357,7 +315,7 @@ type rules = {
rules: rule list;

(* Missed count of pro rules when not logged-in.
* Sent by the registry to the CLI since 1.48.
* Sent by the registry to the CLI since 1.49.
* See https://github.com/semgrep/semgrep-app/pull/11142
*)
?missed: int option;
Expand Down
37 changes: 12 additions & 25 deletions rule_schema_v2_adapter.ml
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,8 @@ module Formula = struct
let normalize (orig : Yojson.Safe.t ) : Yojson.Safe.t =
match orig with
| `String str ->
`Assoc ["f", `List [`String "pattern"; `String str]]
| `Assoc [(key , elt)] ->
`Assoc ["f", `List [`String key; elt]]
| `Assoc [(key , elt); ("where", stuff)] ->
`Assoc [
("f", `List [`String key; elt]);
("where", stuff)]
`Assoc ["pattern", `String str]
(* TODO: check at least one of any/all/... is specified *)
| x -> x

(** Convert from ATD-compatible json to original json *)
Expand All @@ -31,25 +26,17 @@ module Condition = struct
| `Assoc (("comparison", cmp)::rest) ->
`List [`String "C";
`Assoc (("comparison", cmp)::rest)]
| `Assoc [("metavariable", mvar); ("regex", reg)] ->
| `Assoc (("metavariable", mvar)::rest) ->
(* TODO: check at least one of type/types/... is specified *)
`List [`String "M";
`Assoc [("metavariable", mvar);
("c", `List [`String "regex"; reg])]]
| `Assoc [("metavariable", mvar); ("type", ty)] ->
`List [`String "M";
`Assoc [("metavariable", mvar);
("c", `List [`String "type"; ty])]]
| `Assoc [("metavariable", mvar); ("types", tys)] ->
`List [`String "M";
`Assoc [("metavariable", mvar);
("c", `List [`String "types"; tys])]]
| `Assoc [("metavariable", mvar); ("pattern", p)] ->
`List [`String "M";
`Assoc [("metavariable", mvar);
("c", `List [`String "F"; `Assoc [("pattern", p)]])]]
| x ->
x

`Assoc (("metavariable", mvar)::rest)]
| `Assoc [("focus", `String x)] ->
`List [`String "F";
`Assoc [("focus", `List [`String x])]]
| `Assoc [("focus", `List x)] ->
`List [`String "F";
`Assoc [("focus", `List x)]]
| x -> x

(** Convert from ATD-compatible json to original json *)
let restore (_atd : Yojson.Safe.t) : Yojson.Safe.t =
Expand Down

0 comments on commit e655eef

Please sign in to comment.