Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvement to rule_schema_v2.atd #186

Merged
merged 4 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 141 additions & 44 deletions rule_schema_v2.atd
Original file line number Diff line number Diff line change
@@ -1,14 +1,29 @@
(* New Semgrep syntax (hence the v2) specified using ATD instead of jsonschema.
*
* For more information on the new syntax, see:
* - Brandon's community Slack post announcing the new syntax
* https://semgrep.slack.com/archives/C018NJRRCJ0/p1698430726062769?thread_ts=1698350734.415849&cid=C018NJRRCJ0
* - Brandon's slides
* https://docs.google.com/presentation/d/1zzmyFbfNlJqweyzuuFlo4zpSs3Gqhfi6FiNRONSEQ0E/edit#slide=id.g1eee710cdbf_0_26
* - Pieter's video
* https://www.youtube.com/watch?v=dZUPjFvknnI
* - Parsia's blog post
* https://parsiya.net/blog/2023-10-28-semgreps-experimental-rule-syntax/
*
* Note that even if most Semgrep users use YAML to write a rule, and not JSON,
* we still use a JSON tool (here ATD, but also jsonschema) to specify
* the rule schema because YAML is a superset of JSON and can be
* mechanically translated into JSON.
* mechanically translated into JSON; there is no yamlschema
* (see https://json-schema-everywhere.github.io/yaml). To add even more
* confusion, a jsonschema can actually be specified using YAML (like in
* rule_schema_v1.yaml), and so one can use YAML syntax to specify the
* JSON schema of files actually written in YAML (hmmm).
*
* Jsonschema is powerful but also arguably complicated and so it
* might be simpler for many Semgrep developers (and also some Semgrep
* users) to use ATD to specify and understand the schema of a rule.
* It could provide a better basis to think about future syntax extensions.
*
* This file is now also used for some rule validation in
* `semgrep --validate --develop`.
*
Expand All @@ -23,37 +38,52 @@
* - steps (and join?)
*)

(*****************************************************************************)
(* Basic types and string aliases *)
(*****************************************************************************)

(* Escape hatch: an opaque JSON value, mapped to the OCaml type
 * Yojson.Basic.t. Used for parts of a rule whose structure is not
 * described by this schema (ex: the 'metadata' field of a rule).
 *)
type raw_json <ocaml module="Yojson.Basic" t="t"> = abstract

(* A glob pattern given as a plain string, ex: "*.c".
 * Used by the include/exclude lists of the 'paths' type.
 *)
type glob = string

(* A regular expression given as a plain string, ex: "[a-zA-Z_]*\\.c".
 * NOTE(review): the regex dialect is not specified in this file; confirm
 * against the engine that consumes it.
 *)
type regex = string


(*****************************************************************************)
(* The rule *)
(*****************************************************************************)

type rule = {
id: rule_id;

message: string;
severity: severity;

(* TODO: selector vs analyzer *)
languages: language list;

(* at least one of those must be set *)
?match_ <json name="match">: formula option;
?taint: taint_spec option;
?extract: extract option;
(* TODO: join, steps, secrets, sca *)

~mode <ocaml default="`Search">: mode;
(* TODO: product: product *)

(* TODO? could be replaced by a pattern-filename: *)
?paths: paths option;

?fix: string option;
?fix_regex: fix_regex option;

?metadata: raw_json option;
?options: options option;

?version: version option;
(* since ?? *)
?min_version: version option;
?max_version: version option;

Expand All @@ -76,20 +106,22 @@ type mode = [
]

(*****************************************************************************)
(* Extra fields *)
(* Types of rule fields *)
(*****************************************************************************)

(* Severity of a rule. Each case is written in uppercase in the JSON
 * (ex: "ERROR" for Error).
 * coupling: semgrep_output_v1.atd with match_severity
 *)
type severity = [
| Error <json name="ERROR">
| Warning <json name="WARNING">
| Info <json name="INFO">
(* should not be used *)
| Experiment <json name="EXPERIMENT">
| Inventory <json name="INVENTORY">
]

(* coupling: language.ml *)
type language = [
(* programming (and configuration) languages *)
| Apex <json name="apex">
| Bash <json name="bash">
| Sh <json name="sh">
Expand Down Expand Up @@ -126,8 +158,6 @@ type language = [
| Py <json name="py">
| Python <json name="python">
| R <json name="r">
| Regex <json name="regex">
| None <json name="none">
| Ruby <json name="ruby">
| Rust <json name="rust">
| Scala <json name="scala">
Expand All @@ -142,17 +172,17 @@ type language = [
| Typescript <json name="typescript">
| Vue <json name="vue">
| Yaml <json name="yaml">

(* not regular programming languages *)
| Regex <json name="regex">
| None <json name="none">
]

(* Path filters attached to a rule, expressed as lists of globs.
 * Both fields use '~', so each may be omitted in the JSON and then
 * defaults to the empty list. The 'include_' field is spelled "include"
 * in the JSON (the trailing underscore only exists on the OCaml side).
 *)
type paths = {
~include_ <json name="include">: glob list;
~exclude: glob list;
}

type glob = string

type regex = string

type fix_regex = {
regex: regex;
replacement: string;
Expand All @@ -175,61 +205,106 @@ type options <ocaml from="Rule_options" t="t"> = abstract
* - and:
* - "foo"
* - pattern: "bar"
* where:
* - bla
*
* which when turned into JSON gives:
*
* { any: [
* { and: [
* "foo",
* {pattern: "bar" }
* ] }
* ] }
* "foo",
* {pattern: "bar" }
* ],
* where: [ bla ],
* }
* ]
* }
*
* The ATD way would be to encode a formula as
*
* ["Any", [
* ["And", [
* ["Pattern", "foo"],
* ["Pattern", "bar"],
* {f: ["Any", [
* {f: ["And", [
* {f: ["Pattern", "foo"]},
* {f: ["Pattern", "bar"]},
* ]],
* where: [bla]
* }
* ]]
* ]]
* }
*
* So we need rule_schema_v2_adapter.ml used below to transform the first JSON
* into the second dynamically at parsing time, so that ATD can then parse it
* using the formula type specified below.
*
* old: this type was called new-pattern in rule_schema_v1.yaml but formula in
* Rule.ml
*)
type formula = [

(* A formula: one operator ('f') plus an optional list of 'where'
 * conditions. The user-facing JSON is rewritten into this record shape
 * by the Rule_schema_v2_adapter.Formula adapter before ATD reads it.
 *)
type formula = {
(* alt: have ?all: ... ?any: ... ?regex: ... ?pattern: ... with a check
 * at parsing time that only one of those fields is given.
 *)
f: formula_bis;
(* alt: we could instead do 'All of formula list * condition list' below
 * but syntactically we also allow 'where' with pattern:, regex:, etc.
 * as in
 * { pattern: ..., where: ..., }
 * Even though internally in Rule.ml a { pattern: X, where: Y}
 * is transformed into an All [pattern: X, Y], in the syntax
 * we allow it in more places.
 *)
~where: condition list;
}
<json adapter.ocaml="Rule_schema_v2_adapter.Formula">

(* old: this type was called new-pattern in rule_schema_v1.yaml but formula in
* Rule.ml
*)
type formula_bis = [
(* either directly a string or pattern: string in the JSON *)
| Pattern <json name="pattern"> of string
| Regex <json name="regex"> of regex
(* 'All of conjunction' in Rule.ml *)
| All <json name="all"> of (formula list * where list)
(* 'And of conjunction' in Rule.ml *)
| All <json name="all"> of formula list
| Any <json name="any"> of formula list
(* Not and Inside can appear only inside an All *)
| Not <json name="not"> of formula
| Inside <json name="inside"> of formula
(* TODO? Taint of taint_spec *)
] <json adapter.ocaml="Rule_schema_v2_adapter.Formula">
]

(* In rule_schema_v1.yaml the 'where:' can be attached to all:
* as in:
* { all: ..., where: ...,}
* but also
* { pattern: ...,where: ..., }
(* Just like for formula, we're using an adapter to transform
* conditions in YAML like:
*
* where:
* - metavariable: $X
* regex: $Z
*
* which when turned into JSON gives:
*
* { where: [
* { metavariable: $X,
* regex: $Z
* }
* ] }
*
* which we must transform into the ATD-compliant:
*
* Internally though, the second one is translated into an And.
* [ ["M", [{ metavariable: $X,
* c: ["regex", $Z]
* }]
* ]]
*)
type where = [
(* either a single string or an array in JSON, that is
* {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}
*)
| Focus of mvar list
| Comparison of comparison
| Metavariable of (mvar * metavariable_cond)
type condition = [
| Focus <json name="F"> of focus
| Comparison <json name="C"> of comparison
| Metavariable <json name="M"> of metavariable_cond
]
<json adapter.ocaml="Rule_schema_v2_adapter.Where">
<json adapter.ocaml="Rule_schema_v2_adapter.Condition">

(* Payload of the Focus case of 'condition': the metavariables to focus on.
 * In the user-facing JSON this is either a single string or an array,
 * that is {focus: "$FOO"}, but also {focus: ["$FOO", "$BAR"]}.
 * This type itself only accepts the list form.
 *)
type focus = {
focus: mvar list;
}

(* A metavariable name given as a plain string, ex: "$X" *)
type mvar = string

Expand All @@ -239,11 +314,27 @@ type comparison = {
~strip: bool;
}

type metavariable_cond = [
| Type of string
(* TODO: for metavariable-regex, can also enable constant_propagation *)
| Formula of formula
| Analyzer of string
(* A condition on one metavariable: the metavariable it applies to and
 * the actual check ('c'). This is the payload of the Metavariable case
 * of 'condition'.
 *)
type metavariable_cond = {
metavariable: mvar;
(* alt: have ?type: ... ?types:... ?regex: ... *)
c: metavariable_cond_bis;
}

(* The kind of check applied to a metavariable (field 'c' of
 * metavariable_cond). Each case is encoded in JSON as [name, payload],
 * using the json name given on the case.
 *)
type metavariable_cond_bis = [
| Type <json name="type"> of string
| Types <json name="types"> of string list
(* alt: we could remove Regex, since a formula can itself be a Regex
 * TODO: for metavariable-regex, can also enable constant_propagation
 *)
| Regex <json name="regex"> of regex
(* TODO: accept also language: string *)
| Formula <json name="F"> of formula
| Analyzer <json name="analyzer"> of analyzer
]

(* Analyzers that can be named in the Analyzer case of
 * metavariable_cond_bis; written in lowercase in the JSON.
 *)
type analyzer = [
| Entropy <json name="entropy">
| Redos <json name="redos">
]

(*****************************************************************************)
Expand All @@ -264,4 +355,10 @@ type extract = raw_json

(* A list of rules, plus optional information sent along with it by
 * the registry.
 *)
type rules = {
rules: rule list;

(* Missed count of pro rules when not logged-in.
 * Sent by the registry to the CLI since 1.48.
 * See https://github.com/semgrep/semgrep-app/pull/11142
 *)
?missed: int option;
}
38 changes: 33 additions & 5 deletions rule_schema_v2_adapter.ml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@ module Formula = struct
let normalize (orig : Yojson.Safe.t ) : Yojson.Safe.t =
match orig with
| `String str ->
`List [`String "pattern"; `String str]
`Assoc ["f", `List [`String "pattern"; `String str]]
| `Assoc [(key , elt)] ->
`Assoc ["f", `List [`String key; elt]]
| `Assoc [(key , elt); ("where", stuff)] ->
`Assoc [
("f", `List [`String key; elt]);
("where", stuff)]
| x -> x

(** Convert from ATD-compatible json to original json *)
Expand All @@ -17,14 +23,36 @@ module Formula = struct
failwith "Rule_schema_v2_adapter.Formula.restore not implemented"
end

module Where = struct
module Condition = struct

(** Convert from original json to ATD-compatible json *)
let normalize (_orig : Yojson.Safe.t ) : Yojson.Safe.t =
failwith "TODO: Where.normalize"
(** Convert one condition from the original (user-facing) JSON to the
    ATD-compatible JSON expected by the condition type:

    - an object whose first key is "comparison" is wrapped as ["C", object];
    - an object with the single key "focus" is wrapped as ["F", object],
      and a focus given as a single string is first turned into a
      one-element list, since the focus type only accepts a list;
    - an object with exactly the key "metavariable" and one of "regex",
      "type" or "types" (in either order) becomes
      ["M", {metavariable: M, c: [key, value]}];
    - the same with "pattern" becomes
      ["M", {metavariable: M, c: ["F", {pattern: value}]}].

    Any other JSON value is returned unchanged. *)
let normalize (orig : Yojson.Safe.t ) : Yojson.Safe.t =
  (* ATD encodes a variant case with an argument as [tag, payload] *)
  let tagged tag (payload : Yojson.Safe.t) : Yojson.Safe.t =
    `List [`String tag; payload]
  in
  let metavariable_cond mvar c =
    tagged "M" (`Assoc [("metavariable", mvar); ("c", c)])
  in
  match orig with
  | `Assoc (("comparison", _) :: _) -> tagged "C" orig
  (* focus: "$X" is allowed in the syntax but the ATD type wants a list *)
  | `Assoc [("focus", (`String _ as mvar))] ->
      tagged "F" (`Assoc [("focus", `List [mvar])])
  | `Assoc [("focus", `List _)] -> tagged "F" orig
  (* JSON objects are unordered, so accept both key orders *)
  | `Assoc [("metavariable", mvar); (key, payload)]
  | `Assoc [(key, payload); ("metavariable", mvar)] ->
      (match key with
       | "regex" | "type" | "types" ->
           metavariable_cond mvar (tagged key payload)
       | "pattern" ->
           (* the nested object is itself normalized by the Formula adapter *)
           metavariable_cond mvar (tagged "F" (`Assoc [(key, payload)]))
       | _ -> orig)
  | _ -> orig


(** Convert from ATD-compatible json to original json *)
let restore (_atd : Yojson.Safe.t) : Yojson.Safe.t =
(* not needed for now; we care just about parsing *)
failwith "Rule_schema_v2_adapter.Where.restore not implemented"
failwith "Rule_schema_v2_adapter.Condition.restore not implemented"
end