Skip to content

Commit

Permalink
New input type for semgrep-core allows taking scanning roots instead …
Browse files Browse the repository at this point in the history
…of target files (#337)

This is used by
semgrep/semgrep-proprietary#2878.

- [x] I ran `make setup && make` to update the generated code after
editing a `.atd` file (TODO: have a CI check)
- [x] I made sure we're still backward compatible with old versions of
the CLI.
For example, the Semgrep backend still needs to be able to *consume* data
	  generated by Semgrep 1.50.0.
See
https://atd.readthedocs.io/en/latest/atdgen-tutorial.html#smooth-protocol-upgrades
	  Note that the types related to the semgrep-core JSON output or the
	  semgrep-core RPC do not need to be backward compatible!
  • Loading branch information
mjambon authored Jan 22, 2025
1 parent a8fc7ca commit 5e0c767
Show file tree
Hide file tree
Showing 8 changed files with 1,650 additions and 92 deletions.
37 changes: 26 additions & 11 deletions scripts/check-backwards-compatibility
Original file line number Diff line number Diff line change
Expand Up @@ -34,34 +34,49 @@ for tag in $tags; do

set +e # do our own error handling for a bit
echo "Checking backward compatibility of semgrep_output_v1.atd against past version $tag"
# I'm getting an exit code 128 when atddiff returns 3 (as of git 2.43.0),
# contrary to what 'git difftool --help' promises for '--trust-exit-code'.
# I'd report the bug if it was easier. -- Martin
git difftool --trust-exit-code -x 'atddiff --no-locations --backward' -y \

# We don't check for incompatibilities in the pysemgrep/semgrep-core
# interface because these two programs are always distributed together,
# allowing their interface to change freely.
#
# --exit-success: exit 0 even if differences are found.
#
# TODO: add '--ignore ...' so as to produce
# an error if we fail to update the '--types' option when adding
# future new type definitions. This requires atddiff > 2.15.
#
# --ignore core_output,targets,function_call,function_return
#
atddiff_options="--exit-success --no-locations --backward --types ci_scan_complete,ci_scan_complete_response,ci_scan_failure,ci_scan_results,ci_scan_results_response,cli_output,datetime,deployment_response,diff_files,partial_scan_result,scan_config,scan_request,scan_response,tests_result"

git difftool --trust-exit-code -x "atddiff $atddiff_options" -y \
"$tag" "origin/main" -- semgrep_output_v1.atd > before.txt
# The exit code of 'git difftool' is 0 if atddiff's exit code is 0.
# Otherwise it's an unspecified nonzero value (not necessarily the
# same nonzero value that atddiff returned!)
ret=$?
if [ "$ret" -ge 1 ] && [ "$ret" -le 2 ]; then
echo "ERROR: atddiff had an error: $ret"
if [[ "$ret" -ne 0 ]]; then
echo "ERROR: atddiff had an error: git difftool exit $ret"
cat before.txt
exit 1
fi
git difftool --trust-exit-code -x 'atddiff --no-locations --backward' -y \
git difftool --trust-exit-code -x "atddiff $atddiff_options" -y \
"$tag" "HEAD" -- semgrep_output_v1.atd > after.txt
ret=$?
if [ "$ret" -ge 1 ] && [ "$ret" -le 2 ]; then
echo "ERROR: atddiff had an error: $ret"
if [[ "$ret" -ne 0 ]]; then
echo "ERROR: atddiff had an error: git difftool exit $ret"
cat after.txt
exit 1
fi

diff -u before.txt after.txt
if [ "$?" -ne 0 ]; then
if [[ "$?" -ne 0 ]]; then
echo "ERROR: semgrep_output_v1.atd is not backward compatible with $tag"
errors=$((errors + 1))
fi
set -e
done

if [ "$errors" -ne 0 ]; then
if [[ "$errors" -ne 0 ]]; then
exit 1
fi
53 changes: 50 additions & 3 deletions semgrep_output_v1.atd
Original file line number Diff line number Diff line change
Expand Up @@ -1864,8 +1864,45 @@ type core_error <python decorator="dataclass(frozen=True)"> = {
those different files (because ATD does not have a proper module system yet).
*)

type analyzer <ocaml attr="deriving show"> =
string wrap <ocaml module="Analyzer">
(*
A project root, given either as a path on the filesystem or as the URL
of a git remote.

See Scan_CLI.ml on how to convert command-line options to this
*)
type project_root <ocaml attr="deriving show"> = [
(* path *)
| Filesystem of string
(* URL *)
| Git_remote of string
]

(*
This type is similar to the type Find_targets.conf used by osemgrep.

We could share the type but it would be slightly more complicated.
This solution will be easier to undo when we're fully migrated to osemgrep.

It encodes options derived from the pysemgrep command line.
Upon receiving this record, semgrep-core will discover the target
files like osemgrep does.

- See Find_targets.mli for the meaning of each field.
- See Scan_CLI.ml for the mapping between semgrep CLI and this type.
*)
type targeting_conf <ocaml attr="deriving show"> = {
exclude : string list;
?include_ : string list option;
max_target_bytes : int;
respect_gitignore : bool;
respect_semgrepignore_files : bool;
always_select_explicit_targets : bool;
(* This is a hash table in Find_targets.conf: *)
explicit_targets : string list;
(* osemgrep-only: option (see Git_project.ml and the force_root parameter) *)
?force_project_root : project_root option;
force_novcs_project : bool;
exclude_minified_files : bool;
?baseline_commit : string option;
diff_depth : int;
}

type analyzer <ocaml attr="deriving show"> = string wrap <ocaml module="Analyzer">

(* A target can either be a traditional code target (now with optional
associated lockfile) or it can be a lockfile target, which will be used to
Expand Down Expand Up @@ -1895,6 +1932,11 @@ type code_target <ocaml attr="deriving show"> = {
?lockfile_target: lockfile option;
}

(* The paths used to discover targets, together with the targeting
options that semgrep-core uses to discover the target files itself. *)
type scanning_roots <ocaml attr="deriving show"> = {
root_paths: fpath list;
targeting_conf: targeting_conf;
}

(* The same path can be present multiple times in targets below, with
* different languages each time, so a Python file can be analyzed both
* with Python rules and with generic/regexp rules.
Expand All @@ -1904,7 +1946,12 @@ type code_target <ocaml attr="deriving show"> = {
* you could have at most one PL language, and then possibly
* "generic" and "regexp".
*)
type targets <ocaml attr="deriving show"> = target list
type targets <ocaml attr="deriving show"> = [
(* list of paths used to discover targets *)
| Scanning_roots of scanning_roots
(* targets already discovered from the scanning roots by pysemgrep *)
| Targets of target list
]

(*****************************************************************************)
(* Python -> OCaml RPC typedefs *)
Expand Down
75 changes: 73 additions & 2 deletions semgrep_output_v1.jsonschema

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 21 additions & 1 deletion semgrep_output_v1.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 5e0c767

Please sign in to comment.