Skip to content

Commit 5e0c767

Browse files
authored
New input type for semgrep-core allows taking scanning roots instead of target files (#337)
This is used by semgrep/semgrep-proprietary#2878. - [x] I ran `make setup && make` to update the generated code after editing a `.atd` file (TODO: have a CI check) - [x] I made sure we're still backward compatible with old versions of the CLI. For example, the Semgrep backend still needs to be able to *consume* data generated by Semgrep 1.50.0. See https://atd.readthedocs.io/en/latest/atdgen-tutorial.html#smooth-protocol-upgrades Note that the types related to the semgrep-core JSON output or the semgrep-core RPC do not need to be backward compatible!
1 parent a8fc7ca commit 5e0c767

8 files changed

+1650
-92
lines changed

scripts/check-backwards-compatibility

+26-11
Original file line numberDiff line numberDiff line change
@@ -34,34 +34,49 @@ for tag in $tags; do
3434

3535
set +e # do our own error handling for a bit
3636
echo "Checking backward compatibility of semgrep_output_v1.atd against past version $tag"
37-
# I'm getting an exit code 128 when atddiff returns 3 (as of git 2.43.0),
38-
# contrary to what 'git difftool --help' promises for '--trust-exit-code'.
39-
# I'd report the bug if it was easier. -- Martin
40-
git difftool --trust-exit-code -x 'atddiff --no-locations --backward' -y \
37+
38+
# We don't check for incompatibilities in the pysemgrep/semgrep-core
39+
# interface because these two programs are always distributed together,
40+
# allowing their interface to change freely.
41+
#
42+
# --exit-success: exit 0 even if differences are found.
43+
#
44+
# TODO: add '--ignore ...' so as to produce
45+
# an error if we fail to update the '--types' option when adding
46+
# future new type definitions. This requires atddiff > 2.15.
47+
#
48+
# --ignore core_output,targets,function_call,function_return
49+
#
50+
atddiff_options="--exit-success --no-locations --backward --types ci_scan_complete,ci_scan_complete_response,ci_scan_failure,ci_scan_results,ci_scan_results_response,cli_output,datetime,deployment_response,diff_files,partial_scan_result,scan_config,scan_request,scan_response,tests_result"
51+
52+
git difftool --trust-exit-code -x "atddiff $atddiff_options" -y \
4153
"$tag" "origin/main" -- semgrep_output_v1.atd > before.txt
54+
# The exit code is 0 if atddiff's exit code is 0, and it's an
55+
# unspecified nonzero value if atddiff's exit code is not 0 (but
56+
# not necessarily the same nonzero value as atddiff's!)
4257
ret=$?
43-
if [ "$ret" -ge 1 ] && [ "$ret" -le 2 ]; then
44-
echo "ERROR: atddiff had an error: $ret"
58+
if [[ "$ret" -ne 0 ]]; then
59+
echo "ERROR: atddiff had an error: git difftool exit $ret"
4560
cat before.txt
4661
exit 1
4762
fi
48-
git difftool --trust-exit-code -x 'atddiff --no-locations --backward' -y \
63+
git difftool --trust-exit-code -x "atddiff $atddiff_options" -y \
4964
"$tag" "HEAD" -- semgrep_output_v1.atd > after.txt
5065
ret=$?
51-
if [ "$ret" -ge 1 ] && [ "$ret" -le 2 ]; then
52-
echo "ERROR: atddiff had an error: $ret"
66+
if [[ "$ret" -ne 0 ]]; then
67+
echo "ERROR: atddiff had an error: git difftool exit $ret"
5368
cat after.txt
5469
exit 1
5570
fi
5671

5772
diff -u before.txt after.txt
58-
if [ "$?" -ne 0 ]; then
73+
if [[ "$?" -ne 0 ]]; then
5974
echo "ERROR: semgrep_output_v1.atd is not backward compatible with $tag"
6075
errors=$((errors + 1))
6176
fi
6277
set -e
6378
done
6479

65-
if [ "$errors" -ne 0 ]; then
80+
if [[ "$errors" -ne 0 ]]; then
6681
exit 1
6782
fi

semgrep_output_v1.atd

+50-3
Original file line numberDiff line numberDiff line change
@@ -1864,8 +1864,45 @@ type core_error <python decorator="dataclass(frozen=True)"> = {
18641864
those different files (because ATD does not have a proper module system yet).
18651865
*)
18661866

1867-
type analyzer <ocaml attr="deriving show"> =
1868-
string wrap <ocaml module="Analyzer">
1867+
(* See Scan_CLI.ml on how to convert command-line options to this *)
1868+
type project_root <ocaml attr="deriving show"> = [
1869+
(* path *)
1870+
| Filesystem of string
1871+
(* URL *)
1872+
| Git_remote of string
1873+
]
1874+
1875+
(*
1876+
This type is similar to the type Find_targets.conf used by osemgrep.
1877+
1878+
We could share the type but it would be slightly more complicated.
1879+
This solution will be easier to undo when we're fully migrated to osemgrep.
1880+
1881+
It encodes options derived from the pysemgrep command line.
1882+
Upon receiving this record, semgrep-core will discover the target
1883+
files like osemgrep does.
1884+
1885+
- See Find_targets.mli for the meaning of each field.
1886+
- See Scan_CLI.ml for the mapping between semgrep CLI and this type.
1887+
*)
1888+
type targeting_conf <ocaml attr="deriving show"> = {
1889+
exclude : string list;
1890+
?include_ : string list option;
1891+
max_target_bytes : int;
1892+
respect_gitignore : bool;
1893+
respect_semgrepignore_files : bool;
1894+
always_select_explicit_targets : bool;
1895+
(* This is a hash table in Find_targets.conf: *)
1896+
explicit_targets : string list;
1897+
(* osemgrep-only: option (see Git_project.ml and the force_root parameter) *)
1898+
?force_project_root : project_root option;
1899+
force_novcs_project : bool;
1900+
exclude_minified_files : bool;
1901+
?baseline_commit : string option;
1902+
diff_depth : int;
1903+
}
1904+
1905+
type analyzer <ocaml attr="deriving show"> = string wrap <ocaml module="Analyzer">
18691906

18701907
(* A target can either be a traditional code target (now with optional
18711908
associated lockfile) or it can be a lockfile target, which will be used to
@@ -1895,6 +1932,11 @@ type code_target <ocaml attr="deriving show"> = {
18951932
?lockfile_target: lockfile option;
18961933
}
18971934

1935+
type scanning_roots <ocaml attr="deriving show"> = {
1936+
root_paths: fpath list;
1937+
targeting_conf: targeting_conf;
1938+
}
1939+
18981940
(* The same path can be present multiple times in targets below, with
18991941
* different languages each time, so a Python file can be both analyzed
19001942
* with Python rules, but also with generic/regexp rules.
@@ -1904,7 +1946,12 @@ type code_target <ocaml attr="deriving show"> = {
19041946
* you could have at most one PL language, and then possibly
19051947
* "generic" and "regexp".
19061948
*)
1907-
type targets <ocaml attr="deriving show"> = target list
1949+
type targets <ocaml attr="deriving show"> = [
1950+
(* list of paths used to discover targets *)
1951+
| Scanning_roots of scanning_roots
1952+
(* targets already discovered from the scanning roots by pysemgrep *)
1953+
| Targets of target list
1954+
]
19081955

19091956
(*****************************************************************************)
19101957
(* Python -> OCaml RPC typedefs *)

semgrep_output_v1.jsonschema

+73-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

semgrep_output_v1.proto

+21-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)