Skip to content

Commit 2a998e7

Browse files
authored
Define new sca_error intermediate type and sca_resolution_error and ScaParseError (#350)
test plan: see related PR in semgrep - [x] I ran `make setup && make` to update the generated code after editing a `.atd` file (TODO: have a CI check) - [x] I made sure we're still backward compatible with old versions of the CLI. For example, the Semgrep backend needs to still be able to *consume* data generated by Semgrep 1.50.0. See https://atd.readthedocs.io/en/latest/atdgen-tutorial.html#smooth-protocol-upgrades Note that the types related to the semgrep-core JSON output or the semgrep-core RPC do not need to be backward compatible!
1 parent 56950a4 commit 2a998e7

7 files changed

+4147
-3558
lines changed

semgrep_output_v1.atd

Lines changed: 90 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
* semgrep_metrics.atd
2222
* - The parsing stats of semgrep-core -parsing_stats -json have its own
2323
* Parsing_stats.atd
24-
* - The schema for the generic AST dump is in AST_generic_v1.atd
2524
* For the definition of the Semgrep input (the rules), see rule_schema_v2.atd
2625
*
2726
* This file has the _v1 suffix to explicitly represent the
@@ -254,7 +253,7 @@ type product
254253
<python decorator="dataclass(frozen=True)"> =
255254
[
256255
| SAST (* a.k.a. Code *) <json name="sast">
257-
| SCA <json name="sca">
256+
| SCA (* a.k.a. SSC *) <json name="sca">
258257
| Secrets <json name="secrets">
259258
]
260259

@@ -533,8 +532,8 @@ type transitivity
533532
*)
534533
| Transitive <json name="transitive">
535534
(* If there is insufficient information to determine the transitivity,
536-
such as a requirements.txt file without a requirements.in manifest,
537-
we leave it Unknown.
535+
* such as a requirements.txt file without a requirements.in manifest,
536+
* we leave it Unknown.
538537
*)
539538
| Unknown <json name="unknown">
540539
]
@@ -549,7 +548,7 @@ type sca_match = {
549548
dependency_match: dependency_match;
550549
(* TODO: deprecate, we should use sca_match_kind instead *)
551550
reachable: bool;
552-
(* EXPERIMENTAL since 1.08.0 *)
551+
(* EXPERIMENTAL since 1.108.0 *)
553552
?kind: sca_match_kind option;
554553
}
555554

@@ -768,7 +767,7 @@ type error_type
768767
| PatternParseError0 <json name="Pattern parse error">
769768
| IncompatibleRule0 <json name="Incompatible rule">
770769
(* since semgrep 1.94.0 *)
771-
| DependencyResolutionError of resolution_error
770+
| DependencyResolutionError of resolution_error_kind
772771
] <ocaml repr="classic">
773772

774773
type incompatible_rule
@@ -867,11 +866,11 @@ type skip_reason <ocaml attr="deriving show"> = [
867866
| Irrelevant_rule <json name="irrelevant_rule">
868867
| Too_many_matches <json name="too_many_matches">
869868
(* New in osemgrep *)
870-
| Gitignore_patterns_match (* TODO: use JSON lowercase for consistency *)
869+
| Gitignore_patterns_match
871870
(* since 1.40.0. These were always ignored, but not shown in the skip report *)
872-
| Dotfile (* TODO: use JSON lowercase for consistency *)
871+
| Dotfile
873872
(* since 1.44.0 *)
874-
| Nonexistent_file (* TODO: use JSON lowercase for consistency *)
873+
| Nonexistent_file
875874
(* since 1.94.0 *)
876875
| Insufficient_permissions <json name="insufficient_permissions">
877876
] <ocaml repr="classic">
@@ -981,7 +980,6 @@ type target_times = {
981980
run_time: float;
982981
}
983982

984-
985983
(*****************************************************************************)
986984
(* Final 'semgrep scan' output *)
987985
(*****************************************************************************)
@@ -1056,7 +1054,6 @@ type tests_result = {
10561054
}
10571055

10581056
type checks = {
1059-
(* would like to use fpath *)
10601057
checks: (string (* rule_id *) * rule_result) list <json repr="object">;
10611058
}
10621059

@@ -1218,8 +1215,8 @@ type killing_parent = {
12181215
* - /api/agent/scans/<scan_id>/complete when done, with the exit code and a
12191216
* few more details, and a response with app_block_override and reason
12201217
*
1221-
* alt: we could move all of this in a separate semgrep_posts_v1.atd file
1222-
* or semgrep_webapp_v1.atd
1218+
* TODO: we should move all of this into a separate semgrep_backend.atd
1219+
* (but need a proper module system for ATD first)
12231220
*)
12241221

12251222
(* ----------------------------- *)
@@ -1230,11 +1227,12 @@ type features = {
12301227
~autofix: bool;
12311228
~deepsemgrep: bool;
12321229
~dependency_query: bool;
1230+
(* a.k.a. dependency path *)
12331231
~path_to_transitivity: bool;
1234-
(* normally we resolve dependencies for changed subprojects only in diff scans. This flag
1235-
* causes all subprojects to be resolved in diff scans *)
1232+
(* normally we resolve dependencies for changed subprojects only in diff
1233+
* scans. This flag causes all subprojects to be resolved in diff scans *)
12361234
~scan_all_deps_in_diff_scan: bool;
1237-
(* Whether to collect "symbol analysis" information from the repo being scanned
1235+
(* Whether to collect "symbol analysis" info from the repo being scanned
12381236
See https://www.notion.so/semgrep/Semgrep-Code-Reconnaissance-Toolbox-18a3009241a880f2a439eed6b2cffe66?pvs=4
12391237
*)
12401238
~symbol_analysis: bool;
@@ -1260,6 +1258,7 @@ type triage_ignored = {
12601258
* Note that the version of the CLI is sent to the WebApp in
12611259
* project_metadata so the backend has all the necessary information to
12621260
* send back an appropriate action depending on the CLI version.
1261+
* TODO: only osemgrep handles that right now
12631262
*)
12641263
type action = [
12651264
| Message of string
@@ -1295,7 +1294,10 @@ type scan_info = {
12951294

12961295
(* config specific to the scan *)
12971296
type scan_configuration = {
1298-
rules: raw_json; (* can we type this better *)
1297+
(* Rules sent from the backend. Note that those rules are in JSON
1298+
* form not YAML (which led to some speedup in pysemgrep)
1299+
* TODO? can we type this better *)
1300+
rules: raw_json;
12991301
inherit triage_ignored;
13001302
}
13011303

@@ -1322,9 +1324,8 @@ type product_ignored_files = (product * glob list) list
13221324
* "keys must be strings" error *)
13231325
<python repr="dict"> <ts repr="map">
13241326

1325-
(* configuration for scanning version control history,
1326-
* e.g., looking back at past git commits for committed credentials which may
1327-
* have been removed *)
1327+
(* configuration for scanning version control history, e.g., looking back at
1328+
* past git commits for committed credentials which may have been removed *)
13281329
type historical_configuration = {
13291330
enabled: bool;
13301331
?lookback_days: int option;
@@ -1425,9 +1426,10 @@ type scan_metadata = {
14251426
cli_version: version;
14261427
unique_id: uuid; (* client generated uuid for the scan *)
14271428
requested_products: product list;
1428-
~dry_run: bool; (* from 1.47.0 *)
1429-
(* since 1.96.0 *)
1429+
(* since 1.47.0 *)
1430+
~dry_run: bool;
14301431
(* unique id associated with the scan in Semgrep Managed Scanning *)
1432+
(* since 1.96.0 *)
14311433
?sms_scan_id: string option;
14321434
}
14331435

@@ -1466,6 +1468,7 @@ type finding = {
14661468
message: string;
14671469
(* int|string until minimum version exceeds 1.32.0. After 1.32.0
14681470
* we're always using an int.
1471+
* TODO: switch to int now that minimum version supported is 1.50.0
14691472
* TODO: should reuse match_severity instead of using an int here.
14701473
* This is what pysemgrep is currently using:
14711474
* Error = 2, Warning = 1, Experiment = 4, otherwise 0
@@ -1542,7 +1545,7 @@ type ci_scan_results = {
15421545
?contributions: contributions option;
15431546
(* since semgrep 1.38.0 *)
15441547
(* this data was originally sent to /complete, but we want to start sending
1545-
* it /results *)
1548+
* it to /results *)
15461549
?dependencies: ci_scan_dependencies option;
15471550
}
15481551

@@ -1585,6 +1588,7 @@ type ci_scan_complete = {
15851588
stats: ci_scan_complete_stats;
15861589
(* TODO: remove when min version is 1.38.0 *)
15871590
?dependencies: ci_scan_dependencies option;
1591+
(* TODO: move the errors into ci_scan_complete_stats.errors instead *)
15881592
?dependency_parser_errors: dependency_parser_error list option;
15891593
(* since 1.31.0 *)
15901594
?task_id: string option;
@@ -1596,10 +1600,12 @@ type ci_scan_complete_stats = {
15961600
errors: cli_error list;
15971601
total_time: float;
15981602

1603+
(* ?? *)
15991604
unsupported_exts: (string * int) list
16001605
<json repr="object">
16011606
<python repr="dict">
16021607
<ts repr="map">;
1608+
(* ?? *)
16031609
lockfile_scan_info: (string * int) list
16041610
<json repr="object">
16051611
<python repr="dict">
@@ -1657,11 +1663,11 @@ type ci_scan_complete_response <ocaml attr="deriving show"> = {
16571663
(* ----------------------------- *)
16581664
(* SCA part 2 *)
16591665
(* ----------------------------- *)
1666+
(* key is ?? lockfile? *)
16601667
type ci_scan_dependencies = (string * found_dependency list) list
1661-
<json repr="object">
1662-
<python repr="dict">
1663-
<ts repr="map">
1668+
<json repr="object"> <python repr="dict"> <ts repr="map">
16641669

1670+
(* TODO: get rid of; should use cli_error with error_type ScaParseError *)
16651671
type dependency_parser_error = {
16661672
path: fpath;
16671673
parser: sca_parser_name;
@@ -1674,33 +1680,34 @@ type dependency_parser_error = {
16741680
?text: string option;
16751681
}
16761682

1677-
(* json names are to maintain backwards compatibility with the python enum it
1678-
* is replacing
1679-
* TODO: use <ocaml repr="classic">
1683+
(* JSON names are to maintain backwards compatibility with the python enum it
1684+
* is replacing. The P prefix (for parser) is to avoid ambiguity with similar
1685+
* constructor names in the manifest and ecosystem types.
16801686
*)
1681-
type sca_parser_name = [
1682-
| Gemfile_lock <json name="gemfile_lock">
1683-
| Go_mod <json name="go_mod">
1684-
| Go_sum <json name="go_sum">
1685-
| Gradle_lockfile <json name="gradle_lockfile">
1686-
| Gradle_build <json name="gradle_build">
1687-
| Jsondoc <json name="jsondoc">
1688-
| Pipfile <json name="pipfile">
1689-
| Pnpm_lock <json name="pnpm_lock">
1690-
| Poetry_lock <json name="poetry_lock">
1691-
| Pyproject_toml <json name="pyproject_toml">
1692-
| Requirements <json name="requirements">
1693-
| Yarn_1 <json name="yarn_1">
1694-
| Yarn_2 <json name="yarn_2">
1695-
| Pomtree <json name="pomtree">
1696-
| Cargo_parser <json name="cargo">
1697-
| Composer_lock <json name="composer_lock">
1698-
| Pubspec_lock <json name="pubspec_lock">
1699-
| Package_swift <json name="package_swift">
1700-
| Podfile_lock <json name="podfile_lock">
1701-
| Package_resolved <json name="package_resolved">
1702-
| Mix_lock <json name="mix_lock">
1703-
]
1687+
type sca_parser_name <ocaml attr="deriving show"> = [
1688+
| PGemfile_lock <json name="gemfile_lock">
1689+
| PGo_mod <json name="go_mod">
1690+
| PGo_sum <json name="go_sum">
1691+
| PGradle_lockfile <json name="gradle_lockfile">
1692+
| PGradle_build <json name="gradle_build">
1693+
| PJsondoc <json name="jsondoc">
1694+
| PPipfile <json name="pipfile">
1695+
| PPnpm_lock <json name="pnpm_lock">
1696+
| PPoetry_lock <json name="poetry_lock">
1697+
| PPyproject_toml <json name="pyproject_toml">
1698+
| PRequirements <json name="requirements">
1699+
| PYarn_1 <json name="yarn_1">
1700+
| PYarn_2 <json name="yarn_2">
1701+
| PPomtree <json name="pomtree">
1702+
| PCargo_parser <json name="cargo">
1703+
| PComposer_lock <json name="composer_lock">
1704+
| PPubspec_lock <json name="pubspec_lock">
1705+
| PPackage_swift <json name="package_swift">
1706+
| PPodfile_lock <json name="podfile_lock">
1707+
| PPackage_resolved <json name="package_resolved">
1708+
| PMix_lock <json name="mix_lock">
1709+
] <ocaml repr="classic">
1710+
17041711

17051712
type supply_chain_stats = {
17061713
subprojects_stats: subproject_stats list;
@@ -1744,7 +1751,9 @@ type resolution_method
17441751
<ocaml attr="deriving show">
17451752
<python decorator="dataclass(frozen=True, order=True)"> =
17461753
[
1754+
(* we parsed a lockfile that was already included in the repository *)
17471755
| LockfileParsing
1756+
(* we communicated with the package manager to resolve dependencies *)
17481757
| DynamicResolution
17491758
]
17501759

@@ -1765,8 +1774,8 @@ type ci_scan_failure = {
17651774
(* Response by the backend to the CLI to the POST api/agent/deployments/current
17661775
* Some of the information in deployment_config is now returned
17671776
* directly in scan_response (e.g., the deployment_name)
1768-
* TODO: deprecate this endpoint as it is now used only in 'semgrep login' and
1769-
* in 'semgrep show whoami' to just check whether the token is valid.
1777+
* TODO? deprecate this endpoint as it is now used only in 'semgrep login' and
1778+
* in 'semgrep show whoami' to just check whether the token is valid?
17701779
*)
17711780
type deployment_config <ocaml attr="deriving show"> = {
17721781
id : int;
@@ -1901,7 +1910,7 @@ type core_match <python decorator="dataclass(frozen=True)"> = {
19011910
extra: core_match_extra;
19021911
}
19031912

1904-
(* TODO: try to make it as close as possible to 'cli_match_extra' below
1913+
(* TODO: try to make it as close as possible to 'cli_match_extra'.
19051914
* See the corresponding comment in cli_match_extra for more information
19061915
* about the fields below.
19071916
*)
@@ -1930,9 +1939,9 @@ type core_match_extra <python decorator="dataclass(frozen=True)"> = {
19301939
}
19311940

19321941
(* See Semgrep_error_code.ml *)
1933-
(* TODO: try to make it as close as possible to 'cli_error' above, possibly
1942+
(* TODO: try to make it as close as possible to 'cli_error', possibly
19341943
* extending cli_error with more fields (but those fields must be optional
1935-
* to remain backward compatible
1944+
* to remain backward compatible)
19361945
* LATER: use a proper variant in error_type so we would need less
19371946
* of those ?xxx types below (like a ParseError should always have a location)
19381947
*)
@@ -2006,7 +2015,7 @@ type analyzer <ocaml attr="deriving show"> = string wrap <ocaml module="Analyzer
20062015
(* A target can either be a traditional code target (now with optional
20072016
associated lockfile) or it can be a lockfile target, which will be used to
20082017
generate lockfile-only findings.
2009-
Currently *ALL TARGETS FROM OSEMGREP AND PYSEMGREP ARE CODETARGETS*
2018+
Currently *ALL TARGETS FROM PYSEMGREP ARE CODETARGETS*
20102019
coupling: with src/target/Target.mli
20112020
*)
20122021
type target <ocaml attr="deriving show"> = [
@@ -2239,15 +2248,34 @@ type dependency_source
22392248
| ManifestLockfileDependencySource of (manifest * lockfile)
22402249
] <ocaml repr="classic">
22412250

2242-
type resolution_error
2251+
(* alt: sca_error_kind *)
2252+
type resolution_error_kind
22432253
<ocaml attr="deriving show">
22442254
<python decorator="dataclass(frozen=True)"> =
22452255
[
22462256
| UnsupportedManifest
22472257
| MissingRequirement of string
22482258
| ResolutionCmdFailed of resolution_cmd_failed
2259+
(* when we produce some dependency list in lockfileless scanning (by talking
2260+
* to the package manager) but fail to parse it correctly *)
22492261
| ParseDependenciesFailed of string
2250-
]
2262+
(* a lockfile parser failed
2263+
* since semgrep 1.109.0 (to replace dependency_parser_error) *)
2264+
| ScaParseError of sca_parser_name
2265+
] <ocaml repr="classic">
2266+
2267+
(* used only from pysemgrep for now
2268+
* TODO? we should merge dependency_{parser,resolution}_error with cli_error
2269+
*)
2270+
type sca_resolution_error = {
2271+
type_: resolution_error_kind;
2272+
dependency_source_file: fpath;
2273+
}
2274+
type sca_error = [
2275+
| SCAParse of dependency_parser_error
2276+
| SCAResol of sca_resolution_error
2277+
] <ocaml repr="classic">
2278+
22512279

22522280
type resolution_cmd_failed
22532281
<ocaml attr="deriving show">
@@ -2262,8 +2290,8 @@ type resolution_cmd_failed
22622290
* lockfile parsing might partially fail but recover and still produce results)
22632291
*)
22642292
type resolution_result = [
2265-
| ResolutionOk of (found_dependency list * resolution_error list)
2266-
| ResolutionError of resolution_error list
2293+
| ResolutionOk of (found_dependency list * resolution_error_kind list)
2294+
| ResolutionError of resolution_error_kind list
22672295
]
22682296

22692297
type transitive_finding = {

0 commit comments

Comments
 (0)