Skip to content

Commit

Permalink
Define new sca_error intermediate type and sca_resolution_error and S…
Browse files Browse the repository at this point in the history
…caParseError (#350)

test plan:
see related PR in semgrep


- [x] I ran `make setup && make` to update the generated code after
editing a `.atd` file (TODO: have a CI check)
- [x] I made sure we're still backward compatible with old versions of
the CLI.
For example, the Semgrep backend still needs to be able to *consume* data
	  generated by Semgrep 1.50.0.
See
https://atd.readthedocs.io/en/latest/atdgen-tutorial.html#smooth-protocol-upgrades
	  Note that the types related to the semgrep-core JSON output or the
	  semgrep-core RPC do not need to be backward compatible!
  • Loading branch information
aryx authored Feb 14, 2025
1 parent 56950a4 commit 2a998e7
Show file tree
Hide file tree
Showing 7 changed files with 4,147 additions and 3,558 deletions.
152 changes: 90 additions & 62 deletions semgrep_output_v1.atd
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
* semgrep_metrics.atd
* - The parsing stats of semgrep-core -parsing_stats -json have their own
* Parsing_stats.atd
* - The schema for the generic AST dump is in AST_generic_v1.atd
* For the definition of the Semgrep input (the rules), see rule_schema_v2.atd
*
* This file has the _v1 suffix to explicitly represent the
Expand Down Expand Up @@ -254,7 +253,7 @@ type product
<python decorator="dataclass(frozen=True)"> =
[
| SAST (* a.k.a. Code *) <json name="sast">
| SCA <json name="sca">
| SCA (* a.k.a. SSC *) <json name="sca">
| Secrets <json name="secrets">
]

Expand Down Expand Up @@ -533,8 +532,8 @@ type transitivity
*)
| Transitive <json name="transitive">
(* If there is insufficient information to determine the transitivity,
such as a requirements.txt file without a requirements.in manifest,
we leave it Unknown.
* such as a requirements.txt file without a requirements.in manifest,
* we leave it Unknown.
*)
| Unknown <json name="unknown">
]
Expand All @@ -549,7 +548,7 @@ type sca_match = {
dependency_match: dependency_match;
(* TODO: deprecate, we should use sca_match_kind instead *)
reachable: bool;
(* EXPERIMENTAL since 1.08.0 *)
(* EXPERIMENTAL since 1.108.0 *)
?kind: sca_match_kind option;
}

Expand Down Expand Up @@ -768,7 +767,7 @@ type error_type
| PatternParseError0 <json name="Pattern parse error">
| IncompatibleRule0 <json name="Incompatible rule">
(* since semgrep 1.94.0 *)
| DependencyResolutionError of resolution_error
| DependencyResolutionError of resolution_error_kind
] <ocaml repr="classic">

type incompatible_rule
Expand Down Expand Up @@ -867,11 +866,11 @@ type skip_reason <ocaml attr="deriving show"> = [
| Irrelevant_rule <json name="irrelevant_rule">
| Too_many_matches <json name="too_many_matches">
(* New in osemgrep *)
| Gitignore_patterns_match (* TODO: use JSON lowercase for consistency *)
| Gitignore_patterns_match
(* since 1.40.0. These were always ignored, but not shown in the skip report *)
| Dotfile (* TODO: use JSON lowercase for consistency *)
| Dotfile
(* since 1.44.0 *)
| Nonexistent_file (* TODO: use JSON lowercase for consistency *)
| Nonexistent_file
(* since 1.94.0 *)
| Insufficient_permissions <json name="insufficient_permissions">
] <ocaml repr="classic">
Expand Down Expand Up @@ -981,7 +980,6 @@ type target_times = {
run_time: float;
}


(*****************************************************************************)
(* Final 'semgrep scan' output *)
(*****************************************************************************)
Expand Down Expand Up @@ -1056,7 +1054,6 @@ type tests_result = {
}

type checks = {
(* would like to use fpath *)
checks: (string (* rule_id *) * rule_result) list <json repr="object">;
}

Expand Down Expand Up @@ -1218,8 +1215,8 @@ type killing_parent = {
* - /api/agent/scans/<scan_id>/complete when done, with the exit code and
* some additional information; the response carries app_block_override and reason
*
* alt: we could move all of this in a separate semgrep_posts_v1.atd file
* or semgrep_webapp_v1.atd
* TODO: we should move all of this in a separate semgrep_backend.atd
* (but need a proper module system for ATD first)
*)

(* ----------------------------- *)
Expand All @@ -1230,11 +1227,12 @@ type features = {
~autofix: bool;
~deepsemgrep: bool;
~dependency_query: bool;
(* a.k.a. dependency path *)
~path_to_transitivity: bool;
(* normally we resolve dependencies for changed subprojects only in diff scans. This flag
* causes all subprojects to be resolved in diff scans *)
(* normally we resolve dependencies for changed subprojects only in diff
* scans. This flag causes all subprojects to be resolved in diff scans *)
~scan_all_deps_in_diff_scan: bool;
(* Whether to collect "symbol analysis" information from the repo being scanned
(* Whether to collect "symbol analysis" info from the repo being scanned
See https://www.notion.so/semgrep/Semgrep-Code-Reconnaissance-Toolbox-18a3009241a880f2a439eed6b2cffe66?pvs=4
*)
~symbol_analysis: bool;
Expand All @@ -1260,6 +1258,7 @@ type triage_ignored = {
* Note that the version of the CLI is sent to the WebApp in
* project_metadata so the backend has all the necessary information to
* send back an appropriate action depending on the CLI version.
* TODO: only osemgrep handles that right now
*)
type action = [
| Message of string
Expand Down Expand Up @@ -1295,7 +1294,10 @@ type scan_info = {

(* config specific to the scan *)
type scan_configuration = {
rules: raw_json; (* can we type this better *)
(* Rules sent from the backend. Note that those rules are in JSON
* form not YAML (which led to some speedup in pysemgrep)
* TODO? can we type this better *)
rules: raw_json;
inherit triage_ignored;
}

Expand All @@ -1322,9 +1324,8 @@ type product_ignored_files = (product * glob list) list
* "keys must be strings" error *)
<python repr="dict"> <ts repr="map">

(* configuration for scanning version control history,
* e.g., looking back at past git commits for committed credentials which may
* have been removed *)
(* configuration for scanning version control history, e.g., looking back at
* past git commits for committed credentials which may have been removed *)
type historical_configuration = {
enabled: bool;
?lookback_days: int option;
Expand Down Expand Up @@ -1425,9 +1426,10 @@ type scan_metadata = {
cli_version: version;
unique_id: uuid; (* client generated uuid for the scan *)
requested_products: product list;
~dry_run: bool; (* from 1.47.0 *)
(* since 1.96.0 *)
(* since 1.47.0 *)
~dry_run: bool;
(* unique id associated with the scan in Semgrep Managed Scanning *)
(* since 1.96.0 *)
?sms_scan_id: string option;
}

Expand Down Expand Up @@ -1466,6 +1468,7 @@ type finding = {
message: string;
(* int|string until minimum version exceeds 1.32.0. After 1.32.0
* we're always using an int.
* TODO: switch to int now that minimum version supported is 1.50.0
* TODO: should reuse match_severity instead of using an int here.
* This is what pysemgrep is currently using:
* Error = 2, Warning = 1, Experiment = 4, otherwise 0
Expand Down Expand Up @@ -1542,7 +1545,7 @@ type ci_scan_results = {
?contributions: contributions option;
(* since semgrep 1.38.0 *)
(* this data was originally sent to /complete, but we want to start sending
* it /results *)
* it to /results *)
?dependencies: ci_scan_dependencies option;
}

Expand Down Expand Up @@ -1585,6 +1588,7 @@ type ci_scan_complete = {
stats: ci_scan_complete_stats;
(* TODO: remove when min version is 1.38.0 *)
?dependencies: ci_scan_dependencies option;
(* TODO: move the errors in ci_scan_complete_stats.errors instead *)
?dependency_parser_errors: dependency_parser_error list option;
(* since 1.31.0 *)
?task_id: string option;
Expand All @@ -1596,10 +1600,12 @@ type ci_scan_complete_stats = {
errors: cli_error list;
total_time: float;

(* ?? *)
unsupported_exts: (string * int) list
<json repr="object">
<python repr="dict">
<ts repr="map">;
(* ?? *)
lockfile_scan_info: (string * int) list
<json repr="object">
<python repr="dict">
Expand Down Expand Up @@ -1657,11 +1663,11 @@ type ci_scan_complete_response <ocaml attr="deriving show"> = {
(* ----------------------------- *)
(* SCA part 2 *)
(* ----------------------------- *)
(* key is ?? lockfile? *)
type ci_scan_dependencies = (string * found_dependency list) list
<json repr="object">
<python repr="dict">
<ts repr="map">
<json repr="object"> <python repr="dict"> <ts repr="map">

(* TODO: get rid of; should use cli_error with error_type ScaParseError *)
type dependency_parser_error = {
path: fpath;
parser: sca_parser_name;
Expand All @@ -1674,33 +1680,34 @@ type dependency_parser_error = {
?text: string option;
}

(* json names are to maintain backwards compatibility with the python enum it
* is replacing
* TODO: use <ocaml repr="classic">
(* JSON names are to maintain backwards compatibility with the python enum it
* is replacing. The P prefix (for parser) is to avoid ambiguity with similar
* constructor names in the manifest and ecosystem types.
*)
type sca_parser_name = [
| Gemfile_lock <json name="gemfile_lock">
| Go_mod <json name="go_mod">
| Go_sum <json name="go_sum">
| Gradle_lockfile <json name="gradle_lockfile">
| Gradle_build <json name="gradle_build">
| Jsondoc <json name="jsondoc">
| Pipfile <json name="pipfile">
| Pnpm_lock <json name="pnpm_lock">
| Poetry_lock <json name="poetry_lock">
| Pyproject_toml <json name="pyproject_toml">
| Requirements <json name="requirements">
| Yarn_1 <json name="yarn_1">
| Yarn_2 <json name="yarn_2">
| Pomtree <json name="pomtree">
| Cargo_parser <json name="cargo">
| Composer_lock <json name="composer_lock">
| Pubspec_lock <json name="pubspec_lock">
| Package_swift <json name="package_swift">
| Podfile_lock <json name="podfile_lock">
| Package_resolved <json name="package_resolved">
| Mix_lock <json name="mix_lock">
]
type sca_parser_name <ocaml attr="deriving show"> = [
| PGemfile_lock <json name="gemfile_lock">
| PGo_mod <json name="go_mod">
| PGo_sum <json name="go_sum">
| PGradle_lockfile <json name="gradle_lockfile">
| PGradle_build <json name="gradle_build">
| PJsondoc <json name="jsondoc">
| PPipfile <json name="pipfile">
| PPnpm_lock <json name="pnpm_lock">
| PPoetry_lock <json name="poetry_lock">
| PPyproject_toml <json name="pyproject_toml">
| PRequirements <json name="requirements">
| PYarn_1 <json name="yarn_1">
| PYarn_2 <json name="yarn_2">
| PPomtree <json name="pomtree">
| PCargo_parser <json name="cargo">
| PComposer_lock <json name="composer_lock">
| PPubspec_lock <json name="pubspec_lock">
| PPackage_swift <json name="package_swift">
| PPodfile_lock <json name="podfile_lock">
| PPackage_resolved <json name="package_resolved">
| PMix_lock <json name="mix_lock">
] <ocaml repr="classic">


type supply_chain_stats = {
subprojects_stats: subproject_stats list;
Expand Down Expand Up @@ -1744,7 +1751,9 @@ type resolution_method
<ocaml attr="deriving show">
<python decorator="dataclass(frozen=True, order=True)"> =
[
(* we parsed a lockfile that was already included in the repository *)
| LockfileParsing
(* we communicated with the package manager to resolve dependencies *)
| DynamicResolution
]

Expand All @@ -1765,8 +1774,8 @@ type ci_scan_failure = {
(* Response by the backend to the CLI to the POST api/agent/deployments/current
* Some of the information in deployment_config is now returned
* directly in scan_response (e.g., the deployment_name)
* TODO: deprecate this endpoint as it is now used only in 'semgrep login' and
* in 'semgrep show whoami' to just check whether the token is valid.
* TODO? deprecate this endpoint as it is now used only in 'semgrep login' and
* in 'semgrep show whoami' to just check whether the token is valid?
*)
type deployment_config <ocaml attr="deriving show"> = {
id : int;
Expand Down Expand Up @@ -1901,7 +1910,7 @@ type core_match <python decorator="dataclass(frozen=True)"> = {
extra: core_match_extra;
}

(* TODO: try to make it as close as possible to 'cli_match_extra' below
(* TODO: try to make it as close as possible to 'cli_match_extra'.
* See the corresponding comment in cli_match_extra for more information
* about the fields below.
*)
Expand Down Expand Up @@ -1930,9 +1939,9 @@ type core_match_extra <python decorator="dataclass(frozen=True)"> = {
}

(* See Semgrep_error_code.ml *)
(* TODO: try to make it as close as possible to 'cli_error' above, possibly
(* TODO: try to make it as close as possible to 'cli_error', possibly
* extending cli_error with more fields (but those fields must be optional
* to remain backward compatible
* to remain backward compatible)
* LATER: use a proper variant in error_type so we would need less
* of those ?xxx types below (like a ParseError should always have a location)
*)
Expand Down Expand Up @@ -2006,7 +2015,7 @@ type analyzer <ocaml attr="deriving show"> = string wrap <ocaml module="Analyzer
(* A target can either be a traditional code target (now with optional
associated lockfile) or it can be a lockfile target, which will be used to
generate lockfile-only findings.
Currently *ALL TARGETS FROM OSEMGREP AND PYSEMGREP ARE CODETARGETS*
Currently *ALL TARGETS FROM PYSEMGREP ARE CODETARGETS*
coupling: with src/target/Target.mli
*)
type target <ocaml attr="deriving show"> = [
Expand Down Expand Up @@ -2239,15 +2248,34 @@ type dependency_source
| ManifestLockfileDependencySource of (manifest * lockfile)
] <ocaml repr="classic">

type resolution_error
(* alt: sca_error_kind *)
type resolution_error_kind
<ocaml attr="deriving show">
<python decorator="dataclass(frozen=True)"> =
[
| UnsupportedManifest
| MissingRequirement of string
| ResolutionCmdFailed of resolution_cmd_failed
(* when we produce some dependency list in lockfileless scanning (by talking
* to the package manager) but fail to parse it correctly *)
| ParseDependenciesFailed of string
]
(* a lockfile parser failed
* since semgrep 1.109.0 (to replace dependency_parser_error) *)
| ScaParseError of sca_parser_name
] <ocaml repr="classic">

(* used only from pysemgrep for now
* TODO? we should merge dependency_{parser,resolution}_error with cli_error
*)
(* Record pairing a resolution_error_kind with an fpath. *)
type sca_resolution_error = {
(* which kind of resolution failure this is (see resolution_error_kind) *)
type_: resolution_error_kind;
(* NOTE(review): presumably the manifest or lockfile whose resolution
 * produced the error -- confirm against the producers of this type *)
dependency_source_file: fpath;
}
(* Sum type with one case per SCA error record: a dependency_parser_error
 * or a sca_resolution_error. *)
type sca_error = [
(* wraps a dependency_parser_error record *)
| SCAParse of dependency_parser_error
(* wraps a sca_resolution_error record *)
| SCAResol of sca_resolution_error
] <ocaml repr="classic">


type resolution_cmd_failed
<ocaml attr="deriving show">
Expand All @@ -2262,8 +2290,8 @@ type resolution_cmd_failed
* lockfile parsing might partially fail but recover and still produce results)
*)
type resolution_result = [
| ResolutionOk of (found_dependency list * resolution_error list)
| ResolutionError of resolution_error list
| ResolutionOk of (found_dependency list * resolution_error_kind list)
| ResolutionError of resolution_error_kind list
]

type transitive_finding = {
Expand Down
Loading

0 comments on commit 2a998e7

Please sign in to comment.