-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsemgrep_metrics.atd
283 lines (250 loc) · 10.9 KB
/
semgrep_metrics.atd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
(* Semgrep metrics.
*
* The goal of this module is to specify the schema for the metrics
* data sent by the Semgrep CLI to https://metrics.semgrep.dev.
*
* coupling: with semgrep/PRIVACY.md where we carefully document
* all the metrics we send with a big table explaining each field.
*
* translated originally from semgrep/cli/.../metrics.py
*
* TODO:
* - some optional arguments could be removed when `osemgrep` replaces
* `pysemgrep`. Currently, we try to fit the tests and don't update the
* JSON output.
* - specify the schema for the answer from https://metrics.semgrep.dev
* - document also what we send via open telemetry in our traces when
* using --trace?
*)
<python text="from dataclasses import field">
(*****************************************************************************)
(* Basic types *)
(*****************************************************************************)
(* The three 'wrap' types below are plain strings in JSON; the OCaml
 * annotation names the module used to handle the value on the OCaml side.
 *)
(* presumably a UUID in string form, going by the Uuidm module name *)
type uuid = string wrap <ocaml module="ATD_string_wrap.Uuidm">
(* presumably a SHA-256 digest in string form, going by the module name *)
type sha256 = string wrap <ocaml module="ATD_string_wrap.Sha256">
(* RFC 3339 formatted datetime *)
type datetime = string wrap <ocaml module="ATD_string_wrap.Datetime">
(* Language name as a plain string.
 * Ideally this would be the same as in Language.ml
 *)
type lang = string
(* Note that there is no 'fpath' type here. We don't send concrete filenames
* on purpose (see PRIVACY.md).
*)
(*****************************************************************************)
(* Entry point *)
(*****************************************************************************)
(* In addition to the fields below, some code in the semgrep-app-lambdas repo
* in metrics-handler/prod/index.js adds a few fields in the payload such as:
*
* clientIP: string; (e.g., "1.2.3.4")
* userAgent: string; (e.g., "Semgrep/1.37.0 (Docker) (command/ci)")
*
* note: the use of 'mutable' below is a bit unusual, but many metrics fields
* are adjusted in different modules and functions in Semgrep so it was simpler
* to use a global in Metrics_.ml and mutable fields.
* See the comments in Metrics_.ml about the global 'g' for more information.
*)
type payload = {
  (* Identification and timing of the event: always required *)
  event_id <ocaml mutable>: uuid;
  started_at <python default="Datetime('')"> <ocaml mutable>: datetime;
  sent_at <python default="Datetime('')"> <ocaml mutable>: datetime;

  (* Value as stored in ~/.semgrep/settings.yml *)
  anonymous_user_id <python default="''"> <ocaml mutable>: string;

  (* From here on, the fields are listed in the same order as they are
   * presented in PRIVACY.md.
   *)
  environment: environment;
  performance: performance;

  (* Parsing statistics keyed by language, serialized as a JSON object.
   * The key is really a 'lang', but the alias cannot be used here: the
   * JSON repr annotation that follows makes ATD check for a literal
   * 'string', and the alias confuses that check.
   * TODO? should this live under performance rather than at the top level?
   *)
  ~parse_rate <ocaml mutable>: (string (* lang *) * parse_stat) list
    <json repr="object">;

  errors: errors;
  value: value;
  extension: extension;
}
(*****************************************************************************)
(* Environment *)
(*****************************************************************************)
type environment = {
  (* Semgrep CLI version (e.g., "1.45.0") *)
  version <ocaml mutable>: string;
  (* e.g. "linux". Since Semgrep 1.59 *)
  os: string;
  (* Since Semgrep 1.59 *)
  isTranspiledJS <ocaml mutable>: bool;
  (* 'nullable': the key is always present but its value may be JSON null *)
  projectHash <ocaml mutable>: sha256 nullable;
  (* The Python default must use the Python class name derived from the
   * 'sha256' type (Sha256), in the same way Datetime is used for the
   * 'datetime' fields of the payload record.
   *)
  configNamesHash <python default="Sha256('')"> <ocaml mutable>: sha256;
  ?rulesHash <ocaml mutable>: sha256 option;
  (* TODO: should be just boolean *)
  ci <ocaml mutable>: string nullable;
  (* Indicates whether the `--baseline-commit` option has been specified.
   * When `baseline_commit` is set, it implies that the scan is operating
   * in differential scan mode.
   * Since semgrep 1.46
   *)
  ~isDiffScan <python default="False"> <ocaml mutable> : bool;
  ?integrationName <ocaml mutable>: string option;
  ~isAuthenticated <python default="False"> <ocaml mutable>: bool;
  ?deployment_id <ocaml mutable>: int option;
}
(*****************************************************************************)
(* Performance *)
(*****************************************************************************)
(* Every field is optional: a field with no value is left out of the JSON. *)
type performance = {
  ?numRules <ocaml mutable>: int option;
  ?numTargets <ocaml mutable>: int option;
  ?totalBytesScanned <ocaml mutable>: int option;
  ?fileStats <ocaml mutable>: file_stats list option;
  ?ruleStats <ocaml mutable>: rule_stats list option;
  (* name/duration pairs, serialized as a JSON object *)
  ?profilingTimes <ocaml mutable>:
    (string * float) list <json repr="object"> option;
  ?maxMemoryBytes <ocaml mutable>: int option;
}
(* One element of performance.fileStats. Note that no file name is
 * recorded here, only sizes, counts and timings.
 *)
type file_stats = {
  size <ocaml mutable>: int;
  numTimesScanned <ocaml mutable>: int;

  (* Optional timings.
   * LATER: factorize with semgrep_output_v1.atd?
   *)
  ?parseTime <ocaml mutable>: float option;
  ?matchTime <ocaml mutable>: float option;
  ?runTime <ocaml mutable>: float option;
}
(* One element of performance.ruleStats; the rule is referred to by a hash
 * (a plain string here, not the 'sha256' wrapper).
 *)
type rule_stats = {
  ruleHash <ocaml mutable>: string;
  bytesScanned <ocaml mutable>: int;
  ?matchTime <ocaml mutable>: float option;
}
(*****************************************************************************)
(* Parsing stats *)
(*****************************************************************************)
(* Value type of payload.parse_rate (one entry per language).
 * All four counters are required integers.
 *)
type parse_stat = {
  (* target counters *)
  targets_parsed <ocaml mutable>: int;
  num_targets <ocaml mutable>: int;

  (* byte counters *)
  bytes_parsed <ocaml mutable>: int;
  num_bytes <ocaml mutable>: int;
}
(*****************************************************************************)
(* Errors *)
(*****************************************************************************)
(* Both fields are optional and left out of the JSON when they have no value. *)
type errors = {
  ?returnCode <ocaml mutable>: int option;
  (* each element is an 'error', i.e. an error type in string form *)
  ?errors <ocaml mutable>: error list option;
}
(* An error kind, transmitted as a plain string.
 * This is mostly semgrep_output_v1.error_type but in a string form
 * thanks to Error.string_of_error_type.
 * TODO: use semgrep_output_v1.error_type directly, but this needs ATD
 * modules.
 *)
type error = string
(*****************************************************************************)
(* Value ("data that indicate how useful a run is for the end user") *)
(*****************************************************************************)
type value = {
  (* Each string has the form "key/value". This single list encodes many
   * different pieces of information, for example:
   *  - "subcommand/ci" (the subcommand is also part of the userAgent)
   *  - "cli-flag/strict", "cli-flag/output"
   *  - "cli-envvar/??"
   *  - "cli-prompt/??"
   *  - "config/local", "config/auto"
   *  - "output/path", "output/url"
   *  - "semgrepignore/exclude"
   *  - "language/python", "language/<multilang"
   *  - "registry-query/??"
   *  - "ruleset/??"
   * coupling: features is commented a lot in semgrep/PRIVACY.md
   *)
  features <ocaml mutable>: string list;

  (* Since semgrep 1.46 *)
  ?proFeatures <ocaml mutable>: pro_features option;

  (* most important metric for the end user *)
  ?numFindings <ocaml mutable>: int option;

  (* Since Semgrep 1.56.0, see PA-3312 *)
  ?numFindingsByProduct <ocaml mutable>:
    (string * int) list <json repr="object"> option;

  ?numIgnored <ocaml mutable>: int option;
  ?ruleHashesWithFindings <ocaml mutable>:
    (string * int) list <json repr="object"> option;

  (* TODO: should be OSS | Pro, see semgrep_output_v1.atd engine_kind type *)
  ~engineRequested <python default="'OSS'"> <ocaml mutable>: string;

  (* Since Semgrep 1.54.0 *)
  ?engineConfig <ocaml mutable>: engine_config option;

  (* Since Semgrep 1.49.0 *)
  ?interfileLanguagesUsed: string list option;
}
type pro_features = {
  (* Set from the `--diff-depth` option; used for inter-file differential
   * scanning.
   * Since semgrep 1.46
   *)
  ?diffDepth <ocaml mutable>: int option;

  (* Per-language number of files scanned in inter-file diff scan mode,
   * serialized as a JSON object. The number counts the changed files
   * together with their limited dependencies.
   * Since semgrep 1.70
   *)
  ?numInterfileDiffScanned <ocaml mutable>:
    (string (* lang *) * int) list <json repr="object"> option;
}
(* Since v1.55.0 *)
type engine_config <ocaml attr="deriving show"> = {
  analysis_type: analysis_type;
  pro_langs: bool;

  (* One optional config per product:
   * `Some c`, with `c` the config, when the product was run;
   * `None` when it was not run.
   *)
  ?code_config: code_config option;
  ?secrets_config: secrets_config option;
  ?supply_chain_config: supply_chain_config option;
}
(* Used by engine_config.analysis_type *)
type analysis_type <ocaml attr="deriving show"> = [
    Intraprocedural
  | Interprocedural
  | Interfile
]
(* Placeholder record holding a single "reserved for future use" (RFU)
 * field.
 *
 * Ideally this type and supply_chain_config would be unit or empty
 * records, but neither works:
 *  - when unit is aliased, ATD doesn't generate correct code for Python
 *    (to_json/from_json are wrong and try to call methods on a fake Unit
 *    class);
 *  - {} is just passed through by ATD instead of being converted to unit,
 *    and it is not valid OCaml syntax.
 *)
type code_config <ocaml attr="deriving show"> = {
  ?_rfu: int option;
}
(* Used by secrets_config.permitted_origins *)
type secrets_origin <ocaml attr="deriving show"> = [
  | Any
  | Semgrep
  | NoCommunity
]
(* Used by engine_config.secrets_config *)
type secrets_config <ocaml attr="deriving show"> = {
  permitted_origins: secrets_origin;
}
(* Placeholder record: its only field is a "reserved for future use" (RFU)
 * slot, the same workaround as the one used for code_config.
 *)
type supply_chain_config <ocaml attr="deriving show"> = {
  ?_rfu: int option;
}
(*****************************************************************************)
(* Extension *)
(*****************************************************************************)
(* Metrics from our IDE extensions? *)
(* Every field is optional and left out of the JSON when it has no value. *)
type extension = {
  (* unique ID provided by the integration *)
  ?machineId <ocaml mutable>: string option;

  (* whether the integration was just installed *)
  ?isNewAppInstall <ocaml mutable>: bool option;

  (* tracks unique sessions of the integration *)
  ?sessionId <ocaml mutable>: string option;

  (* version of the integration, NOT the semgrep version *)
  ?version <ocaml mutable>: string option;

  (* type of extension, e.g. vscode or intellij *)
  ?ty <ocaml mutable>: string option;

  (* number of autofixes accepted via code actions *)
  ?autofixCount <ocaml mutable>: int option;

  (* how many findings have been ignored *)
  ?ignoreCount <ocaml mutable>: int option;
}
(*****************************************************************************)
(* TODO Response by metrics.semgrep.dev *)
(*****************************************************************************)
(* Here is an example of the answer when an error occurred:
* { "errorType":"TypeError",
* "errorMessage":"Cannot read property 'map' of undefined",
* "trace":[
* "TypeError: Cannot read property 'map' of undefined",
* " at createPerRuleObjects (/var/task/index.js:287:24)",
* " at Runtime.exports.handler (/var/task/index.js:363:20)",
* ]
* }
*)