Skip to content

Commit 1636b07

Browse files
committed
preparing for v1.100.0
1 parent 31663b0 commit 1636b07

10 files changed

Lines changed: 296 additions & 88 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ logs/*
2323
!testdata/html_embedded_vulnerable.html
2424
!docs/viewer/index.html
2525
!docs-site/overrides/*.html
26+
private-notes/
2627
*.dot
2728
fuzz/*
2829
!fuzz/Cargo.toml

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ All notable changes to this project will be documented in this file.
55
## [v1.100.0]
66
- Archive scanning now reaches inside Android/iOS app packages: added `apk`, `aab`, and `ipa` to the recognized ZIP-based archive formats so secrets embedded in APK/AAB/IPA contents (e.g. `classes*.dex`, `res/values/strings.xml`) are extracted and matched.
77
- Git repository scans now extract archive blobs encountered in the object database, not just on the filesystem. Previously a `.zip`/`.jar`/`.apk`/`.tar.gz` committed to a repo was scanned as raw compressed bytes, so secrets inside it were invisible. The git enumerator fans each archive entry out as a synthetic `<archive>!<entry>` blob with the original commit metadata. Honors `--no-extract-archives` for opt-out.
8+
- Fixed tar-wrapped archive extraction for `.tgz` and `.tar.*` files, and made dependent credential validation deduplication preserve per-occurrence context so repeated secrets validate with the correct nearby companion value.
89
- Performance: ZIP-based git blobs ≤ 64 MB extract entirely in memory (no temp-file round trip), beating the v1.99.0 baseline by ~15% on an 80 GiB monorepo despite scanning ~300K additional archive-content blobs. Larger archives auto-fall-back to a disk-streaming extractor.
910
- Memory safety: hard caps on archive extraction — 64 MB compressed pre-flight, 256 MB aggregate decompressed per archive (in-memory and disk paths), 512 MB per entry, plus a `PK\x03\x04` magic-byte gate. Worst-case footprint is bounded at ~`num_jobs * 320 MB`.
1011
- Release binary trimmed from 34 MB to 26 MB (~24% smaller). Switched `jsonwebtoken` to its `rust_crypto` backend (eliminates our scanner's pull on `aws-lc-rs`), bumped workspace `hmac` 0.12→0.13, `sha1` 0.10→0.11, `sha2` 0.10→0.11 to deduplicate our internal crypto code with the AWS sigv4 side, and migrated affected call sites in `kingfisher-core`, `kingfisher-rules`, and `kingfisher-scanner` to the digest-0.11 API (`hex::encode` for hex digests, explicit `KeyInit` import for HMAC).

crates/kingfisher-rules/data/rules/aws.yml

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,15 @@ rules:
55
(?x)
66
\b
77
(
8-
(?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
8+
(?:A3T[A-Z0-9]|AKIA|ASIA)
99
[A-Z0-9]{16}
1010
)
1111
\b
1212
pattern_requirements:
13-
min_digits: 1
1413
ignore_if_contains:
1514
- "EXAMPLE"
1615
- "TEST"
17-
min_entropy: 3.2
16+
min_entropy: 3.0
1817
visible: false
1918
confidence: medium
2019
examples:
@@ -25,14 +24,14 @@ rules:
2524
pattern: |
2625
(?xi)
2726
(?:
28-
\b
29-
(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
27+
(?:\b|_)
28+
(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|ASIA)
3029
(?:.|[\n\r]){0,64}?
3130
[^A-Za-z0-9_+!@\#$%^&*()\]./]
3231
([A-Za-z0-9/+]{40})
3332
[^A-Za-z0-9_+!@\#$%^&*()\]./]
3433
|
35-
\b(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)
34+
(?:\b|_)(?:AWS|AMAZON|AMZN|A3T[A-Z0-9]|AKIA|ASIA)
3635
(?:.|[\n\r]){0,96}?
3736
(?:SECRET|PRIVATE|ACCESS)
3837
(?:.|[\n\r]){0,16}?

crates/kingfisher-rules/data/rules/voyageai.yml

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@ rules:
55
# Matches keys starting with 'pa-' followed by 43 URL-safe base64 characters
66
pattern: |
77
(?x)
8+
\b
89
(
910
pa-[a-zA-Z0-9\-_]{43}
1011
)
1112
\b
1213
min_entropy: 4.0
13-
confidence: high
14+
confidence: medium
1415
examples:
1516
- pa-r4yuCYCuPhNO-10Lu9aO7dR4jxUWlLmlUjm_NOVVdSs
1617
validation:
@@ -22,7 +23,56 @@ rules:
2223
headers:
2324
Authorization: "Bearer {{ TOKEN }}"
2425
response_matcher:
26+
# 200 = key has /v1/files permission, 403 = valid key without that permission
27+
# (e.g. an inference-only key). 401 with "Provided API key is invalid." is the
28+
# only response Voyage AI returns for a bad key, so any non-401 status is live.
29+
- type: StatusMatch
30+
status: [401]
31+
negative: true
32+
- type: WordMatch
33+
words:
34+
- "Provided API key is invalid"
35+
negative: true
36+
references:
37+
- https://docs.voyageai.com/reference
38+
- https://docs.voyageai.com/docs/api-key-and-installation
39+
40+
- name: Voyage AI API Key
41+
id: kingfisher.voyageai.api_key.2
42+
description: Detects Voyage AI API keys (al- prefix variant) used for embedding and retrieval models.
43+
# Matches keys starting with 'al-' followed by 43 URL-safe base64 characters
44+
pattern: |
45+
(?x)
46+
\b
47+
(
48+
al-[a-zA-Z0-9\-_]{43}
49+
)
50+
\b
51+
min_entropy: 4.0
52+
confidence: medium
53+
examples:
54+
- al-Qf7M2bZ8xnLpvE4hRcDsJtAo1KyU93WgIBmXrNVoYTu
55+
validation:
56+
type: Http
57+
content:
58+
request:
59+
method: GET
60+
url: https://api.voyageai.com/v1/files
61+
headers:
62+
Authorization: "Bearer {{ TOKEN }}"
63+
response_matcher:
64+
# 200 = key has /v1/files permission, 403 = valid key without that permission.
65+
# 401 with "Provided API key is invalid." is the only invalid-key response.
2566
- type: StatusMatch
26-
status: [200]
67+
status: [401]
68+
negative: true
69+
- type: WordMatch
70+
words:
71+
- "Provided API key is invalid"
72+
negative: true
2773
references:
28-
- https://docs.voyageai.com/reference
74+
- https://docs.voyageai.com/reference
75+
- https://docs.voyageai.com/docs/api-key-and-installation
76+
# NOTE: Revocation is not implemented because Voyage AI does not document a public REST
77+
# endpoint for programmatic API key revocation. All probed admin/key-management paths
78+
# under api.voyageai.com return 404. Keys must be revoked via the Voyage AI dashboard.

crates/kingfisher-scanner/src/validation/aws.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,9 +200,7 @@ pub fn validate_aws_credentials_input(access_key_id: &str, secret_key: &str) ->
200200
return Err("AWS access key ID contains invalid characters".to_string());
201201
}
202202
let prefix = &access_key_id[..4];
203-
let valid_prefix =
204-
matches!(prefix, "AKIA" | "AGPA" | "AIDA" | "AROA" | "AIPA" | "ANPA" | "ANVA" | "ASIA")
205-
|| prefix.starts_with("A3T");
203+
let valid_prefix = matches!(prefix, "AKIA" | "ASIA") || prefix.starts_with("A3T");
206204
if !valid_prefix {
207205
return Err("Invalid AWS access key ID format".to_string());
208206
}

docs-site/docs/changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ All notable changes to this project will be documented in this file.
1010
## [v1.100.0]
1111
- Archive scanning now reaches inside Android/iOS app packages: added `apk`, `aab`, and `ipa` to the recognized ZIP-based archive formats so secrets embedded in APK/AAB/IPA contents (e.g. `classes*.dex`, `res/values/strings.xml`) are extracted and matched.
1212
- Git repository scans now extract archive blobs encountered in the object database, not just on the filesystem. Previously a `.zip`/`.jar`/`.apk`/`.tar.gz` committed to a repo was scanned as raw compressed bytes, so secrets inside it were invisible. The git enumerator fans each archive entry out as a synthetic `<archive>!<entry>` blob with the original commit metadata. Honors `--no-extract-archives` for opt-out.
13+
- Fixed tar-wrapped archive extraction for `.tgz` and `.tar.*` files, and made dependent credential validation deduplication preserve per-occurrence context so repeated secrets validate with the correct nearby companion value.
1314
- Performance: ZIP-based git blobs ≤ 64 MB extract entirely in memory (no temp-file round trip), beating the v1.99.0 baseline by ~15% on an 80 GiB monorepo despite scanning ~300K additional archive-content blobs. Larger archives auto-fall-back to a disk-streaming extractor.
1415
- Memory safety: hard caps on archive extraction — 64 MB compressed pre-flight, 256 MB aggregate decompressed per archive (in-memory and disk paths), 512 MB per entry, plus a `PK\x03\x04` magic-byte gate. Worst-case footprint is bounded at ~`num_jobs * 320 MB`.
1516
- Release binary trimmed from 34 MB to 26 MB (~24% smaller). Switched `jsonwebtoken` to its `rust_crypto` backend (eliminates our scanner's pull on `aws-lc-rs`), bumped workspace `hmac` 0.12→0.13, `sha1` 0.10→0.11, `sha2` 0.10→0.11 to deduplicate our internal crypto code with the AWS sigv4 side, and migrated affected call sites in `kingfisher-core`, `kingfisher-rules`, and `kingfisher-scanner` to the digest-0.11 API (`hex::encode` for hex digests, explicit `KeyInit` import for HMAC).

src/decompress.rs

Lines changed: 55 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,18 @@ pub const ZIP_BASED_FORMATS: &[&str] = &[
2323
"kmz", "widget", "xpi", "sketch", "pages", "key", "numbers", "hwpx",
2424
];
2525

26-
/// Break `<name>.<outer>.<inner>` into `(Some(outer), Some(inner))`.
27-
/// For `foo.tar.gz` this returns `("tar", "gz")`.
28-
fn split_extensions(path: &Path) -> (Option<String>, Option<String>) {
29-
let ext_inner = path.extension().and_then(|e| e.to_str()).map(|s| s.to_ascii_lowercase());
30-
31-
let ext_outer = path
32-
.file_stem()
33-
.and_then(|s| Path::new(s).extension())
34-
.and_then(|e| e.to_str())
35-
.map(|s| s.to_ascii_lowercase());
36-
37-
(ext_outer, ext_inner)
26+
fn is_tar_wrapped_compression(path: &Path) -> bool {
27+
let filename = match path.file_name().and_then(|s| s.to_str()) {
28+
Some(name) => name.to_ascii_lowercase(),
29+
None => return false,
30+
};
31+
32+
filename.ends_with(".tgz")
33+
|| filename.ends_with(".tar.gz")
34+
|| filename.ends_with(".tar.gzip")
35+
|| filename.ends_with(".tar.bz2")
36+
|| filename.ends_with(".tar.bzip2")
37+
|| filename.ends_with(".tar.xz")
3838
}
3939

4040
#[derive(Debug)]
@@ -450,7 +450,7 @@ fn decompress_once(path: &Path, base_dir: Option<&Path>) -> Result<CompressedCon
450450
return handle_zip_archive_streaming(&mut file, path, temp.path());
451451
}
452452
}
453-
"gz" | "gzip" => {
453+
"gz" | "gzip" | "tgz" => {
454454
let out_path = make_output_path(path, base_dir, "decomp.tar");
455455
let decoder = GzDecoder::new(BufReader::new(safe_open_for_read(path)?));
456456
return stream_to_file(decoder, &out_path);
@@ -487,12 +487,13 @@ pub fn decompress_file(path: &Path, base_dir: Option<&Path>) -> Result<Compresse
487487
let mut owned_buf: Option<PathBuf>;
488488

489489
loop {
490+
let should_extract_tar = is_tar_wrapped_compression(current_path);
490491
let content = decompress_once(current_path, base_dir)?;
491492

492493
// If the step produced a single on-disk file that is itself a .tar,
493494
// recurse on that file.
494495
if let CompressedContent::RawFile(ref p) = content {
495-
if split_extensions(p).0.as_deref() == Some("tar") {
496+
if should_extract_tar {
496497
owned_buf = Some(p.clone()); // own the path
497498
current_path = owned_buf.as_ref().unwrap();
498499
continue;
@@ -570,7 +571,7 @@ mod tests {
570571
use tempfile::tempdir;
571572
use zip::{CompressionMethod, ZipWriter, write::SimpleFileOptions};
572573

573-
use super::{CompressedContent, decompress_once};
574+
use super::{CompressedContent, decompress_file_to_temp, decompress_once};
574575

575576
/// 1) Fully unpack:
576577
/// - 1st decompress `.gz` -- get a `.tar` file
@@ -627,6 +628,45 @@ mod tests {
627628
Ok(())
628629
}
629630

631+
#[test]
632+
fn smoke_decompress_tgz_archive() -> anyhow::Result<()> {
633+
let dir = tempdir()?;
634+
let tgz = dir.path().join("payload.tgz");
635+
let github_pat = "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"; // this is not a real secret
636+
637+
{
638+
let f = File::create(&tgz)?;
639+
let gz = GzEncoder::new(f, Compression::default());
640+
let mut tar = Builder::new(gz);
641+
642+
let data = format!("token={github_pat}\n");
643+
let mut hdr = tar::Header::new_gnu();
644+
hdr.set_size(data.len() as u64);
645+
hdr.set_mode(0o644);
646+
hdr.set_cksum();
647+
tar.append_data(&mut hdr, "secret.txt", data.as_bytes())?;
648+
649+
tar.into_inner()?.finish()?;
650+
}
651+
652+
let (content, _tmp) = decompress_file_to_temp(&tgz)?;
653+
if let CompressedContent::ArchiveFiles(files) = content {
654+
let mut found = false;
655+
for (logical, path) in files {
656+
if logical.ends_with("payload.tgz!secret.txt") {
657+
let txt = std::fs::read_to_string(&path)?;
658+
assert!(txt.contains(github_pat));
659+
found = true;
660+
}
661+
}
662+
assert!(found, "did not find secret.txt in tgz ArchiveFiles");
663+
} else {
664+
panic!("expected ArchiveFiles for tgz archive, got {:?}", content);
665+
}
666+
667+
Ok(())
668+
}
669+
630670
/// 2) No-extract flag: just peel the `.gz` layer (no base_dir -- use NamedTempFile), and verify
631671
/// you get back a RawFile, whose contents are the tar archive itself.
632672
#[test]

src/scanner/validation.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,18 @@ fn build_cache_key(
983983
// For demonstration, we’ll do a simplistic approach
984984
// You can adapt from your existing logic
985985
let capture0 = om.captures.captures.get(0).map_or(String::new(), |c| c.raw_value().to_string());
986+
987+
if !om.rule.syntax().depends_on_rule.is_empty() {
988+
return format!(
989+
"{}|{}|{}|{}|{}",
990+
om.rule.name(),
991+
capture0,
992+
om.blob_id,
993+
om.matching_input_offset_span.start,
994+
om.matching_input_offset_span.end
995+
);
996+
}
997+
986998
format!("{}|{}|{}", om.rule.name(), capture0, dep_vars_str)
987999
}
9881000

0 commit comments

Comments
 (0)