Skip to content

Commit 609e57b

Browse files
authored
Merge pull request #8 from sangshuduo/feat/sangshuduo/random-pairs-with-s3
feat(commit): Add random_pairs_of_s3file tool
2 parents e9b2377 + d1c2765 commit 609e57b

File tree

5 files changed

+170
-0
lines changed

5 files changed

+170
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,4 @@ Cargo.lock
2020
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
2121
#.idea/
2222
.mentat
23+
.ai-commit.json

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ members = [
88
"hit_rate_converter",
99
"cat_xlsx",
1010
"find_files_in_list",
11+
"random_pairs_of_s3file"
1112
# Add other tools here
1213
]
1314
resolver = "2" # Add this line to specify resolver version 2

random_pairs_of_s3file/Cargo.toml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[package]
2+
name = "random_pairs_of_s3file"
3+
version = "0.1.0"
4+
edition = "2021"
5+
6+
[dependencies]
7+
aws-config = "1.5.13"
8+
aws-sdk-s3 = "1.68.0"
9+
tokio = { version = "1.29", features = ["macros", "rt-multi-thread"] }
10+
rand = "0.8"
11+
serde = { version = "1.0", features = ["derive"] }
12+
serde_json = "1.0"
13+
clap = { version = "4.2", features = ["derive"] }

random_pairs_of_s3file/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
## random_pairs_of_s3file Usage:
2+
3+
```shell
4+
Usage: random_pairs_of_s3file [OPTIONS] --num-pairs <NUM> --bucket <BUCKET> --directory <DIR> --url-prefix <PREFIX>
5+
6+
Options:
7+
--num-pairs <NUM> Number of pairs to generate
8+
--bucket <BUCKET> Name of the S3 bucket
9+
--directory <DIR> Directory (prefix) in the bucket (e.g. "image/")
10+
--url-prefix <PREFIX> URL prefix to form the final URL (e.g. "https://api.example.com/s3/api/v1/resource?url=s3://")
11+
-h, --help Print help
12+
-V, --version Print version
```

random_pairs_of_s3file/src/main.rs

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
use clap::Parser;
2+
use rand::seq::SliceRandom;
3+
use serde::Serialize;
4+
use std::error::Error;
5+
6+
// AWS SDK for Rust (1.x)
7+
use aws_config::{load_defaults, BehaviorVersion};
8+
use aws_sdk_s3::error::SdkError;
9+
use aws_sdk_s3::types::Object;
10+
use aws_sdk_s3::Client;
11+
12+
/// Command-line arguments (all required, no defaults)
13+
#[derive(Parser, Debug)]
14+
#[command(author, version, about)]
15+
struct Args {
16+
/// Number of pairs to generate
17+
#[arg(long, required = true)]
18+
num_pairs: usize,
19+
20+
/// Name of the S3 bucket
21+
#[arg(long, required = true)]
22+
bucket: String,
23+
24+
/// Directory (prefix) in the bucket (e.g. "image/")
25+
#[arg(long, required = true)]
26+
directory: String,
27+
28+
/// URL prefix to form the final URL (e.g. "https://api.example.com/s3/api/v1/resource?url=s3://")
29+
#[arg(long, required = true)]
30+
url_prefix: String,
31+
}
32+
33+
#[derive(Serialize)]
34+
struct PairsOutput {
35+
pairs: Vec<Pair>,
36+
}
37+
38+
#[derive(Serialize)]
39+
struct Pair {
40+
source: String,
41+
candidate: String,
42+
}
43+
44+
#[tokio::main]
45+
async fn main() -> Result<(), Box<dyn Error>> {
46+
let args = Args::parse();
47+
48+
let num_pairs = args.num_pairs;
49+
let bucket_name = &args.bucket;
50+
let directory_prefix = &args.directory;
51+
let url_prefix = &args.url_prefix;
52+
53+
let shared_config = load_defaults(BehaviorVersion::latest()).await;
54+
let s3_client = Client::new(&shared_config);
55+
56+
let resp = s3_client
57+
.list_objects_v2()
58+
.bucket(bucket_name)
59+
.prefix(directory_prefix)
60+
.send()
61+
.await;
62+
63+
let output = match resp {
64+
Ok(o) => o,
65+
Err(SdkError::ServiceError(e)) => {
66+
eprintln!("Service error: {:#?}", e);
67+
return Ok(());
68+
}
69+
Err(e) => {
70+
eprintln!("Other error listing objects: {:?}", e);
71+
return Ok(());
72+
}
73+
};
74+
75+
// Extract all object keys
76+
let objects: &[Object] = output.contents();
77+
let all_keys: Vec<String> = objects
78+
.iter()
79+
.filter_map(|obj| obj.key().map(str::to_string))
80+
.collect();
81+
82+
if all_keys.len() < 2 {
83+
eprintln!(
84+
"Not enough objects to generate pairs. Found only {} object(s).",
85+
all_keys.len()
86+
);
87+
return Ok(());
88+
}
89+
90+
// Generate all unique pairs (source, candidate) where source != candidate
91+
let mut all_pairs = Vec::new();
92+
for (i, source) in all_keys.iter().enumerate() {
93+
// check if source is empty
94+
if source.is_empty() {
95+
continue;
96+
}
97+
for (j, candidate) in all_keys.iter().enumerate() {
98+
// check if candidate is is_empty
99+
if candidate.is_empty() {
100+
continue;
101+
}
102+
if i != j {
103+
all_pairs.push(Pair {
104+
source: format!("{}{}/{}", url_prefix, bucket_name, source),
105+
candidate: format!("{}{}/{}", url_prefix, bucket_name, candidate),
106+
});
107+
}
108+
}
109+
}
110+
111+
let max_pairs_possible = all_pairs.len();
112+
if num_pairs > max_pairs_possible {
113+
eprintln!(
114+
"Requested {} pairs, but only {} unique pairs can be generated with {} objects.",
115+
num_pairs,
116+
max_pairs_possible,
117+
all_keys.len()
118+
);
119+
}
120+
121+
// Shuffle and take the requested number of pairs
122+
let mut rng = rand::thread_rng();
123+
all_pairs.shuffle(&mut rng);
124+
125+
let selected_pairs: Vec<Pair> = all_pairs.into_iter().take(num_pairs).collect();
126+
127+
if selected_pairs.len() < num_pairs {
128+
eprintln!(
129+
"Requested {} pairs, but only {} unique pairs could be generated with {} objects.",
130+
num_pairs,
131+
selected_pairs.len(),
132+
all_keys.len()
133+
);
134+
}
135+
136+
// Print JSON output
137+
let output_json = PairsOutput {
138+
pairs: selected_pairs,
139+
};
140+
println!("{}", serde_json::to_string_pretty(&output_json)?);
141+
142+
Ok(())
143+
}

0 commit comments

Comments
 (0)