Skip to content

Commit 9f9d58d

Browse files
authored
add check that docker image supports compute architecture of cluster (#2)
1 parent a56ecc7 commit 9f9d58d

11 files changed

Lines changed: 265 additions & 27 deletions

File tree

Cargo.lock

Lines changed: 95 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coman/.config/config.toml

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
[cscs]
2-
system = "daint"
2+
current_system = "daint"
33

44
image = "ubuntu"
55

@@ -17,3 +17,11 @@ edf_file_template = """
1717
image = "{{edf_image}}"
1818
mounts = ["${SCRATCH}:/scratch"]
1919
"""
20+
21+
[cscs.systems]
22+
23+
[cscs.systems.daint]
24+
architecture = ["arm64"]
25+
26+
[cscs.systems.eiger]
27+
architecture = ["amd64"]

coman/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,8 @@ tabled = { version = "0.20.0", features = ["macros"] }
6565
nom = "8.0.0"
6666
tera = "1.20.1"
6767
inquire = "0.9.1"
68+
oci-distribution = "0.11.0"
69+
docker_credential = "1.3.2"
6870

6971
[build-dependencies]
7072
anyhow = "1.0.90"

coman/src/app/user_events.rs

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,6 @@ pub enum UserEvent {
1111
Cscs(CscsEvent),
1212
Error(String),
1313
Info(String),
14-
None, // this is mainly used to return a nop result that keeps a port alive, as returning no Event stops the port
1514
}
1615

1716
impl PartialEq for UserEvent {

coman/src/config.rs

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
#![allow(dead_code)] // Remove this once you start using the code
22

3-
use std::{env, path::PathBuf};
3+
use std::{collections::HashMap, env, path::PathBuf};
44

55
use color_eyre::Result;
66
use directories::ProjectDirs;
@@ -9,6 +9,11 @@ use serde::{Deserialize, Serialize};
99

1010
const DEFAULT_CONFIG_TOML: &str = include_str!("../.config/config.toml");
1111

12+
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
13+
pub struct SystemDescription {
14+
pub architecture: Vec<String>,
15+
}
16+
1217
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
1318
pub struct AppConfig {
1419
#[serde(default)]
@@ -20,7 +25,7 @@ pub struct AppConfig {
2025
#[derive(Clone, Debug, Serialize, Deserialize, Default)]
2126
pub struct CscsConfig {
2227
#[serde(default)]
23-
pub system: String,
28+
pub current_system: String,
2429
#[serde(default)]
2530
pub name: Option<String>,
2631
#[serde(default)]
@@ -31,7 +36,11 @@ pub struct CscsConfig {
3136
pub edf_file_template: String,
3237
#[serde(default)]
3338
pub command: Vec<String>,
39+
40+
#[serde(default)]
41+
pub systems: HashMap<String, SystemDescription>,
3442
}
43+
3544
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
3645
pub struct Config {
3746
#[serde(default, flatten)]

coman/src/cscs/api_client.rs

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,7 @@ impl From<CSCSFileSystem> for FileSystem {
7474

7575
#[derive(Debug, Eq, Clone, PartialEq, PartialOrd, Ord, Display)]
7676
pub enum JobStatus {
77+
Pending,
7778
Running,
7879
Finished,
7980
Cancelled,
@@ -96,6 +97,7 @@ impl From<JobModelOutput> for Job {
9697
"FAILED" => JobStatus::Failed,
9798
"COMPLETED" => JobStatus::Finished,
9899
"CANCELLED" => JobStatus::Cancelled,
100+
"PENDING" => JobStatus::Pending,
99101
other => panic!("got job status: {}", other),
100102
},
101103
user: value.user.unwrap_or("".to_string()),

coman/src/cscs/handlers.rs

Lines changed: 44 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,8 @@ use crate::{
66
cscs::{
77
api_client::{CscsApi, FileSystemType, Job, JobDetail, System},
88
oauth2::{
9-
CLIENT_ID_SECRET_NAME, CLIENT_SECRET_SECRET_NAME,
10-
client_credentials_login, finish_cscs_device_login, start_cscs_device_login,
9+
CLIENT_ID_SECRET_NAME, CLIENT_SECRET_SECRET_NAME, client_credentials_login,
10+
finish_cscs_device_login, start_cscs_device_login,
1111
},
1212
},
1313
util::{
@@ -61,17 +61,22 @@ pub async fn cscs_job_list() -> Result<Vec<Job>> {
6161
Ok(access_token) => {
6262
let api_client = CscsApi::new(access_token.0).unwrap();
6363
let config = Config::new().unwrap();
64-
api_client.list_jobs(&config.cscs.system, Some(true)).await
64+
api_client
65+
.list_jobs(&config.cscs.current_system, Some(true))
66+
.await
6567
}
6668
Err(e) => Err(e),
6769
}
6870
}
71+
6972
pub async fn cscs_job_details(job_id: i64) -> Result<Option<JobDetail>> {
7073
match get_access_token().await {
7174
Ok(access_token) => {
7275
let api_client = CscsApi::new(access_token.0).unwrap();
7376
let config = Config::new().unwrap();
74-
api_client.get_job(&config.cscs.system, job_id).await
77+
api_client
78+
.get_job(&config.cscs.current_system, job_id)
79+
.await
7580
}
7681
Err(e) => Err(e),
7782
}
@@ -86,9 +91,9 @@ pub async fn cscs_start_job(
8691
Ok(access_token) => {
8792
let api_client = CscsApi::new(access_token.0).unwrap();
8893
let config = Config::new().unwrap();
89-
let user_info = api_client.get_userinfo(&config.cscs.system).await?;
90-
let system = api_client.get_system(&config.cscs.system).await?;
91-
let scratch = match system {
94+
let user_info = api_client.get_userinfo(&config.cscs.current_system).await?;
95+
let current_system = api_client.get_system(&config.cscs.current_system).await?;
96+
let scratch = match current_system {
9297
Some(system) => PathBuf::from(
9398
system
9499
.file_systems
@@ -101,7 +106,7 @@ pub async fn cscs_start_job(
101106
None => {
102107
return Err(eyre!(
103108
"couldn't get system description for {}",
104-
config.cscs.system
109+
config.cscs.current_system
105110
));
106111
}
107112
};
@@ -113,21 +118,43 @@ pub async fn cscs_start_job(
113118
let environment_path = base_path.join("environment.toml");
114119
let environment_template = config.cscs.edf_file_template;
115120
tera.add_raw_template("environment.toml", &environment_template)?;
121+
122+
let docker_image = image.unwrap_or(config.cscs.image.try_into()?);
123+
let meta = docker_image.inspect().await?;
124+
if let Some(system_info) = config.cscs.systems.get(&config.cscs.current_system) {
125+
let mut compatible = false;
126+
for sys_platform in system_info.architecture.iter() {
127+
if meta.platforms.contains(&sys_platform.clone().into()) {
128+
compatible = true;
129+
}
130+
}
131+
132+
if !compatible {
133+
return Err(eyre!(
134+
"System {} only supports images with architecture(s) '{}' but the supplied image is for architecture(s) '{}'",
135+
config.cscs.current_system,
136+
system_info.architecture.join(","),
137+
meta.platforms
138+
.iter()
139+
.map(|p| p.to_string())
140+
.collect::<Vec<String>>()
141+
.join(",")
142+
));
143+
}
144+
}
145+
116146
let mut context = tera::Context::new();
117-
context.insert(
118-
"edf_image",
119-
&image.unwrap_or(config.cscs.image.try_into()?).to_edf(),
120-
);
147+
context.insert("edf_image", &docker_image.to_edf());
121148
let environment_file = tera.render("environment.toml", &context)?;
122149
api_client
123-
.mkdir(&config.cscs.system, base_path.clone())
150+
.mkdir(&config.cscs.current_system, base_path.clone())
124151
.await?;
125152
api_client
126-
.chmod(&config.cscs.system, base_path.clone(), "700")
153+
.chmod(&config.cscs.current_system, base_path.clone(), "700")
127154
.await?;
128155
api_client
129156
.upload(
130-
&config.cscs.system,
157+
&config.cscs.current_system,
131158
environment_path.clone(),
132159
environment_file.into_bytes(),
133160
)
@@ -150,15 +177,15 @@ pub async fn cscs_start_job(
150177
let script = tera.render("script.sh", &context)?;
151178
api_client
152179
.upload(
153-
&config.cscs.system,
180+
&config.cscs.current_system,
154181
script_path.clone(),
155182
script.into_bytes(),
156183
)
157184
.await?;
158185

159186
// start job
160187
api_client
161-
.start_job(&config.cscs.system, &name, script_path)
188+
.start_job(&config.cscs.current_system, &name, script_path)
162189
.await?;
163190
Ok(())
164191
}

0 commit comments

Comments
 (0)