Skip to content

Commit 384f788

Browse files
committed
fix(coprocessor): persist upload tasks and mark existing ciphertexts as uploaded
- Always persist upload tasks to the ciphertext_digest table
- If ciphertexts are already uploaded to S3, mark them as uploaded in the database to keep state consistent
1 parent 551f57a commit 384f788

File tree

3 files changed

+139
-99
lines changed

3 files changed

+139
-99
lines changed

coprocessor/fhevm-engine/sns-executor/src/aws_upload.rs

Lines changed: 38 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -74,15 +74,31 @@ pub(crate) async fn process_s3_uploads(
7474
Some(task) => task,
7575
None => return Ok(()),
7676
};
77-
78-
let trx = insert_and_lock(task.tenant_id, task.handle.clone(), &pool).await?;
77+
debug!("Received task, handle: {}", hex::encode(&task.handle));
78+
79+
let mut trx_lock = pool.begin().await?;
80+
task.enqueue_upload_task(&mut trx_lock).await?;
81+
82+
if let Err(err) = sqlx::query!(
83+
"SELECT * FROM ciphertext_digest
84+
WHERE handle = $2 AND tenant_id = $1 AND
85+
(ciphertext128 IS NULL OR ciphertext IS NULL)
86+
FOR UPDATE SKIP LOCKED",
87+
task.tenant_id,
88+
task.handle,
89+
)
90+
.fetch_one(trx_lock.as_mut())
91+
.await {
92+
error!("Failed to lock pending uploads {}, handle: {}", err, compact_hex(&task.handle));
93+
trx_lock.rollback().await?;
94+
continue;
95+
}
7996

8097
if !is_ready.load(Ordering::SeqCst) {
8198
// If the S3 setup is not ready, we need to wait for it to become ready
8299
// before we can continue spawning upload jobs
83100
info!("Upload task skipped, S3 connection still not ready");
84-
// Queue the uploading job in the database
85-
trx.commit().await?;
101+
trx_lock.commit().await?;
86102
continue;
87103
}
88104

@@ -111,7 +127,7 @@ pub(crate) async fn process_s3_uploads(
111127
// Spawn a new task to upload the ciphertexts
112128
let h = tokio::spawn(async move {
113129
let s = task.otel.child_span("upload_s3");
114-
if let Err(err) = upload_ciphertexts(trx, task, &client, &conf).await {
130+
if let Err(err) = upload_ciphertexts(trx_lock, task, &client, &conf).await {
115131
if let ExecutionError::S3TransientError(_) = err {
116132
ready_flag.store(false, Ordering::SeqCst );
117133
info!("S3 setup is not ready, due to transient error: {}", err);
@@ -146,40 +162,6 @@ pub(crate) async fn process_s3_uploads(
146162
}
147163
}
148164

149-
async fn insert_and_lock(
150-
tenant_id: i32,
151-
handle: Vec<u8>,
152-
pool: &PgPool,
153-
) -> Result<Transaction<'static, Postgres>, ExecutionError> {
154-
let mut trx = pool.begin().await?;
155-
156-
sqlx::query!(
157-
"INSERT INTO ciphertext_digest (tenant_id, handle)
158-
VALUES ($1, $2) ON CONFLICT DO NOTHING",
159-
tenant_id,
160-
&handle,
161-
)
162-
.execute(trx.as_mut())
163-
.await?;
164-
165-
trx.commit().await?;
166-
167-
let mut trx = pool.begin().await?;
168-
169-
sqlx::query!(
170-
"SELECT * FROM ciphertext_digest
171-
WHERE handle = $2 AND tenant_id = $1 AND
172-
(ciphertext128 IS NULL OR ciphertext IS NULL)
173-
FOR UPDATE SKIP LOCKED",
174-
tenant_id,
175-
handle,
176-
)
177-
.fetch_all(trx.as_mut())
178-
.await?;
179-
180-
Ok(trx)
181-
}
182-
183165
enum UploadResult {
184166
CtType128((Vec<u8>, BoxedSpan)),
185167
CtType64((Vec<u8>, BoxedSpan)),
@@ -230,6 +212,14 @@ async fn upload_ciphertexts(
230212
.send(),
231213
UploadResult::CtType128((ct128_digest.clone(), span)),
232214
));
215+
} else {
216+
info!(
217+
"ct128 already exists in S3, handle: {}, digest: {}",
218+
handle_as_hex,
219+
hex::encode(&ct128_digest)
220+
);
221+
222+
task.update_ct128_uploaded(&mut trx, ct128_digest).await?;
233223
}
234224
}
235225

@@ -261,8 +251,14 @@ async fn upload_ciphertexts(
261251
.send(),
262252
UploadResult::CtType64((ct64_digest.clone(), span)),
263253
));
254+
} else {
255+
info!(
256+
"ct64 already exists in S3, handle: {}, digest: {}",
257+
handle_as_hex,
258+
hex::encode(&ct64_digest)
259+
);
264260

265-
// TODO: Update DB
261+
task.update_ct64_uploaded(&mut trx, ct64_digest).await?;
266262
}
267263
}
268264

@@ -287,35 +283,7 @@ async fn upload_ciphertexts(
287283
telemetry::end_span_with_err(span, err.to_string());
288284
transient_error = Some(ExecutionError::S3TransientError(err.to_string()));
289285
} else {
290-
sqlx::query!(
291-
"UPDATE ciphertext_digest
292-
SET ciphertext128 = $1
293-
WHERE handle = $2",
294-
digest,
295-
task.handle
296-
)
297-
.execute(trx.as_mut())
298-
.await?;
299-
300-
// Reset ciphertext128 as the ct128 has been successfully uploaded to S3
301-
// NB: For reclaiming the disk-space in DB, we rely on auto vacuuming in
302-
// Postgres
303-
304-
sqlx::query!(
305-
"UPDATE ciphertexts
306-
SET ciphertext128 = NULL
307-
WHERE handle = $1",
308-
task.handle
309-
)
310-
.execute(trx.as_mut())
311-
.await?;
312-
313-
info!(
314-
"Uploaded ct128, handle: {}, digest: {}",
315-
handle_as_hex,
316-
compact_hex(&digest)
317-
);
318-
286+
task.update_ct128_uploaded(&mut trx, digest).await?;
319287
telemetry::end_span_with_timestamp(span, finish_time);
320288
}
321289
}
@@ -329,28 +297,13 @@ async fn upload_ciphertexts(
329297
telemetry::end_span_with_err(span, err.to_string());
330298
transient_error = Some(ExecutionError::S3TransientError(err.to_string()));
331299
} else {
332-
sqlx::query!(
333-
"UPDATE ciphertext_digest
334-
SET ciphertext = $1
335-
WHERE handle = $2",
336-
digest,
337-
task.handle
338-
)
339-
.execute(trx.as_mut())
340-
.await?;
341-
info!(
342-
"Uploaded ct64, handle: {}, digest: {}",
343-
handle_as_hex,
344-
compact_hex(&digest)
345-
);
346-
300+
task.update_ct64_uploaded(&mut trx, digest).await?;
347301
telemetry::end_span_with_timestamp(span, finish_time);
348302
}
349303
}
350304
}
351305
}
352306

353-
// TODO: Move this notify in DB query
354307
sqlx::query("SELECT pg_notify($1, '')")
355308
.bind(EVENT_CIPHERTEXTS_UPLOADED)
356309
.execute(trx.as_mut())

coprocessor/fhevm-engine/sns-executor/src/executor.rs

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -120,11 +120,13 @@ async fn fetch_and_execute_sns_tasks(
120120
}
121121
};
122122

123-
if let Some(mut tasks) = query_sns_tasks(&mut db_txn, conf.batch_limit).await? {
124-
process_tasks(&mut tasks, keys, tx)?;
125-
update_computations_status(&mut db_txn, &tasks).await?;
126-
update_ciphertext128(&mut db_txn, &tasks).await?;
127-
notify_ciphertext128_ready(&mut db_txn, &conf.notify_channel).await?;
123+
let trx = &mut db_txn;
124+
125+
if let Some(mut tasks) = query_sns_tasks(trx, conf.batch_limit).await? {
126+
process_tasks(trx, &mut tasks, keys, tx).await?;
127+
update_computations_status(trx, &tasks).await?;
128+
update_ciphertext128(trx, &tasks).await?;
129+
notify_ciphertext128_ready(trx, &conf.notify_channel).await?;
128130
db_txn.commit().await?;
129131
} else {
130132
db_txn.rollback().await?;
@@ -229,13 +231,12 @@ async fn get_remaining_tasks(
229231
}
230232

231233
/// Processes the tasks by decompressing and transforming ciphertexts.
232-
fn process_tasks(
234+
async fn process_tasks(
235+
db_txn: &mut Transaction<'_, Postgres>,
233236
tasks: &mut [HandleItem],
234237
keys: &KeySet,
235238
tx: &Sender<HandleItem>,
236239
) -> Result<(), ExecutionError> {
237-
set_server_key(keys.server_key.clone());
238-
239240
for task in tasks.iter_mut() {
240241
let ct64_compressed = task.ct64_compressed.as_ref().ok_or_else(|| {
241242
ExecutionError::MissingCiphertext128(format!(
@@ -245,6 +246,7 @@ fn process_tasks(
245246
})?;
246247

247248
let s = task.otel.child_span("decompress_ct64");
249+
set_server_key(keys.server_key.clone());
248250
let ct = decompress_ct(&task.handle, ct64_compressed)?;
249251
telemetry::end_span(s);
250252

@@ -277,7 +279,10 @@ fn process_tasks(
277279
}
278280
};
279281

280-
// Start uploading the ciphertexts sooner than later
282+
// Enqueue the task for upload in DB
283+
task.enqueue_upload_task(db_txn).await?;
284+
285+
// Start uploading the ciphertexts
281286
//
282287
// The service must continue running the squashed noise algorithm,
283288
// regardless of the availability of the upload worker.
@@ -294,8 +299,6 @@ fn process_tasks(
294299

295300
error!({target = "worker", action = "review"}, "Failed to send task to upload worker: {err}");
296301
telemetry::end_span_with_err(task.otel.child_span("send_task"), err.to_string());
297-
298-
// TODO: Insert ciphertext
299302
}
300303
}
301304

coprocessor/fhevm-engine/sns-executor/src/lib.rs

Lines changed: 87 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ mod tests;
88

99
use std::time::Duration;
1010

11-
use fhevm_engine_common::{telemetry::OtelTracer, types::FhevmError};
11+
use fhevm_engine_common::{telemetry::OtelTracer, types::FhevmError, utils::compact_hex};
1212
use serde::{Deserialize, Serialize};
13+
use sqlx::{Postgres, Transaction};
1314
use thiserror::Error;
1415
use tokio::sync::mpsc::{self, Sender};
1516
use tokio_util::sync::CancellationToken;
@@ -91,6 +92,89 @@ pub struct HandleItem {
9192
pub otel: OtelTracer,
9293
}
9394

95+
impl HandleItem {
96+
/// Enqueues the upload task into the database
97+
///
98+
/// A row inserted into the `ciphertext_digest` table means that both ciphertexts
99+
/// (ct64 and ct128) are ready to be uploaded to S3.
100+
pub(crate) async fn enqueue_upload_task(
101+
&self,
102+
db_txn: &mut Transaction<'_, Postgres>,
103+
) -> Result<(), ExecutionError> {
104+
sqlx::query!(
105+
"INSERT INTO ciphertext_digest (tenant_id, handle)
106+
VALUES ($1, $2) ON CONFLICT DO NOTHING",
107+
self.tenant_id,
108+
&self.handle,
109+
)
110+
.execute(db_txn.as_mut())
111+
.await?;
112+
113+
Ok(())
114+
}
115+
116+
pub(crate) async fn update_ct128_uploaded(
117+
&self,
118+
trx: &mut Transaction<'_, Postgres>,
119+
digest: Vec<u8>,
120+
) -> Result<(), ExecutionError> {
121+
sqlx::query!(
122+
"UPDATE ciphertext_digest
123+
SET ciphertext128 = $1
124+
WHERE handle = $2",
125+
digest,
126+
self.handle
127+
)
128+
.execute(trx.as_mut())
129+
.await?;
130+
131+
// Reset ciphertext128 as the ct128 has been successfully uploaded to S3
132+
// NB: For reclaiming the disk-space in DB, we rely on auto vacuuming in
133+
// Postgres
134+
135+
sqlx::query!(
136+
"UPDATE ciphertexts
137+
SET ciphertext128 = NULL
138+
WHERE handle = $1",
139+
self.handle
140+
)
141+
.execute(trx.as_mut())
142+
.await?;
143+
144+
info!(
145+
"Mark ct128 as uploaded, handle: {}, digest: {}",
146+
compact_hex(&self.handle),
147+
compact_hex(&digest)
148+
);
149+
150+
Ok(())
151+
}
152+
153+
pub(crate) async fn update_ct64_uploaded(
154+
&self,
155+
trx: &mut Transaction<'_, Postgres>,
156+
digest: Vec<u8>,
157+
) -> Result<(), ExecutionError> {
158+
sqlx::query!(
159+
"UPDATE ciphertext_digest
160+
SET ciphertext = $1
161+
WHERE handle = $2",
162+
digest,
163+
self.handle
164+
)
165+
.execute(trx.as_mut())
166+
.await?;
167+
168+
info!(
169+
"Mark ct64 as uploaded, handle: {}, digest: {}",
170+
compact_hex(&self.handle),
171+
compact_hex(&digest)
172+
);
173+
174+
Ok(())
175+
}
176+
}
177+
94178
#[derive(Error, Debug)]
95179
pub enum ExecutionError {
96180
#[error("Conversion error: {0}")]
@@ -126,8 +210,8 @@ pub enum ExecutionError {
126210
#[error("Deserialization error: {0}")]
127211
DeserializationError(String),
128212

129-
#[error("Bucket S3 upload: {0}")]
130-
BucketNotExist(String),
213+
#[error("Bucket not found {0}")]
214+
BucketNotFound(String),
131215

132216
#[error("S3 Transient error: {0}")]
133217
S3TransientError(String),

0 commit comments

Comments
 (0)