
Commit 029b019

public: Fix PostgreSQL transaction panic.
This corrects a panic triggered by a particular PostgreSQL transaction: the transaction first added a new record to a table and then immediately deleted all records from that same table.

The panic was induced by the remove operation on RocksDB, which checks whether a record exists in RocksDB before queuing the removal in the RocksDB WriteBatch object. Because the insertion and deletion of the record were part of the same transaction, the record had never actually been written to RocksDB.

This change fixes the panic by removing pairs of records from the recordset that negate each other. That way, a record that was never written to RocksDB is never scheduled for removal.

Change-Id: I1270a3da00d352243116d62c237a4787c4d850f8
Reviewed-on: https://gerrit.readyset.name/c/readyset/+/5801
Tested-by: Buildkite CI
Reviewed-by: Tamas Juhasz <[email protected]>
Reviewed-by: Luke Osborne <[email protected]>
Parent: ec5ee70

File tree: 3 files changed, 175 insertions(+), 41 deletions(-)

- dataflow-state/src/persistent_state.rs
- readyset-common/src/records.rs
- readyset-psql/tests/fallback.rs
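To make the shape of the fix concrete before the diffs: a Positive record that is immediately negated by a matching Negative in the same batch is now dropped before anything is queued for RocksDB. A minimal sketch, assuming the `Records`, `Record`, and `DfValue` types behave as the new tests below demonstrate (this mirrors the test cases, not the exact call path):

```rust
// A transaction that inserts a row and then deletes it produces a
// Positive/Negative pair for the same values in one batch of Records.
let mut records: Records = vec![
    Record::Positive(vec![1.into()]), // INSERT INTO t VALUES (1)
    Record::Negative(vec![1.into()]), // DELETE FROM t
]
.into();

// remove_deleted() cancels the pair, so no removal of a never-written
// record is ever queued against RocksDB (the source of the panic).
records.remove_deleted();
assert!(records.is_empty());
```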

dataflow-state/src/persistent_state.rs

Lines changed: 70 additions & 41 deletions
@@ -643,6 +643,10 @@ impl State for PersistentState {
         replication_offset: Option<ReplicationOffset>,
     ) -> ReadySetResult<()> {
         invariant!(partial_tag.is_none(), "PersistentState can't be partial");
+
+        // Streamline the records by eliminating pairs that would negate each other.
+        records.remove_deleted();
+
         if records.len() == 0 && replication_offset.is_none() {
             return Ok(());
         }
@@ -666,47 +670,7 @@
                 }
             }
         }
-
-        let mut opts = rocksdb::WriteOptions::default();
-        if self.snapshot_mode.is_enabled()
-            // if we're setting the replication offset, that means we've snapshot the full table, so
-            // set sync to true there even if snapshot_mode is enabled, to make sure that makes it
-            // onto disk (not doing this *will* cause the write to get lost if the server restarts!)
-            && replication_offset.is_none()
-        {
-            opts.disable_wal(true);
-        } else {
-            let db = &self.db.handle();
-            if self.snapshot_mode.is_enabled() && replication_offset.is_some() {
-                // We are setting the replication offset, which is great, but all of our previous
-                // writes are not guaranteed to flush to disk even if the next write is synced. We
-                // therefore perform a flush before handling the next write.
-                //
-                // See: https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ
-                // Q: After a write following option.disableWAL=true, I write another record with
-                // options.sync=true, will it persist the previous write too?
-                // A: No. After the program crashes, writes with option.disableWAL=true will be
-                // lost, if they are not flushed to SST files.
-                for index in self.db.inner().indices.iter() {
-                    db.flush_cf(db.cf_handle(&index.column_family).unwrap())
-                        .map_err(|e| internal_err!("Flush to disk failed: {e}"))?;
-                }
-
-                db.flush()
-                    .map_err(|e| internal_err!("Flush to disk failed: {e}"))?;
-            }
-            opts.set_sync(true);
-        }
-
-        if let Some(offset) = replication_offset {
-            self.set_replication_offset(&mut batch, offset);
-        }
-
-        self.db
-            .handle()
-            .write_opt(batch, &opts)
-            .map_err(|e| internal_err!("Write failed: {e}"))?;
-
+        self.write_to_db(batch, &replication_offset)?;
         Ok(())
     }
@@ -1964,6 +1928,57 @@ impl PersistentState {
     ) -> Vec<RecordResult<'a>> {
         self.db.lookup_multi(columns, keys)
    }
+
+    /// Takes the provided batch and optionally a replication offset and writes to the RocksDB
+    /// database.
+    fn write_to_db(
+        &mut self,
+        batch: WriteBatch,
+        replication_offset: &Option<ReplicationOffset>,
+    ) -> ReadySetResult<()> {
+        let mut batch = batch;
+        let mut write_options = rocksdb::WriteOptions::default();
+        if self.snapshot_mode.is_enabled()
+            // if we're setting the replication offset, that means we've snapshot the full table, so
+            // set sync to true there even if snapshot_mode is enabled, to make sure that makes it
+            // onto disk (not doing this *will* cause the write to get lost if the server restarts!)
+            && replication_offset.is_none()
+        {
+            write_options.disable_wal(true);
+        } else {
+            let db = &self.db.handle();
+            if self.snapshot_mode.is_enabled() && replication_offset.is_some() {
+                // We are setting the replication offset, which is great, but all of our previous
+                // writes are not guaranteed to flush to disk even if the next write is synced. We
+                // therefore perform a flush before handling the next write.
+                //
+                // See: https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ
+                // Q: After a write following option.disableWAL=true, I write another record with
+                // options.sync=true, will it persist the previous write too?
+                // A: No. After the program crashes, writes with option.disableWAL=true will be
+                // lost, if they are not flushed to SST files.
+                for index in self.db.inner().indices.iter() {
+                    db.flush_cf(db.cf_handle(&index.column_family).unwrap())
+                        .map_err(|e| internal_err!("Flush to disk failed: {e}"))?;
+                }
+
+                db.flush()
+                    .map_err(|e| internal_err!("Flush to disk failed: {e}"))?;
+            }
+            write_options.set_sync(true);
+        }
+
+        if let Some(offset) = replication_offset {
+            self.set_replication_offset(&mut batch, offset.clone());
+        }
+
+        self.db
+            .handle()
+            .write_opt(batch, &write_options)
+            .map_err(|e| internal_err!("Write failed: {e}"))?;
+
+        Ok(())
+    }
 }

 /// Checks if the given index is unique for this base table.
@@ -2264,6 +2279,20 @@ mod tests {
         }
     }

+    #[test]
+    fn persistent_state_add_remove_same_record() {
+        let mut state = setup_persistent("persistent_state_multiple_indices", None);
+        let first: Vec<DfValue> = vec![10.into(), "Cat".into(), 1.into()];
+        let second: Vec<DfValue> = vec![10.into(), "Cat".into(), 1.into()];
+        let mut records: Records = Default::default();
+        records.push(Record::Positive(first));
+        records.push(Record::Negative(second));
+
+        state.add_key(Index::new(IndexType::HashMap, vec![0]), None);
+        state.add_key(Index::new(IndexType::HashMap, vec![1, 2]), None);
+        state.process_records(&mut records, None, None).unwrap();
+    }
+
     #[test]
     fn empty_column_set() {
         let mut state = setup_persistent("empty_column_set", None);
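The extracted `write_to_db` preserves a subtle RocksDB durability rule: a later write with `sync=true` does not persist earlier writes made with `disableWAL=true` unless the memtables are flushed first. A standalone sketch of that interplay, using the `rocksdb` crate directly (the database path and keys here are made up for illustration):

```rust
use rocksdb::{Options, WriteBatch, WriteOptions, DB};

fn main() -> Result<(), rocksdb::Error> {
    let mut opts = Options::default();
    opts.create_if_missing(true);
    let db = DB::open(&opts, "/tmp/wal_sync_demo")?;

    // Snapshot-mode style write: skip the WAL for throughput. On a crash,
    // this write is lost unless it has been flushed to SST files.
    let mut batch = WriteBatch::default();
    batch.put(b"row:1", b"cat");
    let mut bulk = WriteOptions::default();
    bulk.disable_wal(true);
    db.write_opt(batch, &bulk)?;

    // Flush memtables before the durable write; per the RocksDB FAQ, the
    // sync=true write below would NOT persist the un-WAL'd write above.
    db.flush()?;

    // Durable write, e.g. the one carrying the replication offset.
    let mut batch = WriteBatch::default();
    batch.put(b"replication_offset", b"42");
    let mut durable = WriteOptions::default();
    durable.set_sync(true);
    db.write_opt(batch, &durable)?;

    Ok(())
}
```

This is the reason `write_to_db` flushes every column family before the write that records the replication offset.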

readyset-common/src/records.rs

Lines changed: 73 additions & 0 deletions
@@ -148,6 +148,28 @@ impl Records {
    {
        self.has(q, false)
    }
+
+    // This function checks every Negative record and ensures that there isn't a Positive record
+    // before it that matches its content. If there is, then both the Negative and Positive
+    // records are removed. This will prevent unnecessary writes to RocksDB.
+    pub fn remove_deleted(&mut self) {
+        let mut i = 0;
+        while i < self.0.len() {
+            if let Record::Negative(val) = &self.0[i] {
+                for j in (0..i).rev() {
+                    if let Record::Positive(pos_val) = &self.0[j] {
+                        if pos_val == val {
+                            self.0.remove(j);
+                            i -= 1;
+                            self.0.remove(i); // index decreased due to previous removal
+                            break;
+                        }
+                    }
+                }
+            }
+            i += 1;
+        }
+    }
 }

 impl Deref for Records {
@@ -200,4 +222,55 @@ mod tests {
            Record::Positive(vec![1.into(), 2.into()]) < Record::Negative(vec![1.into(), 2.into()])
        )
    }
+
+    // Transactions sometimes include records that negate each other. The following test
+    // ensures that remove_deleted handles them correctly.
+    #[test]
+    fn test_simplify() {
+        let mut records: Records = vec![
+            Record::Positive(vec![1.into(), "2".into(), 3.into()]),
+            Record::Negative(vec![1.into(), "2".into(), 3.into()]),
+            Record::Positive(vec![4.into(), "5".into(), 6.into()]),
+            Record::Negative(vec![4.into(), "5".into(), 6.into()]),
+            Record::Positive(vec!["last".into(), 8.into(), 9.into()]),
+        ]
+        .into();
+
+        records.remove_deleted();
+
+        let mut result: Records =
+            vec![Record::Positive(vec!["last".into(), 8.into(), 9.into()])].into();
+
+        assert_eq!(records, result);
+
+        records = vec![
+            Record::Positive(vec![1.into(), "2".into(), 3.into()]),
+            Record::Negative(vec![9.into(), "2".into(), 3.into()]),
+            Record::Positive(vec![7.into(), "5".into(), 6.into()]),
+            Record::Negative(vec![1.into(), "2".into(), 3.into()]),
+            Record::Positive(vec!["last".into(), 8.into(), 9.into()]),
+        ]
+        .into();
+
+        records.remove_deleted();
+
+        result = vec![
+            Record::Negative(vec![9.into(), "2".into(), 3.into()]),
+            Record::Positive(vec![7.into(), "5".into(), 6.into()]),
+            Record::Positive(vec!["last".into(), 8.into(), 9.into()]),
+        ]
+        .into();
+
+        assert_eq!(records, result);
+
+        records = vec![
+            Record::Positive(vec![1.into(), "2".into(), 3.into()]),
+            Record::Negative(vec![1.into(), "2".into(), 3.into()]),
+        ]
+        .into();
+
+        records.remove_deleted();
+
+        assert!(records.is_empty());
+    }
 }
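One subtlety in `remove_deleted` worth noting: a Negative only cancels against a Positive that appears earlier in the batch (the inner loop scans `(0..i).rev()`). A Negative that comes first is kept, since the row it deletes may already exist in RocksDB. A hypothetical illustration:

```rust
// Order matters: the leading Negative refers to a row that may already be
// persisted, so neither record is cancelled here.
let mut records: Records = vec![
    Record::Negative(vec![1.into()]), // delete of a possibly pre-existing row
    Record::Positive(vec![1.into()]), // re-insert of the same values
]
.into();

records.remove_deleted();
assert_eq!(records.len(), 2);
```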

readyset-psql/tests/fallback.rs

Lines changed: 32 additions & 0 deletions
@@ -1406,3 +1406,35 @@ async fn show_proxied_queries_show_caches_query_text_matches() {

     shutdown_tx.shutdown().await;
 }
+
+#[tokio::test(flavor = "multi_thread")]
+#[serial]
+async fn insert_delete_a_record_in_the_same_transaction() {
+    readyset_tracing::init_test_logging();
+    let (config, _handle, shutdown_tx) = setup().await;
+    let mut client = connect(config).await;
+    client.simple_query("create table t(a int)").await.unwrap();
+    {
+        let transaction = client.transaction().await.unwrap();
+        // Begin transaction
+        transaction.batch_execute("BEGIN").await.unwrap();
+
+        // Value to be inserted
+        let val = 1;
+
+        transaction
+            .execute("INSERT INTO t VALUES($1)", &[&val])
+            .await
+            .unwrap();
+        transaction.execute("delete from t", &[]).await.unwrap();
+
+        // Commit the transaction
+        transaction.batch_execute("COMMIT").await.unwrap();
+    }
+
+    // Check if all the records have been deleted
+    let rows = client.query("SELECT COUNT(*) FROM t", &[]).await.unwrap();
+    let count: i64 = rows[0].get(0);
+    assert_eq!(count, 0);
+    shutdown_tx.shutdown().await;
+}
