perf: copy on write (#66)

supermaxiste · Stefan Milosavljevic · cmdoret · web-flow · commit d96d7f2bb805 · 2025-05-14T12:59:04.000+02:00
Co-authored-by: Stefan Milosavljevic <stefan.milosavljevic@sdsc.ethz.ch>
Co-authored-by: Cyril Matthey-Doret <cyril.mattheydoret@gmail.com>
diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
@@ -0,0 +1,40 @@
+name: Benchmark # benchmarks PRs whose title starts with "perf" and comments the report
+
+concurrency:
+  group: benchmark-${{ github.event.pull_request.number || github.ref }} # workflow-specific key
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash
+
+on:
+  pull_request:
+    types: [opened, synchronize]
+  workflow_dispatch:
+
+jobs:
+  benchmark:
+    if: startsWith(github.event.pull_request.title, 'perf')
+    environment: ci
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read # needed by actions/checkout
+      pull-requests: write # needed by `gh pr comment`
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: ./.github/actions/setup-nix
+        with:
+          cachix_cache_name: "${{ secrets.CACHIX_CACHE_NAME }}"
+          cachix_auth_token: "${{ secrets.CACHIX_AUTH_TOKEN }}"
+
+      - name: Run benchmark
+        run: |
+          just benchmark > profiling.md
+
+      - name: Post report as PR comment
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          gh pr comment ${{ github.event.pull_request.number }} --body-file profiling.md
diff --git a/justfile b/justfile
@@ -16,8 +16,9 @@ develop *args:
     just nix-develop default "$@"
 
-# Enter the CI Nix development shell for benchmarking.
+# Run the benchmark script inside the bench Nix development shell.
-develop-bench *args:
-    just nix-develop bench "$@"
+benchmark *args:
+    cd {{root_dir}} && \
+        just nix-develop bench bash ./tools/bench/benchmark.sh {{args}}
 
 # Enter the CI Nix development shell.
 ci *args:
diff --git a/src/pseudo.rs b/src/pseudo.rs
@@ -10,14 +10,15 @@ use crate::{
     index::TypeIndex,
     io,
     log::Logger,
+    model::TripleMask,
     rdf_types::*,
     rules::{match_rules, Rules},
 };
 
 // mask and encode input triple
 // NOTE: This will need the type-map to perform masking
 fn process_triple(
-    triple: Triple,
+    triple: TripleView,
     rules_config: &Rules,
     node_to_type: &mut TypeIndex,
     out: &mut impl Write,
@@ -26,8 +27,16 @@ fn process_triple(
     let mask = match_rules(&triple, rules_config, node_to_type);
 
     let r = || -> std::io::Result<()> {
-        out.write_all(hasher.pseudo_triple(&triple, mask).to_string().as_bytes())?;
-        out.write_all(b" .\n")
+        // If nothing needs to be pseudonymized, directly return triple
+        if !mask.is_set(&TripleMask::SUBJECT) & !mask.is_set(&TripleMask::OBJECT) {
+            out.write_all(triple.to_string().as_bytes())?;
+            out.write_all(b" .\n")?;
+        } else {
+            let pseudo_triple = hasher.pseudo_triple(&triple.into(), mask);
+            out.write_all(pseudo_triple.to_string().as_bytes())?;
+            out.write_all(b" .\n")?;
+        }
+        Ok(())
     }();
 
     if let Err(e) = r {
@@ -58,13 +67,7 @@ pub fn pseudonymize_graph(
     while !triples.is_end() {
         triples
             .parse_step(&mut |t: TripleView| {
-                process_triple(
-                    t.into(),
-                    &rules,
-                    &mut type_index,
-                    &mut buf_output,
-                    &pseudonymizer,
-                );
+                process_triple(t, &rules, &mut type_index, &mut buf_output, &pseudonymizer);
                 Result::<(), TurtleError>::Ok(())
             })
             .inspect_err(|e| {
diff --git a/src/rules.rs b/src/rules.rs
@@ -1,10 +1,9 @@
-use crate::{rdf_types::*, uris::*};
+use crate::{index::TypeIndex, model::TripleMask, uris::*};
 use ::std::collections::{HashMap, HashSet};
 use anyhow::{Error, Result};
+use rio_api::model::*;
 use serde::{Deserialize, Serialize};
 
-use crate::{index::TypeIndex, model::TripleMask};
-
 /// Rules for pseudonymizing nodes
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct NodeRules {
@@ -15,7 +14,7 @@ pub struct NodeRules {
 
 impl NodeRules {
     /// Validate each full URI specified in the rules for nodes
-    pub fn check_uris(&self) -> Result<(), sophia_iri::InvalidIri> {
+    pub fn check_uris(&self) -> Result<(), anyhow::Error> {
         let node_uris = keep_full_uris(&self.of_type);
         check_uris(&node_uris)
     }
@@ -56,7 +55,7 @@ pub struct ObjectRules {
 
 impl ObjectRules {
     /// Validate each full URI specified in the rules for objects
-    pub fn check_uris(&self) -> Result<(), sophia_iri::InvalidIri> {
+    pub fn check_uris(&self) -> Result<(), anyhow::Error> {
         let on_predicate_uris = keep_full_uris(&self.on_predicate);
         check_uris(&on_predicate_uris)?;
         for (k, v) in self.on_type_predicate.iter() {
@@ -242,11 +241,13 @@ pub fn match_node_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex
     let pseudo_subject = match &triple.subject {
         Subject::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
         Subject::BlankNode(_) => false,
+        Subject::Triple(_) => panic!("RDF-star data not supported"),
     };
     let pseudo_object = match &triple.object {
         Term::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
         Term::BlankNode(_) => false,
         Term::Literal(_) => false,
+        Term::Triple(_) => panic!("RDF-star data not supported"),
     };
 
     let mut mask = TripleMask::default();
@@ -279,6 +280,7 @@ pub fn match_object_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeInd
             type_map,
             rules,
         ),
+        Subject::Triple(_) => panic!("RDF-star data not supported"),
     };
 
     if pseudo_object {
@@ -532,7 +534,6 @@ mod tests {
         "
         ));
         let expanded = rules.expand_rules_curie();
-        println!("Expanded rules: {:?} ", expanded.as_ref().unwrap());
         assert!(
             expanded.unwrap().objects.on_type_predicate[expanded_rule_type]
                 .contains(expanded_rule_predicate)
diff --git a/src/uris.rs b/src/uris.rs
@@ -1,3 +1,4 @@
+use anyhow::anyhow;
 use curie::{ExpansionError, InvalidPrefixError, PrefixMapping};
 use sophia_iri::Iri;
 use std::{
@@ -149,6 +150,14 @@ pub fn check_uri(uri: &str) -> Result<Iri<&str>, sophia_iri::InvalidIri> {
     Iri::new(&uri[1..uri.len() - 2])
 }
 
+pub fn check_full_uri(uri: &str) -> Result<(), anyhow::Error> {
+    // Ensure that full URI starts with "<" and ends with ">"
+    if !(uri.starts_with('<') && uri.ends_with('>')) {
+        return Err(anyhow!("Full URI in rules must start and end with angle brackets <...>. Please format {} into <{}>", uri, uri));
+    }
+    Ok(())
+}
+
 pub fn is_full_uri(uri: &str) -> bool {
     // Ensure that full URI starts with "<" and ends with ">"
     uri.starts_with('<') && uri.ends_with('>')
@@ -173,10 +182,11 @@ pub fn keep_full_uris(hash_set: &HashSet<String>) -> HashSet<String> {
         .collect();
 }
 
-pub fn check_uris(hash_set: &HashSet<String>) -> Result<(), sophia_iri::InvalidIri> {
+pub fn check_uris(hash_set: &HashSet<String>) -> Result<(), anyhow::Error> {
     // Check if the URIs in the HashSet are valid
 
     for uri in hash_set {
+        check_full_uri(uri)?;
         check_uri(uri)?;
     }
     Ok(())
diff --git a/tools/bench/benchmark.sh b/tools/bench/benchmark.sh
@@ -3,7 +3,7 @@
 # Benchmark runtime and memory usage of tripsu
 # Compares the working directory version against a baseline branch (main by default)
 
-set -euo pipefail
+set -eu
 
 ### Final output path
 OUTPUT="profiling.md"
@@ -34,15 +34,15 @@ just build "${BUILD_ARGS[@]}"
 COMP_BIN="./target/${PROFILE}/tripsu"
 
 # setup data
-DATA_URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/proteomes.rdf.xz"
-INPUT="/tmp/proteomes.nt"
+DATA_URL="https://zenodo.org/records/7638511/files/dblp.nt.gz?download=1"
+INPUT="/tmp/dblp-sample.nt"
 
 # Download data if needed
 if [ ! -f ${INPUT} ]; then
-    curl "${DATA_URL}" |
-        xz -dc - |
-        rdfpipe-rs -i rdf-xml -o nt - \
-            >"${INPUT}" || rm "${INPUT}"
+    curl -sNL "${DATA_URL}" |
+        gzip -dc |
+        head -n 1000000 \
+            > "${INPUT}" || rm "${INPUT}"
 fi
 
 # setup config
@@ -54,16 +54,15 @@ cat <<EOF >"${RULES}"
 
 nodes:
   of_type:
-    - "http://purl.uniprot.org/core/Proteome"
-    - "http://purl.uniprot.org/core/Strain"
+    - "<https://dblp.org/rdf/schema#Informal>"
 
 objects:
   on_type_predicate:
-    "http://purl.uniprot.org/core/Submission_Citation":
-      - "http://purl.uniprot.org/core/author"
+    "<https://dblp.org/rdf/schema#Book>":
+      - "<https://dblp.org/rdf/schema#isbn>"
   
   on_predicate:
-    - "http://purl.org/dc/terms/identifier"
+    - "<https://dblp.org/rdf/schema#authoredBy>"
 
 EOF
 
@@ -93,7 +92,7 @@ mem_prof() {
     heap_out=$(mktemp)
     echo -n "$name: "
     # shellcheck disable=SC2086
-    heaptrack -o "${heap_out}" ${cmd} >/dev/null
+    heaptrack --record-only -o "${heap_out}" ${cmd} >/dev/null
     heaptrack_print "${heap_out}.zst" |
         grep '^peak heap memory'
 }