Skip to content

Commit d96d7f2

Browse files
supermaxiste, Stefan Milosavljevic, cmdoret
authored
perf: copy on write (#66)
Co-authored-by: Stefan Milosavljevic <stefan.milosavljevic@sdsc.ethz.ch>
Co-authored-by: Cyril Matthey-Doret <cyril.mattheydoret@gmail.com>
1 parent d4587d0 commit d96d7f2

6 files changed

Lines changed: 86 additions & 32 deletions

File tree

.github/workflows/benchmark.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
name: Main and PR Pipeline
2+
3+
concurrency:
4+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
5+
cancel-in-progress: true
6+
7+
defaults:
8+
run:
9+
shell: bash
10+
11+
on:
12+
pull_request:
13+
types: [opened, synchronize]
14+
workflow_dispatch:
15+
16+
17+
jobs:
18+
benchmark:
19+
if: startsWith(github.event.pull_request.title, 'perf')
20+
environment: ci
21+
runs-on: ubuntu-latest
22+
steps:
23+
24+
- uses: actions/checkout@v4
25+
26+
- uses: ./.github/actions/setup-nix
27+
with:
28+
cachix_cache_name: "${{ secrets.CACHIX_CACHE_NAME }}"
29+
cachix_auth_token: "${{ secrets.CACHIX_AUTH_TOKEN }}"
30+
31+
- name: Run benchmark
32+
run: |
33+
just benchmark > profiling.md
34+
35+
- name: Post report as PR comment
36+
if: success()
37+
env:
38+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
39+
run: |
40+
gh pr comment ${{ github.event.pull_request.number }} --body-file profiling.md

justfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ develop *args:
1616
just nix-develop default "$@"
1717

1818
# Enter the CI Nix development shell for benchmarking.
19-
develop-bench *args:
20-
just nix-develop bench "$@"
19+
benchmark *args:
20+
cd {{root_dir}} && \
21+
just nix-develop bench bash ./tools/bench/benchmark.sh {{args}}
2122

2223
# Enter the CI Nix development shell.
2324
ci *args:

src/pseudo.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,15 @@ use crate::{
1010
index::TypeIndex,
1111
io,
1212
log::Logger,
13+
model::TripleMask,
1314
rdf_types::*,
1415
rules::{match_rules, Rules},
1516
};
1617

1718
// mask and encode input triple
1819
// NOTE: This will need the type-map to perform masking
1920
fn process_triple(
20-
triple: Triple,
21+
triple: TripleView,
2122
rules_config: &Rules,
2223
node_to_type: &mut TypeIndex,
2324
out: &mut impl Write,
@@ -26,8 +27,16 @@ fn process_triple(
2627
let mask = match_rules(&triple, rules_config, node_to_type);
2728

2829
let r = || -> std::io::Result<()> {
29-
out.write_all(hasher.pseudo_triple(&triple, mask).to_string().as_bytes())?;
30-
out.write_all(b" .\n")
30+
// If nothing needs to be pseudonymized, directly return triple
31+
if !mask.is_set(&TripleMask::SUBJECT) & !mask.is_set(&TripleMask::OBJECT) {
32+
out.write_all(triple.to_string().as_bytes())?;
33+
out.write_all(b" .\n")?;
34+
} else {
35+
let pseudo_triple = hasher.pseudo_triple(&triple.into(), mask);
36+
out.write_all(pseudo_triple.to_string().as_bytes())?;
37+
out.write_all(b" .\n")?;
38+
}
39+
Ok(())
3140
}();
3241

3342
if let Err(e) = r {
@@ -58,13 +67,7 @@ pub fn pseudonymize_graph(
5867
while !triples.is_end() {
5968
triples
6069
.parse_step(&mut |t: TripleView| {
61-
process_triple(
62-
t.into(),
63-
&rules,
64-
&mut type_index,
65-
&mut buf_output,
66-
&pseudonymizer,
67-
);
70+
process_triple(t, &rules, &mut type_index, &mut buf_output, &pseudonymizer);
6871
Result::<(), TurtleError>::Ok(())
6972
})
7073
.inspect_err(|e| {

src/rules.rs

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
1-
use crate::{rdf_types::*, uris::*};
1+
use crate::{index::TypeIndex, model::TripleMask, uris::*};
22
use ::std::collections::{HashMap, HashSet};
33
use anyhow::{Error, Result};
4+
use rio_api::model::*;
45
use serde::{Deserialize, Serialize};
56

6-
use crate::{index::TypeIndex, model::TripleMask};
7-
87
/// Rules for pseudonymizing nodes
98
#[derive(Serialize, Deserialize, Debug, Default)]
109
pub struct NodeRules {
@@ -15,7 +14,7 @@ pub struct NodeRules {
1514

1615
impl NodeRules {
1716
/// Validate each full URI specified in the rules for nodes
18-
pub fn check_uris(&self) -> Result<(), sophia_iri::InvalidIri> {
17+
pub fn check_uris(&self) -> Result<(), anyhow::Error> {
1918
let node_uris = keep_full_uris(&self.of_type);
2019
check_uris(&node_uris)
2120
}
@@ -56,7 +55,7 @@ pub struct ObjectRules {
5655

5756
impl ObjectRules {
5857
/// Validate each full URI specified in the rules for objects
59-
pub fn check_uris(&self) -> Result<(), sophia_iri::InvalidIri> {
58+
pub fn check_uris(&self) -> Result<(), anyhow::Error> {
6059
let on_predicate_uris = keep_full_uris(&self.on_predicate);
6160
check_uris(&on_predicate_uris)?;
6261
for (k, v) in self.on_type_predicate.iter() {
@@ -242,11 +241,13 @@ pub fn match_node_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeIndex
242241
let pseudo_subject = match &triple.subject {
243242
Subject::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
244243
Subject::BlankNode(_) => false,
244+
Subject::Triple(_) => panic!("RDF-star data not supported"),
245245
};
246246
let pseudo_object = match &triple.object {
247247
Term::NamedNode(n) => match_type(&n.to_string(), rules, type_map),
248248
Term::BlankNode(_) => false,
249249
Term::Literal(_) => false,
250+
Term::Triple(_) => panic!("RDF-star data not supported"),
250251
};
251252

252253
let mut mask = TripleMask::default();
@@ -279,6 +280,7 @@ pub fn match_object_rules(triple: &Triple, rules: &Rules, type_map: &mut TypeInd
279280
type_map,
280281
rules,
281282
),
283+
Subject::Triple(_) => panic!("RDF-star data not supported"),
282284
};
283285

284286
if pseudo_object {
@@ -532,7 +534,6 @@ mod tests {
532534
"
533535
));
534536
let expanded = rules.expand_rules_curie();
535-
println!("Expanded rules: {:?} ", expanded.as_ref().unwrap());
536537
assert!(
537538
expanded.unwrap().objects.on_type_predicate[expanded_rule_type]
538539
.contains(expanded_rule_predicate)

src/uris.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use anyhow::anyhow;
12
use curie::{ExpansionError, InvalidPrefixError, PrefixMapping};
23
use sophia_iri::Iri;
34
use std::{
@@ -149,6 +150,14 @@ pub fn check_uri(uri: &str) -> Result<Iri<&str>, sophia_iri::InvalidIri> {
149150
Iri::new(&uri[1..uri.len() - 2])
150151
}
151152

153+
pub fn check_full_uri(uri: &str) -> Result<(), anyhow::Error> {
154+
// Ensure that full URI starts with "<" and ends with ">"
155+
if !(uri.starts_with('<') && uri.ends_with('>')) {
156+
return Err(anyhow!("Full URI in rules must start and end with angle brackets <...>. Please format {} into <{}>", uri, uri));
157+
}
158+
Ok(())
159+
}
160+
152161
pub fn is_full_uri(uri: &str) -> bool {
153162
// Ensure that full URI starts with "<" and ends with ">"
154163
uri.starts_with('<') && uri.ends_with('>')
@@ -173,10 +182,11 @@ pub fn keep_full_uris(hash_set: &HashSet<String>) -> HashSet<String> {
173182
.collect();
174183
}
175184

176-
pub fn check_uris(hash_set: &HashSet<String>) -> Result<(), sophia_iri::InvalidIri> {
185+
pub fn check_uris(hash_set: &HashSet<String>) -> Result<(), anyhow::Error> {
177186
// Check if the URIs in the HashSet are valid
178187

179188
for uri in hash_set {
189+
check_full_uri(uri)?;
180190
check_uri(uri)?;
181191
}
182192
Ok(())

tools/bench/benchmark.sh

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Benchmark runtime and memory usage of tripsu
44
# Compares the working directory version against a baseline branch (main by default)
55

6-
set -euo pipefail
6+
set -eu
77

88
### Final output path
99
OUTPUT="profiling.md"
@@ -34,15 +34,15 @@ just build "${BUILD_ARGS[@]}"
3434
COMP_BIN="./target/${PROFILE}/tripsu"
3535

3636
# setup data
37-
DATA_URL="https://ftp.uniprot.org/pub/databases/uniprot/current_release/rdf/proteomes.rdf.xz"
38-
INPUT="/tmp/proteomes.nt"
37+
DATA_URL="https://zenodo.org/records/7638511/files/dblp.nt.gz?download=1"
38+
INPUT="/tmp/dblp-sample.nt"
3939

4040
# Download data if needed
4141
if [ ! -f ${INPUT} ]; then
42-
curl "${DATA_URL}" |
43-
xz -dc - |
44-
rdfpipe-rs -i rdf-xml -o nt - \
45-
>"${INPUT}" || rm "${INPUT}"
42+
curl -sNL "${DATA_URL}" |
43+
gzip -dc |
44+
head -n 1000000 \
45+
> "${INPUT}" || rm "${INPUT}"
4646
fi
4747

4848
# setup config
@@ -54,16 +54,15 @@ cat <<EOF >"${RULES}"
5454
5555
nodes:
5656
of_type:
57-
- "http://purl.uniprot.org/core/Proteome"
58-
- "http://purl.uniprot.org/core/Strain"
57+
- "<https://dblp.org/rdf/schema#Informal>"
5958
6059
objects:
6160
on_type_predicate:
62-
"http://purl.uniprot.org/core/Submission_Citation":
63-
- "http://purl.uniprot.org/core/author"
61+
"<https://dblp.org/rdf/schema#Book>":
62+
- "<https://dblp.org/rdf/schema#isbn>"
6463
6564
on_predicate:
66-
- "http://purl.org/dc/terms/identifier"
65+
- "<https://dblp.org/rdf/schema#authoredBy>"
6766
6867
EOF
6968

@@ -93,7 +92,7 @@ mem_prof() {
9392
heap_out=$(mktemp)
9493
echo -n "$name: "
9594
# shellcheck disable=SC2086
96-
heaptrack -o "${heap_out}" ${cmd} >/dev/null
95+
heaptrack --record-only -o "${heap_out}" ${cmd} >/dev/null
9796
heaptrack_print "${heap_out}.zst" |
9897
grep '^peak heap memory'
9998
}

0 commit comments

Comments
 (0)