Skip to content

Commit 9aa8ea6

Browse files
committed
use content hash
1 parent 1fbf636 commit 9aa8ea6

File tree

6 files changed

+94
-29
lines changed

6 files changed

+94
-29
lines changed

Cargo.lock

Lines changed: 32 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ oxigraph = "0.5"
3838
memmap2 = "0.9"
3939
zstd = "0.13"
4040
rand = "0.9"
41+
blake3 = "1.5"
4142

4243
rdf5d = { version = "0.4.1-a1", path = "rdf5d", features = ["oxigraph", "zstd"] }
4344

lib/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,4 @@ tempdir = "0.3.7"
3333
pretty-bytes = "0.2.2"
3434
fs2 = "0.4"
3535
url = "2.5"
36+
blake3.workspace = true

lib/src/api.rs

Lines changed: 45 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,15 @@ use chrono::prelude::*;
1515
use oxigraph::model::{Dataset, Graph, NamedNode, NamedNodeRef, NamedOrBlankNodeRef, TripleRef};
1616
use oxigraph::store::Store;
1717
use petgraph::visit::EdgeRef;
18-
use std::io::{BufReader, Write};
18+
use std::fs::File;
19+
use std::io::{BufReader, Read, Write};
1920
use std::path::Path;
2021
use std::path::PathBuf;
2122

2223
use crate::io::GraphIO;
2324
use crate::ontology::{GraphIdentifier, Ontology, OntologyLocation};
2425
use anyhow::{anyhow, Result};
26+
use blake3;
2527
use log::{debug, error, info, warn};
2628
use petgraph::graph::{Graph as DiGraph, NodeIndex};
2729
use std::collections::{HashMap, HashSet, VecDeque};
@@ -867,48 +869,47 @@ impl OntoEnv {
867869
}
868870
};
869871

870-
// if the source modified is missing, then we assume it has been updated
871-
let source_modified = self
872-
.io
873-
.source_last_modified(ontology.id())
874-
.unwrap_or(Utc::now());
875-
// if the ontology has no modified time, then we assume it has never been updated
876872
let last_updated = ontology
877873
.last_updated
878874
.unwrap_or(Utc.timestamp_opt(0, 0).unwrap());
879875

880-
if source_modified > last_updated {
881-
if let OntologyLocation::File(path) = location {
882-
// Mtime is newer, so now check if content is different
883-
let new_graph = match self.io.read_file(path) {
884-
Ok(g) => g,
876+
match location {
877+
OntologyLocation::File(path) => {
878+
// Prefer a fast content hash comparison to avoid mtime granularity issues.
879+
let current_hash = match hash_file(path) {
880+
Ok(h) => h,
885881
Err(e) => {
886882
warn!(
887-
"Could not read file for update check {}: {}",
883+
"Could not hash file for update check {}: {}",
888884
path.display(),
889885
e
890886
);
891-
return true; // If we can't read it, assume it's updated
887+
return true; // assume updated if we cannot hash
892888
}
893889
};
894-
let old_graph = match self.io.get_graph(ontology.id()) {
895-
Ok(g) => g,
896-
Err(e) => {
897-
warn!(
898-
"Could not get graph from store for update check {}: {}",
899-
ontology.id(),
900-
e
901-
);
902-
return true; // If we can't get the old one, assume updated
890+
891+
if let Some(stored_hash) = ontology.content_hash() {
892+
if stored_hash == current_hash {
893+
return false;
903894
}
904-
};
905-
return new_graph != old_graph;
895+
return true;
896+
}
897+
898+
// Fallback to mtime when legacy records lack a stored hash.
899+
let source_modified = self
900+
.io
901+
.source_last_modified(ontology.id())
902+
.unwrap_or(Utc::now());
903+
source_modified > last_updated
904+
}
905+
_ => {
906+
let source_modified = self
907+
.io
908+
.source_last_modified(ontology.id())
909+
.unwrap_or(Utc::now());
910+
source_modified > last_updated
906911
}
907-
// For non-file locations, we can't easily check content, so stick with mtime.
908-
return true;
909912
}
910-
911-
false
912913
})
913914
.map(|(graphid, _)| graphid.clone())
914915
.collect()
@@ -1792,6 +1793,21 @@ impl OntoEnv {
17921793
}
17931794
}
17941795

1796+
fn hash_file(path: &Path) -> Result<String> {
1797+
let file = File::open(path)?;
1798+
let mut reader = BufReader::new(file);
1799+
let mut hasher = blake3::Hasher::new();
1800+
let mut buf = [0u8; 8192];
1801+
loop {
1802+
let n = reader.read(&mut buf)?;
1803+
if n == 0 {
1804+
break;
1805+
}
1806+
hasher.update(&buf[..n]);
1807+
}
1808+
Ok(hasher.finalize().to_hex().to_string())
1809+
}
1810+
17951811
#[cfg(test)]
17961812
mod tests {
17971813
use super::*;

lib/src/io.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use crate::ontology::{GraphIdentifier, Ontology, OntologyLocation};
66
use crate::options::Overwrite;
77
use crate::util::get_file_contents;
88
use anyhow::{anyhow, Error, Result};
9+
use blake3;
910
use chrono::prelude::*;
1011
use fs2::FileExt;
1112
use log::{error, info};
@@ -63,6 +64,8 @@ fn add_ontology_bytes(
6364
let tmp_store = load_staging_store_from_bytes(bytes, format)?;
6465
let staging_id = GraphIdentifier::new_with_location(staging_graph.as_ref(), location.clone());
6566
let mut ontology = Ontology::from_store(&tmp_store, &staging_id, strict)?;
67+
let hash = blake3::hash(bytes).to_hex().to_string();
68+
ontology.set_content_hash(hash);
6669
ontology.with_last_updated(Utc::now());
6770
let id = ontology.id();
6871
let graphname: GraphName = id.graphname()?;

lib/src/ontology.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,8 @@ pub struct Ontology {
334334
version_properties: HashMap<NamedNode, String>,
335335
#[serde(default)]
336336
namespace_map: HashMap<String, String>,
337+
#[serde(default)]
338+
content_hash: Option<String>,
337339
}
338340

339341
// impl display; name + location + last updated, then indented version properties
@@ -366,6 +368,7 @@ impl Default for Ontology {
366368
last_updated: None,
367369
version_properties: HashMap::new(),
368370
namespace_map: HashMap::new(),
371+
content_hash: None,
369372
}
370373
}
371374
}
@@ -382,6 +385,14 @@ impl Ontology {
382385
self.location = Some(location);
383386
}
384387

388+
/// Stores `hash` as this ontology's content hash, overwriting any previously stored value.
pub fn set_content_hash(&mut self, hash: String) {
389+
self.content_hash = Some(hash);
390+
}
391+
392+
/// Returns the stored content hash as a borrowed string slice, or `None` when no hash is stored.
pub fn content_hash(&self) -> Option<&str> {
393+
self.content_hash.as_deref()
394+
}
395+
385396
/// Returns a reference to this ontology's graph identifier.
pub fn id(&self) -> &GraphIdentifier {
386397
&self.id
387398
}
@@ -610,6 +621,7 @@ impl Ontology {
610621
version_properties,
611622
last_updated: None,
612623
namespace_map,
624+
content_hash: None,
613625
})
614626
}
615627

0 commit comments

Comments
 (0)