diff --git a/.github/workflows/artifacts.yml b/.github/workflows/artifacts.yml index b564b1e..39f4673 100644 --- a/.github/workflows/artifacts.yml +++ b/.github/workflows/artifacts.yml @@ -282,22 +282,33 @@ jobs: runs-on: ubuntu-latest environment: publish steps: - - uses: actions/checkout@v3 + - name: Checkout sources + uses: actions/checkout@v3 with: submodules: true - - run: rustup update - - uses: katyo/publish-crates@v2 + - name: Publish ontoenv crate + uses: katyo/publish-crates@v2 with: path: './lib' registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }} env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} - - uses: katyo/publish-crates@v2 + - name: Publish ontoenv-cli crate + uses: katyo/publish-crates@v2 with: path: './cli' registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }} env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + - name: Wait for crate index propagation + run: sleep 45 + - name: Publish pyontoenv crate + uses: katyo/publish-crates@v2 + with: + path: './python' + registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }} + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} full_archive: if: github.event_name == 'release' runs-on: ubuntu-latest diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 30d97bc..b0c77ff 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -42,6 +42,6 @@ jobs: run: uv run maturin build --release --features abi3 working-directory: ./python - name: Test python package - run: uv run python -m unittest test.py + run: uv run python -m unittest discover -s tests working-directory: ./python diff --git a/Cargo.lock b/Cargo.lock index c964286..c703a0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,9 +23,9 @@ dependencies = [ [[package]] name = "addr2line" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" dependencies = [ "gimli", ] @@ -68,18 +68,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - -[[package]] -name = "android-tzdata" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -91,9 +79,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.19" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -106,9 +94,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" @@ -121,29 +109,35 @@ dependencies = [ [[package]] name = "anstyle-query" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "anstyle-wincon" -version = "3.0.9" +version = "3.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "anyhow" -version = "1.0.98" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + +[[package]] +name = "atomic-waker" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "atty" @@ -164,9 +158,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "backtrace" -version = "0.3.75" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" dependencies = [ "addr2line", "cfg-if", @@ -174,7 +168,7 @@ dependencies = [ "miniz_oxide", "object", "rustc-demangle", - "windows-targets 0.52.6", + "windows-link", ] [[package]] @@ -185,9 +179,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bindgen" -version = "0.71.1" +version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ "bitflags", "cexpr", @@ -200,14 +194,14 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "bitflags" -version = "2.9.1" +version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" [[package]] name = "bitmaps" @@ -265,10 +259,11 @@ checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" [[package]] name = "cc" -version = "1.2.27" +version = "1.2.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" +checksum = "e1d05d92f4b1fd76aad469d46cdd858ca761576082cd37df81416691e50199fb" dependencies = [ + "find-msvc-tools", "jobserver", "libc", "shlex", @@ -294,9 +289,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.1" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" [[package]] name = "cfg_aliases" @@ -306,11 +301,10 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.41" +version = "0.4.42" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" dependencies = [ - "android-tzdata", "iana-time-zone", "js-sys", "num-traits", @@ -359,9 +353,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.40" +version = "4.5.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" +checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" dependencies = [ "clap_builder", "clap_derive", @@ -369,9 +363,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.40" +version = "4.5.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" +checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" dependencies = [ "anstream", "anstyle", @@ -381,14 +375,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.40" +version = "4.5.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" +checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -452,8 +446,18 @@ version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core 0.21.3", + "darling_macro 0.21.3", ] [[package]] @@ -467,7 +471,21 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.104", + "syn 2.0.106", +] + +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.106", ] [[package]] @@ -476,9 +494,20 @@ version = "0.20.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ - "darling_core", + "darling_core 0.20.11", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core 0.21.3", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -503,12 +532,12 @@ checksum = "5440d1dc8ea7cae44cda3c64568db29bfa2434aba51ae66a50c00488841a65a3" [[package]] name = "deranged" -version = "0.4.0" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" +checksum = "a41953f86f8a05768a6cda24def994fd2f424b04ec5c719cf89989779f199071" dependencies = [ "powerfmt", - "serde", + "serde_core", ] [[package]] @@ -526,10 +555,10 @@ version = 
"0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ - "darling", + "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -539,7 +568,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -560,14 +589,14 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "dyn-clone" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" [[package]] name = "educe" @@ -590,7 +619,7 @@ dependencies = [ "enum-ordinalize 4.3.0", "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -609,7 +638,7 @@ dependencies = [ "num-traits", "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -629,7 +658,7 @@ checksum = "0d28318a75d4aead5c4db25382e8ef717932d0346600cacae6357eb5941bc5ff" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -663,12 +692,12 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.13" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.1", ] [[package]] @@ -677,6 +706,12 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "find-msvc-tools" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0399f9d26e5191ce32c498bebd31e7a3ceabc2745f0ac54af3f335126c3f24b3" + [[package]] name = "fixedbitset" version = "0.5.7" @@ -697,9 +732,9 @@ checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" [[package]] name = "form_urlencoded" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" dependencies = [ "percent-encoding", ] @@ -776,7 +811,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -821,9 +856,9 @@ dependencies = [ [[package]] name = "getopts" -version = "0.2.23" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cba6ae63eb948698e300f645f87c70f76630d505f23b8907cf1e193ee85048c1" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" dependencies = [ "unicode-width", ] @@ -851,21 +886,21 @@ dependencies = [ "js-sys", "libc", "r-efi", - "wasi 0.14.2+wasi-0.2.4", + "wasi 0.14.7+wasi-0.2.4", "wasm-bindgen", ] [[package]] name = "gimli" -version = 
"0.31.1" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" [[package]] name = "glob" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "half" @@ -903,15 +938,19 @@ checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" [[package]] name = "hashbrown" -version = "0.15.4" +version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "allocator-api2", - "equivalent", "foldhash", ] +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + [[package]] name = "heck" version = "0.5.0" @@ -981,18 +1020,20 @@ checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" [[package]] name = "hyper" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" +checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" dependencies = [ + "atomic-waker", "bytes", "futures-channel", - "futures-util", + "futures-core", "http", "http-body", "httparse", "itoa", "pin-project-lite", + "pin-utils", "smallvec", "tokio", "want", @@ -1017,9 +1058,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.14" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc2fdfdbff08affe55bb779f33b053aa1fe5dd5b54c257343c17edfa55711bdb" +checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" dependencies = [ "base64", "bytes", @@ -1041,9 +1082,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.63" +version = "0.1.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -1157,9 +1198,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "1.0.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" dependencies = [ "idna_adapter", "smallvec", @@ -1203,13 +1244,14 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.10.0" +version = "2.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" +checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" dependencies = [ "equivalent", - "hashbrown 0.15.4", + "hashbrown 0.16.0", "serde", + "serde_core", ] [[package]] @@ -1218,6 +1260,17 @@ version = 
"2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd" +[[package]] +name = "io-uring" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" +dependencies = [ + "bitflags", + "cfg-if", + "libc", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -1297,14 +1350,14 @@ checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "jobserver" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ "getrandom 0.3.3", "libc", @@ -1312,9 +1365,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.77" +version = "0.3.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +checksum = "ec48937a97411dcb524a265206ccd4c90bb711fca92b2792c407f268825b9305" dependencies = [ "once_cell", "wasm-bindgen", @@ -1356,7 +1409,7 @@ dependencies = [ "contextual", "educe 0.4.23", "futures", - "indexmap 2.10.0", + "indexmap 2.11.4", "iref", "json-ld-context-processing", "json-ld-core", @@ -1396,7 +1449,7 @@ dependencies = [ "educe 0.4.23", "futures", "hashbrown 0.13.2", - "indexmap 2.10.0", + "indexmap 2.11.4", "iref", "json-ld-syntax", "json-syntax", @@ -1423,7 +1476,7 @@ dependencies = [ "contextual", "educe 0.4.23", "futures", - "indexmap 2.10.0", + "indexmap 2.11.4", "iref", "json-ld-context-processing", "json-ld-core", @@ -1441,7 +1494,7 @@ version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "344f0a6042745d76a358b808878ae0d125a472de30b3eabc9eb82c6cf7f0c23e" dependencies = [ - "indexmap 2.10.0", + "indexmap 2.11.4", "iref", "json-ld-core", "json-syntax", @@ -1461,7 +1514,7 @@ dependencies = [ "decoded-char", "educe 0.4.23", "hashbrown 0.13.2", - "indexmap 2.10.0", + "indexmap 2.11.4", "iref", "json-syntax", "langtag", @@ -1519,18 +1572,18 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical" -version = "7.0.4" +version = "7.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ed980ff02623721dc334b9105150b66d0e1f246a92ab5a2eca0335d54c48f6" +checksum = "1bc8a009b2ff1f419ccc62706f04fe0ca6e67b37460513964a3dfdb919bb37d6" dependencies = [ "lexical-core", ] [[package]] name = "lexical-core" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -1541,69 +1594,62 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" dependencies = [ "lexical-parse-integer", "lexical-util", - "static_assertions", ] [[package]] name = 
"lexical-parse-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] name = "lexical-util" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" -dependencies = [ - "static_assertions", -] +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" [[package]] name = "lexical-write-float" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" dependencies = [ "lexical-util", "lexical-write-integer", - "static_assertions", ] [[package]] name = "lexical-write-integer" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" dependencies = [ "lexical-util", - "static_assertions", ] [[package]] name = "libc" -version = "0.2.174" +version = "0.2.176" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" +checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" [[package]] name = "libloading" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" dependencies = [ "cfg-if", - "windows-targets 0.53.2", + "windows-link", ] [[package]] @@ -1635,15 +1681,15 @@ dependencies = [ "proc-macro2", "quote", "static-iref", - "syn 2.0.104", + "syn 2.0.106", "thiserror 1.0.69", ] [[package]] name = "linux-raw-sys" -version = "0.9.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" @@ -1681,9 +1727,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.27" +version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" [[package]] name = "lru-slab" @@ -1703,9 +1749,18 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.5" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" + +[[package]] +name = "memmap2" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" +dependencies = [ + "libc", +] [[package]] name = "memoffset" @@ -1811,9 +1866,9 @@ dependencies = [ 
[[package]] name = "object" -version = "0.36.7" +version = "0.37.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" dependencies = [ "memchr", ] @@ -1832,7 +1887,7 @@ checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" [[package]] name = "ontoenv" -version = "0.3.1-a3" +version = "0.4.0-a10" dependencies = [ "anyhow", "chrono", @@ -1847,6 +1902,7 @@ dependencies = [ "oxigraph", "petgraph", "pretty-bytes", + "rdf5d", "regex", "reqwest", "serde", @@ -1860,7 +1916,7 @@ dependencies = [ [[package]] name = "ontoenv-cli" -version = "0.3.1-a3" +version = "0.4.0-a10" dependencies = [ "anyhow", "chrono", @@ -1893,25 +1949,25 @@ dependencies = [ [[package]] name = "oxigraph" -version = "0.4.11" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b57a5334aab94d88e1d24b238c093c5efb0d309614b16ac920f23ad77ee77d" +checksum = "d48182a4fd14994e4be3cd90d0062997aa5ecd33aaeef05bc8e5ceaaf62aa36e" dependencies = [ "dashmap", - "getrandom 0.2.16", + "getrandom 0.3.3", "libc", "oxiri", "oxrdf", "oxrdfio", "oxrocksdb-sys", "oxsdatatypes", - "rand 0.8.5", + "rand 0.9.2", "rustc-hash", "siphasher", "sparesults", "spareval", "spargebra", - "thiserror 2.0.12", + "thiserror 2.0.17", ] [[package]] @@ -1931,64 +1987,63 @@ checksum = "54b4ed3a7192fa19f5f48f99871f2755047fabefd7f222f12a1df1773796a102" [[package]] name = "oxjsonld" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13a1a66dc569350f3f4e5eff8a8e1a72b0c9e6ad395bb5805493cb7a2fda185f" +checksum = "99298d735f570f370228c1d4a5775c5aac6f9cddbeb808dc4b2c1abb25a6590d" dependencies = [ "json-event-parser", "oxiri", "oxrdf", - "thiserror 2.0.12", + "thiserror 2.0.17", ] [[package]] name = "oxrdf" -version = "0.2.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a04761319ef84de1f59782f189d072cbfc3a9a40c4e8bded8667202fbd35b02a" +checksum = "f69f61f7f27474c7e64f26c353c9377626ad4af2e209345a81ca59ef4d7e1cda" dependencies = [ "oxilangtag", "oxiri", "oxsdatatypes", - "rand 0.8.5", - "thiserror 2.0.12", + "rand 0.9.2", + "thiserror 2.0.17", ] [[package]] name = "oxrdfio" -version = "0.1.8" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14d33dd87769786a0bb7de342865e33bf0c6e9872fa76f1ede23e944fdc77898" +checksum = "a654e0073af3dc0faf85ec53138413a5a72c4cb315a7226a646ca8eaf9f95646" dependencies = [ "oxjsonld", "oxrdf", "oxrdfxml", "oxttl", - "thiserror 2.0.12", + "thiserror 2.0.17", ] [[package]] name = "oxrdfxml" -version = "0.1.7" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8d4bf9c5331127f01efbd1245d90fd75b7c546a97cb3e95461121ce1ad5b1c8" +checksum = "018dbf4af41fea91a78a250771361564981ae46f6c7489962a855c0cb66160fa" dependencies = [ "oxilangtag", "oxiri", "oxrdf", "quick-xml", - "thiserror 2.0.12", + "thiserror 2.0.17", ] [[package]] name = "oxrocksdb-sys" -version = "0.4.11" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16430f45934d678cb6f9823e7c1bfdbdce9025f670ad85b642e46ffe5609e6ff" +checksum = "c1bf8590ff65101bc659dba81fa296e89a6e832171b5f885ff85522e8bff6257" dependencies = [ "bindgen", "cc", - "libc", ] [[package]] @@ -1997,20 +2052,20 @@ version = "0.2.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "06fa874d87eae638daae9b4e3198864fe2cce68589f227c0b2cf5b62b1530516" dependencies = [ - "thiserror 2.0.12", + "thiserror 2.0.17", ] [[package]] name = "oxttl" -version = "0.1.8" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d385f1776d7cace455ef6b7c54407838eff902ca897303d06eb12a26f4cf8a0" +checksum = "c0038cc87e23f95f8b3b94a9f12aeb07b0872b215dee3e005e77a1e93718258f" dependencies = [ "memchr", "oxilangtag", "oxiri", "oxrdf", - "thiserror 2.0.12", + "thiserror 2.0.17", ] [[package]] @@ -2065,9 +2120,9 @@ checksum = "132dca9b868d927b35b5dd728167b2dee150eb1ad686008fc71ccb298b776fca" [[package]] name = "percent-encoding" -version = "2.3.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "permutohedron" @@ -2077,13 +2132,13 @@ checksum = "b687ff7b5da449d39e418ad391e5e08da53ec334903ddbb921db208908fc372c" [[package]] name = "petgraph" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", - "hashbrown 0.15.4", - "indexmap 2.10.0", + "hashbrown 0.15.5", + "indexmap 2.11.4", "serde", "serde_derive", ] @@ -2100,6 +2155,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "portable-atomic" version = "1.11.1" @@ -2117,9 +2178,9 @@ dependencies = [ [[package]] name = "potential_utf" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +checksum = "84df19adbe5b5a0782edcab45899906947ab039ccf4573713735ee7de1e6b08a" dependencies = [ "zerovec", ] @@ -2160,12 +2221,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.35" +version = "0.2.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061c1221631e079b26479d25bbf2275bfe5917ae8419cd7e34f13bfc2aa7539a" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -2194,9 +2255,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.95" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" dependencies = [ "unicode-ident", ] @@ -2247,7 +2308,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -2260,17 +2321,18 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "pyontoenv" -version = "0.3.1-a3" +version = "0.4.0-a10" dependencies = [ "anyhow", "env_logger", "log", 
"ontoenv", + "ontoenv-cli", "oxigraph", "pyo3", "pyo3-build-config", @@ -2287,9 +2349,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.8" +version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626214629cda6781b6dc1d316ba307189c85ba657213ce642d9c77670f8202c8" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" dependencies = [ "bytes", "cfg_aliases", @@ -2299,7 +2361,7 @@ dependencies = [ "rustc-hash", "rustls", "socket2", - "thiserror 2.0.12", + "thiserror 2.0.17", "tokio", "tracing", "web-time", @@ -2307,20 +2369,20 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.12" +version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", "getrandom 0.3.3", "lru-slab", - "rand 0.9.1", + "rand 0.9.2", "ring", "rustc-hash", "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.12", + "thiserror 2.0.17", "tinyvec", "tracing", "web-time", @@ -2328,23 +2390,23 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcebb1209ee276352ef14ff8732e24cc2b02bbac986cd74a4c81bcb2f9881970" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" dependencies = [ "cfg_aliases", "libc", "once_cell", "socket2", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.40" +version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" dependencies = [ "proc-macro2", ] @@ -2370,35 +2432,14 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.5" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - -[[package]] -name = "rand" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" -dependencies = [ - "rand_chacha 0.9.0", + "rand_chacha", "rand_core 0.9.3", ] -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.4", -] - [[package]] name = "rand_chacha" version = "0.9.0" @@ -2429,9 +2470,6 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.16", -] [[package]] name = "rand_core" @@ -2471,7 +2509,7 @@ checksum = "4ccfa6b3af8f44db8d700038d47a9e8c8cc4126cdcafc069e82116903420631d" dependencies = [ "contextual", "educe 0.5.11", - "indexmap 2.10.0", + "indexmap 2.11.4", "iref", "langtag", "raw-btree", @@ -2481,6 +2519,16 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "rdf5d" +version = "0.4.0-a10" +dependencies = [ + 
"clap", + "memmap2", + "oxigraph", + "zstd", +] + [[package]] name = "rdrand" version = "0.4.0" @@ -2492,38 +2540,38 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.13" +version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" +checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ "bitflags", ] [[package]] name = "ref-cast" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a0ae411dbe946a674d89546582cea4ba2bb8defac896622d6496f14c23ba5cf" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" dependencies = [ "ref-cast-impl", ] [[package]] name = "ref-cast-impl" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1165225c21bff1f3bbce98f5a1f889949bc902d3575308cc7b0de30b4f6d27c7" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "regex" -version = "1.11.1" +version = "1.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" dependencies = [ "aho-corasick", "memchr", @@ -2533,9 +2581,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" dependencies = [ "aho-corasick", "memchr", @@ -2544,9 +2592,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" [[package]] name = "remove_dir_all" @@ -2565,9 +2613,9 @@ checksum = "51743d3e274e2b18df81c4dc6caf8a5b8e15dbe799e0dca05c7617380094e884" [[package]] name = "reqwest" -version = "0.12.20" +version = "0.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabf4c97d9130e2bf606614eb937e86edac8292eaa6f422f995d7e8de1eb1813" +checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" dependencies = [ "base64", "bytes", @@ -2619,9 +2667,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" [[package]] name = "rustc-hash" @@ -2631,22 +2679,22 @@ checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustix" -version = "1.0.7" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +checksum = "cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys 0.59.0", + "windows-sys 0.61.1", 
] [[package]] name = "rustls" -version = "0.23.28" +version = "0.23.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7160e3e10bf4535308537f3c4e1641468cd0e485175d6163087c0393c7d46643" +checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" dependencies = [ "once_cell", "ring", @@ -2668,9 +2716,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.3" +version = "0.103.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +checksum = "e10b3f4191e8a80e6b43eebabfac91e5dcecebb27a71f04e820c47ec41d314bf" dependencies = [ "ring", "rustls-pki-types", @@ -2679,9 +2727,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" @@ -2722,6 +2770,18 @@ dependencies = [ "serde_json", ] +[[package]] +name = "schemars" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -2730,34 +2790,45 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "serde_json" -version = "1.0.140" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ "itoa", "memchr", "ryu", "serde", + "serde_core", ] [[package]] @@ -2784,18 +2855,18 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.13.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf65a400f8f66fb7b0552869ad70157166676db75ed8181f8104ea91cf9d0b42" +checksum = "6093cd8c01b25262b84927e0f7151692158fab02d961e04c979d3903eba7ecc5" dependencies = [ "base64", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.10.0", - "schemars", - "serde", - "serde_derive", + "indexmap 2.11.4", + "schemars 0.9.0", + "schemars 1.0.4", + "serde_core", "serde_json", "serde_with_macros", "time", @@ -2803,14 +2874,14 @@ dependencies = [ [[package]] name = 
"serde_with_macros" -version = "3.13.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81679d9ed988d5e9a5e6531dc3f2c28efbd639cbd1dfb628df08edea6004da77" +checksum = "a7e6c180db0816026a61afa1cff5344fb7ebded7e4d3062772179f2501481c27" dependencies = [ - "darling", + "darling 0.21.3", "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -2859,15 +2930,15 @@ dependencies = [ [[package]] name = "slab" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "smallstr" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63b1aefdf380735ff8ded0b15f31aab05daf1f70216c01c02a12926badd1df9d" +checksum = "862077b1e764f04c251fe82a2ef562fd78d7cadaeb072ca7c2bcaf7217b1ff3b" dependencies = [ "smallvec", ] @@ -2880,32 +2951,32 @@ checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] name = "socket2" -version = "0.5.10" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "sparesults" -version = "0.2.5" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f478f5ead16b6136bccee7a52ea43a615f8512086708f515e26ce33e0b184036" +checksum = "5c5a58e0d210eeec459dc99de3144fbb1ba32686cd7cb55cbfabe866fc6d149c" dependencies = [ "json-event-parser", "memchr", "oxrdf", "quick-xml", - "thiserror 2.0.12", + "thiserror 2.0.17", ] [[package]] name = "spareval" -version = "0.1.4" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d8ff5f1159e7416ed99160b962fa780851dddee133ef56e6b08a94023ea2c7" +checksum = "909dde81fd27deb229ccec00eb95671bede9b59d9cd24900318209be8d069868" dependencies = [ "hex", "json-event-parser", @@ -2913,7 +2984,7 @@ dependencies = [ "oxiri", "oxrdf", "oxsdatatypes", - "rand 0.8.5", + "rand 0.9.2", "regex", "rustc-hash", "sha1", @@ -2921,31 +2992,31 @@ dependencies = [ "sparesults", "spargebra", "sparopt", - "thiserror 2.0.12", + "thiserror 2.0.17", ] [[package]] name = "spargebra" -version = "0.3.5" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8907e262be4b4b363218f4688f5654d423a958aa4b8d7c7a7f898be591fa474e" +checksum = "7ac2df69ac11ac5da54a6731e68ca935e8f1599292fe97d2bde341c48e6ea301" dependencies = [ "oxilangtag", "oxiri", "oxrdf", "peg", - "rand 0.8.5", - "thiserror 2.0.12", + "rand 0.9.2", + "thiserror 2.0.17", ] [[package]] name = "sparopt" -version = "0.2.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1790bbdf13560c2afc245ab0f82a489003b3918e668ebd45c65fe46bfd7a1763" +checksum = "27a7108d6d7d92df771445ea1dd10739d520e90bf42a7be69a7212144b1320ca" dependencies = [ "oxrdf", - "rand 0.8.5", + "rand 0.9.2", "spargebra", ] @@ -2963,7 +3034,7 @@ checksum = "3cc4068497ae43896d41174586dcdc2153a1af2c82856fb308bfaaddc28e5549" dependencies = [ "iref", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -2982,16 +3053,10 @@ dependencies = [ "quote", 
"serde", "sha2", - "syn 2.0.104", + "syn 2.0.106", "thiserror 1.0.69", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "strsim" version = "0.11.1" @@ -3017,9 +3082,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.104" +version = "2.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" dependencies = [ "proc-macro2", "quote", @@ -3043,14 +3108,14 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "target-lexicon" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" +checksum = "df7f62577c25e07834649fc3b39fafdc597c0a3527dc1c60129201ccfcbaa50c" [[package]] name = "tempdir" @@ -3064,15 +3129,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.20.0" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", "getrandom 0.3.3", "once_cell", "rustix", - "windows-sys 0.59.0", + "windows-sys 0.61.1", ] [[package]] @@ -3086,11 +3151,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.12" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl 2.0.12", + "thiserror-impl 2.0.17", ] [[package]] @@ -3101,25 +3166,25 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "thiserror-impl" -version = "2.0.12" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "time" -version = "0.3.41" +version = "0.3.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" +checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" dependencies = [ "deranged", "itoa", @@ -3132,15 +3197,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.4" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" +checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" [[package]] name = "time-macros" -version = "0.2.22" +version = "0.2.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" +checksum = 
"30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" dependencies = [ "num-conv", "time-core", @@ -3158,9 +3223,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ "tinyvec_macros", ] @@ -3173,24 +3238,26 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.45.1" +version = "1.47.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75ef51a33ef1da925cea3e4eb122833cb377c61439ca401b770f54902b806779" +checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" dependencies = [ "backtrace", "bytes", + "io-uring", "libc", "mio", "pin-project-lite", + "slab", "socket2", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "tokio-rustls" -version = "0.26.2" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ "rustls", "tokio", @@ -3268,15 +3335,15 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "typenum" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "unicode-ident" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" [[package]] name = "unicode-width" @@ -3298,13 +3365,14 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.4" +version = "2.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -3358,44 +3426,54 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasi" -version = "0.14.2+wasi-0.2.4" +version = "0.14.7+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" +dependencies = [ + "wasip2", +] + +[[package]] +name = "wasip2" +version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ - "wit-bindgen-rt", + "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +checksum = 
"c1da10c01ae9f1ae40cbfac0bac3b1e724b320abfcf52229f80b547c0d250e2d" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", + "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +checksum = "671c9a5a66f49d8a47345ab942e2cb93c7d1d0339065d4f8139c486121b43b19" dependencies = [ "bumpalo", "log", "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.50" +version = "0.4.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +checksum = "7e038d41e478cc73bae0ff9b36c60cff1c98b8f38f8d7e8061e79ee63608ac5c" dependencies = [ "cfg-if", "js-sys", @@ -3406,9 +3484,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +checksum = "7ca60477e4c59f5f2986c50191cd972e3a50d8a95603bc9434501cf156a9a119" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3416,31 +3494,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.100" +version = "0.2.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +checksum = "bad67dc8b2a1a6e5448428adec4c3e84c43e561d8c9ee8a9e5aabeb193ec41d1" dependencies = [ "unicode-ident", ] [[package]] name = "web-sys" -version = "0.3.77" +version = "0.3.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +checksum = "9367c417a924a74cae129e6a2ae3b47fabb1f8995595ab474029da749a8be120" dependencies = [ "js-sys", "wasm-bindgen", @@ -3458,9 +3536,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8782dd5a41a24eed3a4f40b606249b3e236ca61adf1f25ea4d45c73de122b502" +checksum = "7e8983c3ab33d6fb807cfcdad2491c4ea8cbc8ed839181c7dfd9c67c83e261b2" dependencies = [ "rustls-pki-types", ] @@ -3483,11 +3561,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.9" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.1", ] [[package]] @@ -3498,9 +3576,9 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-core" -version = "0.61.2" +version = "0.62.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" +checksum = "6844ee5416b285084d3d3fffd743b925a6c9385455f64f6d4fa3031c4c2749a9" dependencies = [ "windows-implement", "windows-interface", @@ -3511,46 +3589,46 @@ dependencies = [ [[package]] name = "windows-implement" -version = "0.60.0" +version = "0.60.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +checksum = "edb307e42a74fb6de9bf3a02d9712678b22399c87e6fa869d6dfcd8c1b7754e0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "windows-interface" -version = "0.59.1" +version = "0.59.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +checksum = "c0abd1ddbc6964ac14db11c7213d6532ef34bd9aa042c2e5935f59d7908b46a5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] name = "windows-link" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +checksum = "45e46c0661abb7180e7b9c281db115305d49ca1709ab8242adf09666d2173c65" [[package]] name = "windows-result" -version = "0.3.4" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +checksum = "7084dcc306f89883455a206237404d3eaf961e5bd7e0f312f7c91f57eb44167f" dependencies = [ "windows-link", ] [[package]] name = "windows-strings" -version = "0.4.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +checksum = "7218c655a553b0bed4426cf54b20d7ba363ef543b52d515b3e48d7fd55318dda" dependencies = [ "windows-link", ] @@ -3579,7 +3657,16 @@ version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.2", + "windows-targets 0.53.4", +] + +[[package]] +name = "windows-sys" +version = "0.61.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f109e41dd4a3c848907eb83d5a42ea98b3769495597450cf6d153507b166f0f" +dependencies = [ + "windows-link", ] [[package]] @@ -3600,10 +3687,11 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.53.2" +version = "0.53.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +checksum = "2d42b7b7f66d2a06854650af09cfdf8713e427a439c97ad65a6375318033ac4b" dependencies = [ + "windows-link", "windows_aarch64_gnullvm 0.53.0", "windows_aarch64_msvc 0.53.0", "windows_i686_gnu 0.53.0", @@ -3711,13 +3799,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" [[package]] -name = "wit-bindgen-rt" -version = "0.39.0" +name = "wit-bindgen" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" -dependencies = [ - "bitflags", -] +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name 
= "writeable" @@ -3765,28 +3850,28 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.26" +version = "0.8.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", ] [[package]] @@ -3806,15 +3891,15 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", "synstructure", ] [[package]] name = "zeroize" -version = "1.8.1" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" @@ -3829,9 +3914,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.2" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" dependencies = [ "yoke", "zerofrom", @@ -3846,5 +3931,33 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.104", + "syn 2.0.106", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index c52925d..f54467f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,12 +2,13 @@ members = [ "lib", "cli", - "python" + "python", + "rdf5d" ] resolver = "2" [workspace.package] -version = "0.3.1-a3" +version = "0.4.0-a10" authors = ["Gabe Fierro "] license = "BSD-3-Clause" edition = "2021" @@ -33,10 +34,21 @@ chrono = { version = "0.4.33", features = ["serde"] } petgraph = { version = "0.8", features = ["serde-1"] } clap = { version = "4.4.18", features = ["derive"] } derive_builder = "0.20" -oxigraph = "0.4.11" +oxigraph = "0.5" +memmap2 = "0.9" +zstd = "0.13" -ontoenv = { version = "0.3.1-a3", path = "lib" } +rdf5d = { version = "0.4.0-a10", path = "rdf5d", features = ["oxigraph", "zstd"] } + +ontoenv = { version = "0.4.0-a10", path = "lib" } +ontoenv-cli = { version = "0.4.0-a10", path = 
"cli" } [profile.profiling] inherits = "release" debug = true + +[profile.release] +debug = false +strip = true +lto = "thin" +codegen-units = 1 diff --git a/README.md b/README.md index df2a2ca..1a822f0 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,15 @@ # OntoEnv -`ontoenv` is an environment manager for ontology management. It eventually wants to be a package manager for RDF ontologies and graphs. +`ontoenv` is a lightweight environment manager for RDF ontologies and their imports. It helps you: -- A CLI tool (`cargo install ontoenv-cli`) -- `ontoenv`, a [Rust library](https://docs.rs/ontoenv/latest/ontoenv/) -- `pyontoenv`, a [Python library](https://pypi.org/project/pyontoenv/) +- Discover ontologies locally and on the web +- Resolve and materialize `owl:imports` closures +- Query and export graphs + +Project components: +- CLI: `ontoenv` (installable via `cargo install ontoenv-cli`) +- Rust library: [`ontoenv`](https://docs.rs/ontoenv/latest/ontoenv/) +- Python bindings: [`pyontoenv`](https://pypi.org/project/pyontoenv/) ## Overview @@ -30,19 +35,24 @@ Specifically, `ontoenv` looks for patterns like the following inside local ontol . ``` -When initialized, `ontoenv` searches for all local files defining ontologies, identifies their dependencies, and then recursively pulls in those dependencies, *their* dependencies, and so on. -It saves this in a local [Oxigraph](https://github.com/oxigraph/oxigraph) database inside the local `.ontoenv`. +When initialized, `ontoenv` searches the specified directories for ontology declarations, identifies their `owl:imports`, and recursively pulls in dependencies. Runtime queries operate on an in‑memory Oxigraph store. Persistent on‑disk state uses a compact RDF5D file at `.ontoenv/store.r5tu` with single‑writer, shared reader locking. + +### Canonical IRIs and Source URLs + +Ontologies fetched from a URL often declare a different, usually versioned, ontology IRI inside the file. `ontoenv` now remembers that relationship. When an ontology is added we record the source location and, if its declared name differs, create an alias from the normalized URL to the canonical ontology identifier. Future `owl:imports` that reference the versionless URL will therefore reuse the already downloaded ontology instead of refetching it. Removing an ontology clears any aliases associated with it, and loading an existing environment rebuilds the mapping automatically. -## Command Line Interface +## CLI ### Installation -- If you have Rust installed, you can install the tool with `cargo install ontoenv-cli` +- Install from crates.io with `cargo install --locked ontoenv-cli` +- From a local checkout, run `cargo install --path cli --locked` to build the current workspace +- Install via PyPI with `pip install pyontoenv` to get the CLI together with the Python bindings - Download a binary from the [Releases](https://github.com/gtfierro/ontoenv-rs/releases) tab ### Usage -#### Initialization +#### init Begin by initializing an `ontoenv` workspace in a directory containing some ontology files (Turtle files, etc). @@ -50,7 +60,7 @@ Begin by initializing an `ontoenv` workspace in a directory containing some onto ontoenv init ``` -This may take a couple minutes. `ontoenv` searches for all local files defining ontologies, identifies their dependencies, and then recursively pulls in those dependencies, *their* dependencies, and so on. It is possible to adjust which directories `ontoenv` searches for, which files it traverses, and whether it pulls ontologies from the web. 
+Initializes `.ontoenv/` in the current directory (or specified root), discovers ontologies, and loads dependencies. You must run `init` once per environment. Subsequent commands will auto‑discover the nearest `.ontoenv/` in parent directories. ```ignore $ ontoenv init -h @@ -75,65 +85,81 @@ Options: Offline mode in particular is helpful when you want to limit which ontologies get loaded. Simply download the ontologies you want, and then enable offline mode. +Examples: +- `ontoenv init` — initialize in current directory +- `ontoenv init ./ontologies ./models` — initialize and search these directories +- `ontoenv init --overwrite --offline ./ontologies` — rebuild from scratch and work offline + #### Local State -`ontoenv` stores its configuration and internal database in a `.ontoenv` directory placed in directory from where you ran `ontoenv init`. +- Directory: `.ontoenv/` +- Persistent store: `.ontoenv/store.r5tu` (RDF5D) +- Lock file: `.ontoenv/store.lock` (single writer, shared readers) -#### Refreshing +### Behavior -Refresh the workspace to account for changes to local files. `ontoenv` will use the timestamps on the local files to determine which files to load. This means that refreshing the workspace is often much faster than a full initialization. +- Discovery: Commands (except `init`) discover an environment by walking up parent directories from the current working directory, looking for `.ontoenv/`. +- Override: Set `ONTOENV_DIR` to point to a specific environment; if it points at a `.ontoenv` directory the parent of that directory is used as the root. +- Creation: Only `ontoenv init` creates an environment on disk. Other commands will error if no environment is found. +- Positional search directories: Only `ontoenv init` accepts positional search directories (LOCATIONS). Other commands ignore trailing positionals. +- Temporary mode: Pass `--temporary` to run with an in‑memory environment (no `.ontoenv/`). -Refreshing the graph uses the same parameters as given during `ontoenv init`. -To change these parameters, just run `ontoenv init` again with the desired flags and parameters. +#### update -#### Importing Dependencies +- Refreshes the environment based on file timestamps and configuration. +- Re‑run `init` to change search paths or flags. -`ontoenv` can import all dependencies (immediate and transitive) into a unified graph. -This is often helpful for passing to reasoners or query processors; while many of these can deal with importing multiple graphs, it is much more convenient to have a single file one can ship around. -We refer to the resulting "unified graph" as the *imports closure*. +Examples: +- `ontoenv update` — refresh only changed/added files +- `ontoenv update --all` — rebuild the in‑memory view from sources -`ontoenv closure ` computes the imports closure and places it into an `output.ttl` file (or a location of your choice). -There are a several flags one can provide for this process +#### closure -```ignore -$ Compute the owl:imports closure of an ontology and write it to a file +Compute and optionally write the imports closure (union of a graph and its transitive imports). Useful for reasoning, exchange, or exporting a single file. 
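+A minimal sketch of the same operation through the Rust crate, assuming an already-initialized environment (the function name, IRI, and output path below are placeholders; the calls mirror what the CLI's `closure` subcommand does internally):
+
+```rust
+use anyhow::Result;
+use ontoenv::api::{OntoEnv, ResolveTarget};
+use ontoenv::util::write_dataset_to_file;
+use oxigraph::model::NamedNode;
+
+fn closure_to_file(root: std::path::PathBuf) -> Result<()> {
+    // Open the discovered environment read-only.
+    let env = OntoEnv::load_from_directory(root, true)?;
+    let iri = NamedNode::new("http://example.org/ont/MyOntology")?;
+    let id = env
+        .resolve(ResolveTarget::Graph(iri))
+        .ok_or_else(|| anyhow::anyhow!("Ontology not found"))?;
+    // -1 means unlimited recursion depth over owl:imports.
+    let closure = env.get_closure(&id, -1)?;
+    // The CLI defaults: rewrite sh:prefixes and strip owl:imports from the merged result.
+    let union = env.get_union_graph(&closure, Some(true), Some(true))?;
+    write_dataset_to_file(&union.dataset, "output.ttl")?;
+    Ok(())
+}
+```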
-Usage: ontoenv closure [OPTIONS] [DESTINATION] +Examples: +- `ontoenv closure http://example.org/ont/MyOntology` (writes `output.ttl`) +- `ontoenv closure http://example.org/ont/MyOntology result.ttl` (auto‑rewrites SHACL prefixes and removes owl:imports) +- To disable either behavior: + - `ontoenv closure http://example.org/ont/MyOntology --no-rewrite-sh-prefixes` + - `ontoenv closure http://example.org/ont/MyOntology --keep-owl-imports` -Arguments: - The name (URI) of the ontology to compute the closure for - [DESTINATION] The file to write the closure to, defaults to 'output.ttl' +#### get -Options: - --rewrite-sh-prefixes - Rewrite the sh:prefixes declarations to point to the chosen ontology, defaults to true [default: true] [possible values: true, false] - --remove-owl-imports - Remove owl:imports statements from the closure, defaults to true [default: true] [possible values: true, false] - -h, --help - Print help -``` +Retrieve a single ontology graph from the environment and write it to STDOUT or a file in a chosen serialization format. -#### Listing Ontologies +Examples: +- `ontoenv get http://example.org/ont/MyOntology` — prints Turtle to STDOUT +- `ontoenv get http://example.org/ont/MyOntology --format jsonld` — prints JSON‑LD to STDOUT +- `ontoenv get http://example.org/ont/MyOntology --output my.ttl` — writes Turtle to `my.ttl` +- Disambiguate when multiple copies share the same IRI (different locations): + - `ontoenv get http://example.org/ont/MyOntology --location ./ontologies/MyOntology-1.4.ttl` + - `ontoenv get http://example.org/ont/MyOntology -l https://example.org/MyOntology-1.3.ttl` -`ontoenv list-ontologies` will display a list of ontology names in the workspace. +Notes: +- Supported formats: `turtle` (default), `ntriples`, `rdfxml`, `jsonld`. +- `--output` writes to a file; omit to print to STDOUT. +- `--location` accepts a file path or URL and is only needed to disambiguate when multiple sources exist for the same IRI. -`ontoenv dump` will print out an alphabetized list of all ontologies in the workspace, their imports, number of triples, and other metadata. +#### Other commands -If GraphViz is installed, `ontoenv dep-graph` will output a PDF graph representation of the imports closure. +- `ontoenv dump` — show ontologies, imports, sizes, and metadata +- `ontoenv dep-graph` — export a GraphViz import dependency graph (PDF) if GraphViz is available +- `ontoenv status` — human-friendly status; add `--json` for machine‑readable +- `ontoenv update` — refresh discovered ontologies +- `ontoenv list ontologies` — ontology names in the environment; add `--json` for JSON array +- `ontoenv list missing` — missing imports (i.e. not found in environment); add `--json` for JSON array +- `ontoenv why [ ...]` — show who imports the given ontology as paths; add `--json` to emit a single JSON document mapping each IRI to path arrays -## Python Library +## Python API (`pyontoenv`) ##### Installation -`pip install pyontoenv` +`pip install pyontoenv` (requires Python 3.9+; prebuilt wheels ship for common platforms. Building from source needs a Rust toolchain.) -#### Usage +### Basic usage -Here is a basic example of how to use the `pyontoenv` Python library. This example will: -1. Create a temporary directory. -2. Write two simple ontologies to files in that directory, where one imports the other. -3. Configure and initialize `ontoenv` to use this directory. -4. 
Compute the dependency closure of one ontology to demonstrate that `ontoenv` correctly resolves and includes the imported ontology. +Example: create a temporary environment, discover ontologies, and compute a closure. ```python import tempfile @@ -181,6 +207,42 @@ with tempfile.TemporaryDirectory() as temp_dir: assert len(g_a) == 1 ``` +### Key methods + +- Constructor: `OntoEnv(path=None, recreate=False, read_only=False, search_directories=None, require_ontology_names=False, strict=False, offline=False, resolution_policy="default", root=".", includes=None, excludes=None, temporary=False, no_search=False)` + - `offline`: don’t fetch remote ontologies + - `temporary`: in‑memory only (no `.ontoenv/`) +- `update(all=False)`: refresh discovered ontologies +- `add(location, fetch_imports=True) -> str`: add graph from file or URL; returns graph IRI +- `get_graph(name) -> rdflib.Graph`: get just one ontology graph +- `get_closure(name, destination_graph=None, rewrite_sh_prefixes=True, remove_owl_imports=True, recursion_depth=-1) -> (Graph, list[str])` +- `import_dependencies(graph, fetch_missing=False) -> list[str]`: load imports into an rdflib graph, remove its `owl:imports`, and return the sorted IRIs that were imported +- `list_closure(name, recursion_depth=-1) -> list[str]`: list IRIs in the closure +- `get_importers(name) -> list[str]`: ontologies that import `name` +- `to_rdflib_dataset() -> rdflib.Dataset`: in‑memory Dataset with one named graph per ontology +- `store_path() -> Optional[str]`: path to `.ontoenv/` (persistent envs) or `None` (temporary) +- `close()`: persist (if applicable) and release resources + +### Behavior + +- Strict Git‑like: + - Temporary environment: `OntoEnv(temporary=True)` creates an in‑memory environment (no `.ontoenv/`). + - Create/overwrite on disk: `OntoEnv(path=..., recreate=True)` explicitly creates a new environment at `path` (or overwrites if it exists). + - Discover and load: Otherwise, the constructor walks up from `path` (or `root=.` if `path` is None) to find an existing `.ontoenv/`. If found, it loads it; if not, it raises `ValueError` with a hint to use `recreate=True` or `temporary=True`. + - Flags such as `offline`, `strict`, `search_directories`, `includes`, `excludes` apply to created environments; loading respects the saved configuration. + +#### get_closure vs import_dependencies + +- `get_closure(name, ...)` computes the transitive imports closure for the ontology identified by `name` and builds the union of all graphs in that closure. + - Returns: a new rdflib Graph (or writes into `destination_graph` if provided) plus the ordered list of ontology IRIs in the closure. + - Options: can rewrite SHACL prefix blocks to the chosen base ontology and remove `owl:imports` statements in the merged result. + - Use when you need a single, self‑contained graph representing an ontology and all of its imports (for reasoning, exchange, or export). + +- `import_dependencies(graph, fetch_missing=False)` scans an existing rdflib Graph for ontology declarations and `owl:imports`, then augments that same Graph by loading the referenced ontologies (from the environment, or from the web if `fetch_missing=True`). + - Returns: the list of ontology IRIs that were imported. + - Mutates: the provided rdflib Graph in‑place (does not create a union graph per se; it enriches the given graph with imported triples). 
+  - Use when you already have a Graph and want to populate it with the triples from its declared imports, respecting your environment’s offline/strict settings.
+
 ## Rust Library
 
 [Docs](https://docs.rs/crate/ontoenv)
 
@@ -249,11 +311,11 @@ assert_eq!(ontologies.len(), 2);
 
 // Get the dependency closure for ontology B
 let ont_b_name = NamedNode::new("http://example.com/ontology_b")?;
 let ont_b_id = env.resolve(ResolveTarget::Graph(ont_b_name)).unwrap();
-let closure = env.get_dependency_closure(&ont_b_id)?;
+let closure_ids = env.get_closure(&ont_b_id, -1)?;
 // The closure should contain both ontology A and B
-assert_eq!(closure.len(), 2);
-let closure_names: HashSet<String> = closure.iter().map(|id| id.to_uri_string()).collect();
+assert_eq!(closure_ids.len(), 2);
+let closure_names: HashSet<String> = closure_ids.iter().map(|id| id.to_uri_string()).collect();
 println!("Closure contains: {:?}", closure_names);
 assert!(closure_names.contains("http://example.com/ontology_a"));
 assert!(closure_names.contains("http://example.com/ontology_b"));
@@ -264,3 +326,40 @@ fs::remove_dir_all(&test_dir)?;
 # Ok(())
 # }
 ```
+
+### Core Rust API (selected)
+
+- `OntoEnv::init(config, overwrite) -> OntoEnv`
+- `OntoEnv::load_from_directory(root, read_only) -> OntoEnv`
+- `OntoEnv::update_all(all: bool)`
+- `OntoEnv::add(location, Overwrite, RefreshStrategy) -> GraphIdentifier`
+- `OntoEnv::add_no_imports(location, Overwrite, RefreshStrategy) -> GraphIdentifier`
+- `OntoEnv::get_graph(id) -> Graph`
+- `OntoEnv::get_union_graph(ids)` and `get_closure(id, recursion_depth)`
+- `OntoEnv::save_to_directory()`, `flush()` (persists to `.ontoenv/store.r5tu`)
+
+Persistent storage details:
+- On-disk: RDF5D file `.ontoenv/store.r5tu` (single writer, shared readers, atomic writes)
+- Runtime: in-memory Oxigraph store for fast queries
+
+### Behavior
+
+- Discovery helpers:
+  - `find_ontoenv_root()` and `find_ontoenv_root_from(path)`: walk up parent directories to locate the root that contains `.ontoenv/`.
+  - Load: `OntoEnv::load_from_directory(root, read_only)` loads an existing environment.
+- Creation:
+  - `OntoEnv::init(config, overwrite)` explicitly creates (or overwrites) an environment on disk.
+  - `OntoEnv::add(..., Overwrite::Allow, RefreshStrategy::UseCache)` is the common way to add an ontology, while `RefreshStrategy::Force` skips cache reuse.
+- Recommended pattern:
+  - Try discovery (`find_ontoenv_root()`), then `load_from_directory`; if not found, prompt/init explicitly.
+  - Use `config.temporary = true` (via `Config::builder`) and `OntoEnv::init` for in‑memory use cases.
+
+### Option enums
+
+The Rust API now exposes expressive enums instead of opaque booleans:
+
+- `Overwrite::{Allow, Preserve}` — replace existing graphs or keep the original.
+- `RefreshStrategy::{Force, UseCache}` — bypass or reuse cached ontologies.
+- `CacheMode::{Enabled, Disabled}` — persisted in `Config` and mirrored in Python as the `use_cached_ontologies` boolean.
+
+If you are migrating older code that passed `true`/`false`, use `Overwrite::Allow`/`Preserve` and `RefreshStrategy::Force`/`UseCache`. `bool` values still convert via `Into`, so existing call sites can migrate incrementally.
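+A short migration sketch under these assumptions (the function name, the `env` value, and the file path are placeholders):
+
+```rust
+use anyhow::Result;
+use ontoenv::api::OntoEnv;
+use ontoenv::ontology::OntologyLocation;
+use ontoenv::options::{Overwrite, RefreshStrategy};
+use ontoenv::ToUriString;
+
+fn add_local_ontology(env: &mut OntoEnv) -> Result<()> {
+    let loc = OntologyLocation::File("ontologies/my.ttl".into());
+    // The enums state the intent that opaque booleans used to hide:
+    // replace any existing copy, and reuse a cached download if one exists.
+    let id = env.add(loc, Overwrite::Allow, RefreshStrategy::UseCache)?;
+    println!("added {}", id.to_uri_string());
+    Ok(())
+}
+```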
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index f3636aa..070b3e6 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -9,6 +9,11 @@ repository.workspace = true
 homepage.workspace = true
 build = "build.rs"
 
+[lib]
+name = "ontoenv_cli"
+path = "src/lib.rs"
+doc = false
+
 [[bin]]
 name = "ontoenv"
 path = "src/main.rs"
diff --git a/cli/src/lib.rs b/cli/src/lib.rs
new file mode 100644
index 0000000..d49d0d4
--- /dev/null
+++ b/cli/src/lib.rs
@@ -0,0 +1,934 @@
+use anyhow::{Error, Result};
+use clap::{Parser, Subcommand};
+use log::info;
+use ontoenv::api::{OntoEnv, ResolveTarget};
+use ontoenv::config::Config;
+use ontoenv::ontology::{GraphIdentifier, OntologyLocation};
+use ontoenv::options::{Overwrite, RefreshStrategy};
+use ontoenv::util::write_dataset_to_file;
+use ontoenv::ToUriString;
+use oxigraph::io::{JsonLdProfileSet, RdfFormat};
+use oxigraph::model::NamedNode;
+use std::collections::{BTreeMap, BTreeSet};
+use std::env::current_dir;
+use std::ffi::OsString;
+use std::path::PathBuf;
+
+#[derive(Debug, Parser)]
+#[command(name = "ontoenv")]
+#[command(about = "Ontology environment manager")]
+#[command(arg_required_else_help = true)]
+struct Cli {
+    #[command(subcommand)]
+    command: Commands,
+    /// Verbose mode - sets the RUST_LOG level to info, defaults to warning level
+    #[clap(long, short, action, default_value = "false", global = true)]
+    verbose: bool,
+    /// Debug mode - sets the RUST_LOG level to debug, defaults to warning level
+    #[clap(long, action, default_value = "false", global = true)]
+    debug: bool,
+    /// Resolution policy for determining which ontology to use when there are multiple with the same name
+    #[clap(long, short, default_value = "default", global = true)]
+    policy: Option<String>,
+    /// Temporary (non-persistent) mode - will not save the environment to disk
+    #[clap(long, short, action, global = true)]
+    temporary: bool,
+    /// Require ontology names to be unique; will raise an error if multiple ontologies have the same name
+    #[clap(long, action, global = true)]
+    require_ontology_names: bool,
+    /// Strict mode - will raise an error if an ontology is not found
+    #[clap(long, action, default_value = "false", global = true)]
+    strict: bool,
+    /// Offline mode - will not attempt to fetch ontologies from the web
+    #[clap(long, short, action, default_value = "false", global = true)]
+    offline: bool,
+    /// Glob patterns for which files to include, defaults to ['*.ttl','*.xml','*.n3']
+    #[clap(long, short, num_args = 1.., global = true)]
+    includes: Vec<String>,
+    /// Glob patterns for which files to exclude, defaults to []
+    #[clap(long, short, num_args = 1.., global = true)]
+    excludes: Vec<String>,
+    /// Do not search for ontologies in the search directories
+    #[clap(long = "no-search", short = 'n', action, global = true)]
+    no_search: bool,
+}
+
+#[derive(Debug, Subcommand)]
+enum ConfigCommands {
+    /// Set a configuration value.
+    Set {
+        /// The configuration key to set.
+        key: String,
+        /// The value to set for the key.
+        value: String,
+    },
+    /// Get a configuration value.
+    Get {
+        /// The configuration key to get.
+        key: String,
+    },
+    /// Unset a configuration value, reverting to its default.
+    Unset {
+        /// The configuration key to unset.
+        key: String,
+    },
+    /// Add a value to a list-based configuration key.
+    Add {
+        /// The configuration key to add to.
+        key: String,
+        /// The value to add.
+        value: String,
+    },
+    /// Remove a value from a list-based configuration key.
+    Remove {
+        /// The configuration key to remove from.
+        key: String,
+        /// The value to remove.
+        value: String,
+    },
+    /// List all configuration values.
+    List,
+}
+
+#[derive(Debug, Subcommand)]
+enum ListCommands {
+    /// List all ontology locations found in the search paths
+    Locations,
+    /// List all declared ontologies in the environment
+    Ontologies,
+    /// List all missing imports
+    Missing,
+}
+
+#[derive(Debug, Subcommand)]
+enum Commands {
+    /// Create a new ontology environment
+    Init {
+        /// Overwrite the environment if it already exists
+        #[clap(long, default_value = "false")]
+        overwrite: bool,
+        /// Directories to search for ontologies. If not provided, the current directory is used.
+        #[clap(last = true)]
+        locations: Option<Vec<String>>,
+    },
+    /// Prints the version of the ontoenv binary
+    Version,
+    /// Prints the status of the ontology environment
+    Status {
+        /// Output JSON instead of text
+        #[clap(long, action, default_value = "false")]
+        json: bool,
+    },
+    /// Update the ontology environment
+    Update {
+        /// Suppress per-ontology update output
+        #[clap(long, short = 'q', action)]
+        quiet: bool,
+        /// Update all ontologies, ignoring modification times
+        #[clap(long, short = 'a', action)]
+        all: bool,
+        /// Output JSON instead of text
+        #[clap(long, action, default_value = "false")]
+        json: bool,
+    },
+    /// Compute the owl:imports closure of an ontology and write it to a file
+    Closure {
+        /// The name (URI) of the ontology to compute the closure for
+        ontology: String,
+        /// Do NOT rewrite sh:prefixes (rewrite is ON by default)
+        #[clap(long, action, default_value = "false")]
+        no_rewrite_sh_prefixes: bool,
+        /// Keep owl:imports statements (removal is ON by default)
+        #[clap(long, action, default_value = "false")]
+        keep_owl_imports: bool,
+        /// The file to write the closure to, defaults to 'output.ttl'
+        destination: Option<String>,
+        /// The recursion depth for exploring owl:imports. <0: unlimited, 0: no imports, >0:
+        /// specific depth.
+        #[clap(long, default_value = "-1")]
+        recursion_depth: i32,
+    },
+    /// Retrieve a single graph from the environment and write it to STDOUT or a file
+    Get {
+        /// Ontology IRI (name)
+        ontology: String,
+        /// Optional source location (file path or URL) to disambiguate
+        #[clap(long, short = 'l')]
+        location: Option<String>,
+        /// Output file path; if omitted, writes to STDOUT
+        #[clap(long)]
+        output: Option<String>,
+        /// Serialization format: one of [turtle, ntriples, rdfxml, jsonld] (default: turtle)
+        #[clap(long, short = 'f')]
+        format: Option<String>,
+    },
+    /// Add an ontology to the environment
+    Add {
+        /// The location of the ontology to add (file path or URL)
+        location: String,
+        /// Do not explore owl:imports of the added ontology
+        #[clap(long, action)]
+        no_imports: bool,
+    },
+    /// List various properties of the environment
+    List {
+        #[command(subcommand)]
+        list_cmd: ListCommands,
+        /// Output JSON instead of text
+        #[clap(long, action, default_value = "false")]
+        json: bool,
+    },
+    // TODO: dump all ontologies; nest by ontology name (sorted), w/n each ontology name list all
+    // the places where that graph can be found. List basic stats: the metadata field in the
+    // Ontology struct and # of triples in the graph; last updated; etc
+    /// Print out the current state of the ontology environment
+    Dump {
+        /// Filter the output to only include ontologies that contain the given string in their
+        /// name. Leave empty to include all ontologies.
+        contains: Option<String>,
+    },
+    /// Generate a PDF of the dependency graph
+    DepGraph {
+        /// The root ontologies to start the graph from. Given by name (URI)
+        roots: Option<Vec<String>>,
+        /// The output file to write the PDF to, defaults to 'dep_graph.pdf'
+        #[clap(long, short)]
+        output: Option<String>,
+    },
+    /// Lists which ontologies import the given ontology
+    Why {
+        /// The name (URI) of the ontology to find importers for
+        ontologies: Vec<String>,
+        /// Output JSON instead of text
+        #[clap(long, action, default_value = "false")]
+        json: bool,
+    },
+    /// Run the doctor to check the environment for issues
+    Doctor {
+        /// Output JSON instead of text
+        #[clap(long, action, default_value = "false")]
+        json: bool,
+    },
+    /// Reset the ontology environment by removing the .ontoenv directory
+    Reset {
+        #[clap(long, short, action = clap::ArgAction::SetTrue, default_value = "false")]
+        force: bool,
+    },
+    /// Manage ontoenv configuration.
+    #[command(subcommand)]
+    Config(ConfigCommands),
+}
+
+impl ToString for Commands {
+    fn to_string(&self) -> String {
+        match self {
+            Commands::Init { .. } => "Init".to_string(),
+            Commands::Version => "Version".to_string(),
+            Commands::Status { .. } => "Status".to_string(),
+            Commands::Update { .. } => "Update".to_string(),
+            Commands::Closure { .. } => "Closure".to_string(),
+            Commands::Get { .. } => "Get".to_string(),
+            Commands::Add { .. } => "Add".to_string(),
+            Commands::List { .. } => "List".to_string(),
+            Commands::Dump { .. } => "Dump".to_string(),
+            Commands::DepGraph { .. } => "DepGraph".to_string(),
+            Commands::Why { .. } => "Why".to_string(),
+            Commands::Doctor { .. } => "Doctor".to_string(),
+            Commands::Reset { .. } => "Reset".to_string(),
+            Commands::Config { .. } => "Config".to_string(),
+        }
+    }
+}
+
+fn handle_config_command(config_cmd: ConfigCommands, temporary: bool) -> Result<()> {
+    if temporary {
+        return Err(anyhow::anyhow!("Cannot manage config in temporary mode."));
+    }
+    let root = ontoenv::api::find_ontoenv_root()
+        .ok_or_else(|| anyhow::anyhow!("Not in an ontoenv. Use `ontoenv init` to create one."))?;
+    let config_path = root.join(".ontoenv").join("ontoenv.json");
+    if !config_path.exists() {
+        return Err(anyhow::anyhow!(
+            "No ontoenv.json found. Use `ontoenv init`."
+        ));
+    }
+
+    match config_cmd {
+        ConfigCommands::List => {
+            let config_str = std::fs::read_to_string(&config_path)?;
+            let config_json: serde_json::Value = serde_json::from_str(&config_str)?;
+            let pretty_json = serde_json::to_string_pretty(&config_json)?;
+            println!("{}", pretty_json);
+            return Ok(());
+        }
+        ConfigCommands::Get { ref key } => {
+            let config_str = std::fs::read_to_string(&config_path)?;
+            let config_json: serde_json::Value = serde_json::from_str(&config_str)?;
+            let object = config_json
+                .as_object()
+                .ok_or_else(|| anyhow::anyhow!("Invalid config format: not a JSON object."))?;
+
+            if let Some(value) = object.get(key) {
+                if let Some(s) = value.as_str() {
+                    println!("{}", s);
+                } else if let Some(arr) = value.as_array() {
+                    for item in arr {
+                        if let Some(s) = item.as_str() {
+                            println!("{}", s);
+                        } else {
+                            println!("{}", item);
+                        }
+                    }
+                } else {
+                    println!("{}", value);
+                }
+            } else {
+                println!("Configuration key '{}' not set.", key);
+            }
+            return Ok(());
+        }
+        _ => {}
+    }
+
+    // Modifying commands continue here.
+    let config_str = std::fs::read_to_string(&config_path)?;
+    let mut config_json: serde_json::Value = serde_json::from_str(&config_str)?;
+
+    let object = config_json
+        .as_object_mut()
+        .ok_or_else(|| anyhow::anyhow!("Invalid config format: not a JSON object."))?;
+
+    match config_cmd {
+        ConfigCommands::Set { key, value } => {
+            match key.as_str() {
+                "offline" | "strict" | "require_ontology_names" | "no_search" => {
+                    let bool_val = value.parse::<bool>().map_err(|_| {
+                        anyhow::anyhow!("Invalid boolean value for {}: {}", key, value)
+                    })?;
+                    object.insert(key.to_string(), serde_json::Value::Bool(bool_val));
+                }
+                "resolution_policy" => {
+                    object.insert(key.to_string(), serde_json::Value::String(value.clone()));
+                }
+                "locations" | "includes" | "excludes" => {
+                    return Err(anyhow::anyhow!(
+                        "Use `ontoenv config add/remove {} <value>` to modify list values.",
+                        key
+                    ));
+                }
+                _ => {
+                    return Err(anyhow::anyhow!(
+                        "Setting configuration for '{}' is not supported.",
+                        key
+                    ));
+                }
+            }
+            println!("Set {} to {}", key, value);
+        }
+        ConfigCommands::Unset { key } => {
+            if object.remove(&key).is_some() {
+                println!("Unset '{}'.", key);
+            } else {
+                return Err(anyhow::anyhow!("Configuration key '{}' not set.", key));
+            }
+        }
+        ConfigCommands::Add { key, value } => {
+            match key.as_str() {
+                "locations" | "includes" | "excludes" => {
+                    let entry = object
+                        .entry(key.clone())
+                        .or_insert_with(|| serde_json::Value::Array(vec![]));
+                    if let Some(arr) = entry.as_array_mut() {
+                        let new_val = serde_json::Value::String(value.clone());
+                        if !arr.contains(&new_val) {
+                            arr.push(new_val);
+                        } else {
+                            println!("Value '{}' already exists in {}.", value, key);
+                            return Ok(());
+                        }
+                    }
+                }
+                _ => {
+                    return Err(anyhow::anyhow!(
+                        "Cannot add to configuration key '{}'. It is not a list.",
+                        key
+                    ));
+                }
+            }
+            println!("Added '{}' to {}", value, key);
+        }
+        ConfigCommands::Remove { key, value } => {
+            match key.as_str() {
+                "locations" | "includes" | "excludes" => {
+                    if let Some(entry) = object.get_mut(&key) {
+                        if let Some(arr) = entry.as_array_mut() {
+                            let val_to_remove = serde_json::Value::String(value.clone());
+                            if let Some(pos) = arr.iter().position(|x| *x == val_to_remove) {
+                                arr.remove(pos);
+                            } else {
+                                return Err(anyhow::anyhow!(
+                                    "Value '{}' not found in {}",
+                                    value,
+                                    key
+                                ));
+                            }
+                        }
+                    } else {
+                        return Err(anyhow::anyhow!("Configuration key '{}' not set.", key));
+                    }
+                }
+                _ => {
+                    return Err(anyhow::anyhow!(
+                        "Cannot remove from configuration key '{}'. It is not a list.",
+                        key
+                    ));
+                }
+            }
+            println!("Removed '{}' from {}", value, key);
+        }
+        _ => unreachable!(), // Get and List are handled above
+    }
+
+    let new_config_str = serde_json::to_string_pretty(&config_json)?;
+    std::fs::write(config_path, new_config_str)?;
+
+    Ok(())
+}
+
+pub fn run() -> Result<()> {
+    ontoenv::api::init_logging();
+    let cmd = Cli::parse();
+    execute(cmd)
+}
+
+pub fn run_from_args<I, T>(args: I) -> Result<()>
+where
+    I: IntoIterator<Item = T>,
+    T: Into<OsString> + Clone,
+{
+    ontoenv::api::init_logging();
+    let cmd = Cli::try_parse_from(args).map_err(Error::from)?;
+    execute(cmd)
+}
+
+fn execute(cmd: Cli) -> Result<()> {
+    // The RUST_LOG env var is set by `init_logging` if ONTOENV_LOG is present.
+    // CLI flags for verbosity take precedence. If nothing is set, we default to "warn".
+    if cmd.debug {
+        std::env::set_var("RUST_LOG", "debug");
+    } else if cmd.verbose {
+        std::env::set_var("RUST_LOG", "info");
+    } else if std::env::var("RUST_LOG").is_err() {
+        // If no CLI flags and no env var is set, default to "warn".
+        std::env::set_var("RUST_LOG", "warn");
+    }
+    let _ = env_logger::try_init();
+
+    let policy = cmd.policy.unwrap_or_else(|| "default".to_string());
+
+    let mut builder = Config::builder()
+        .root(current_dir()?)
+        .require_ontology_names(cmd.require_ontology_names)
+        .strict(cmd.strict)
+        .offline(cmd.offline)
+        .resolution_policy(policy)
+        .temporary(cmd.temporary)
+        .no_search(cmd.no_search);
+
+    // Locations only apply to `init`; other commands ignore positional LOCATIONS
+    if let Commands::Init {
+        locations: Some(locs),
+        ..
+    } = &cmd.command
+    {
+        builder = builder.locations(locs.clone());
+    }
+    // only set includes if they are provided on the command line, otherwise use builder defaults
+    if !cmd.includes.is_empty() {
+        builder = builder.includes(&cmd.includes);
+    }
+    if !cmd.excludes.is_empty() {
+        builder = builder.excludes(&cmd.excludes);
+    }
+
+    let config: Config = builder.build()?;
+
+    if cmd.verbose || cmd.debug {
+        config.print();
+    }
+
+    if let Commands::Reset { force } = &cmd.command {
+        if let Some(root) = ontoenv::api::find_ontoenv_root() {
+            let path = root.join(".ontoenv");
+            println!("Removing .ontoenv directory at {}...", path.display());
+            if !*force {
+                // check delete? [y/N]
+                let mut input = String::new();
+                println!("Are you sure you want to delete the .ontoenv directory? [y/N] ");
+                std::io::stdin()
+                    .read_line(&mut input)
+                    .expect("Failed to read line");
+                let input = input.trim();
+                if input != "y" && input != "Y" {
+                    println!("Aborting...");
+                    return Ok(());
+                }
+            }
+            OntoEnv::reset()?;
+            println!(".ontoenv directory removed.");
+        } else {
+            println!("No .ontoenv directory found. Nothing to do.");
+        }
+        return Ok(());
+    }
+
+    // Discover environment root: ONTOENV_DIR takes precedence, else walk parents
+    let env_dir_var = std::env::var("ONTOENV_DIR").ok().map(PathBuf::from);
+    let discovered_root = if let Some(dir) = env_dir_var.clone() {
+        // If ONTOENV_DIR points to the .ontoenv directory, take its parent as root
+        if dir.file_name().map(|n| n == ".ontoenv").unwrap_or(false) {
+            dir.parent().map(|p| p.to_path_buf())
+        } else {
+            Some(dir)
+        }
+    } else {
+        ontoenv::api::find_ontoenv_root()
+    };
+    let ontoenv_exists = discovered_root
+        .as_ref()
+        .map(|root| root.join(".ontoenv").join("ontoenv.json").exists())
+        .unwrap_or(false);
+    info!("OntoEnv exists: {ontoenv_exists}");
+
+    // create the env object to use in the subcommand.
+    // - if temporary is true, create a new env object each time
+    // - if temporary is false, load the env from the .ontoenv directory if it exists
+    // Determine if this command needs write access to the store
+    let needs_rw = matches!(cmd.command, Commands::Add { .. } | Commands::Update { .. });
+
+    let env: Option<OntoEnv> = if cmd.temporary {
+        // Create a new OntoEnv object in temporary mode
+        let e = OntoEnv::init(config.clone(), false)?;
+        Some(e)
+    } else if cmd.command.to_string() != "Init" && ontoenv_exists {
+        // if .ontoenv exists, load it from discovered root
+        // Open read-only unless the command requires write access
+        Some(OntoEnv::load_from_directory(
+            discovered_root.unwrap(),
+            !needs_rw,
+        )?)
+    } else {
+        None
+    };
+    info!("OntoEnv loaded: {}", env.is_some());
+
+    match cmd.command {
+        Commands::Init { overwrite, .. } => {
+            // if temporary, raise an error
+            if cmd.temporary {
+                return Err(anyhow::anyhow!(
+                    "Cannot initialize in temporary mode. Run `ontoenv init` without --temporary."
+                ));
+            }
+
+            let root = current_dir()?;
+            if root.join(".ontoenv").exists() && !overwrite {
+                println!(
+                    "An ontology environment already exists in: {}",
+                    root.display()
+                );
+                println!("Use --overwrite to re-initialize or `ontoenv update` to update.");
+
+                let env = OntoEnv::load_from_directory(root, false)?;
+                let status = env.status()?;
+                println!("\nCurrent status:");
+                println!("{status}");
+                return Ok(());
+            }
+
+            // The call to `init` will create and update the environment.
+            // `update` will also save it to the directory.
+            let _ = OntoEnv::init(config, overwrite)?;
+        }
+        Commands::Get {
+            ontology,
+            location,
+            output,
+            format,
+        } => {
+            let env = require_ontoenv(env)?;
+
+            // If a location is provided, resolve by location. Otherwise resolve by name (IRI).
+            let graph = if let Some(loc) = location {
+                let oloc = if loc.starts_with("http://") || loc.starts_with("https://") {
+                    OntologyLocation::Url(loc)
+                } else {
+                    // Normalize to absolute path
+                    ontoenv::ontology::OntologyLocation::from_str(&loc)
+                        .unwrap_or_else(|_| OntologyLocation::File(PathBuf::from(loc)))
+                };
+                // Read directly from the specified location to disambiguate
+                oloc.graph()?
+            } else {
+                let iri = NamedNode::new(ontology).map_err(|e| anyhow::anyhow!(e.to_string()))?;
+                let graphid = env
+                    .resolve(ResolveTarget::Graph(iri))
+                    .ok_or(anyhow::anyhow!("Ontology not found"))?;
+                env.get_graph(&graphid)?
+            };
+
+            let fmt = match format
+                .as_deref()
+                .unwrap_or("turtle")
+                .to_ascii_lowercase()
+                .as_str()
+            {
+                "turtle" | "ttl" => RdfFormat::Turtle,
+                "ntriples" | "nt" => RdfFormat::NTriples,
+                "rdfxml" | "xml" => RdfFormat::RdfXml,
+                "jsonld" | "json-ld" => RdfFormat::JsonLd {
+                    profile: JsonLdProfileSet::default(),
+                },
+                other => {
+                    return Err(anyhow::anyhow!(
+                        "Unsupported format '{}'. Use one of: turtle, ntriples, rdfxml, jsonld",
+                        other
+                    ))
+                }
+            };
+
+            if let Some(path) = output {
+                let mut file = std::fs::File::create(path)?;
+                let mut serializer =
+                    oxigraph::io::RdfSerializer::from_format(fmt).for_writer(&mut file);
+                for t in graph.iter() {
+                    serializer.serialize_triple(t)?;
+                }
+                serializer.finish()?;
+            } else {
+                let stdout = std::io::stdout();
+                let mut handle = stdout.lock();
+                let mut serializer =
+                    oxigraph::io::RdfSerializer::from_format(fmt).for_writer(&mut handle);
+                for t in graph.iter() {
+                    serializer.serialize_triple(t)?;
+                }
+                serializer.finish()?;
+            }
+        }
+        Commands::Version => {
+            println!(
+                "ontoenv {} @ {}",
+                env!("CARGO_PKG_VERSION"),
+                env!("GIT_HASH")
+            );
+        }
+        Commands::Status { json } => {
+            let env = require_ontoenv(env)?;
+            if json {
+                // Recompute status details similar to env.status()
+                let ontoenv_dir = current_dir()?.join(".ontoenv");
+                let last_updated: Option<std::time::SystemTime> = if ontoenv_dir.exists() {
+                    Some(std::fs::metadata(&ontoenv_dir)?.modified()?)
+                } else {
+                    None
+                };
+                let size: u64 = if ontoenv_dir.exists() {
+                    walkdir::WalkDir::new(&ontoenv_dir)
+                        .into_iter()
+                        .filter_map(Result::ok)
+                        .filter(|e| e.file_type().is_file())
+                        .filter_map(|e| e.metadata().ok())
+                        .map(|m| m.len())
+                        .sum()
+                } else {
+                    0
+                };
+                let missing: Vec<String> = env
+                    .missing_imports()
+                    .into_iter()
+                    .map(|n| n.to_uri_string())
+                    .collect();
+                let last_str =
+                    last_updated.map(|t| chrono::DateTime::<chrono::Utc>::from(t).to_rfc3339());
+                let obj = serde_json::json!({
+                    "exists": true,
+                    "num_ontologies": env.ontologies().len(),
+                    "last_updated": last_str,
+                    "store_size_bytes": size,
+                    "missing_imports": missing,
+                });
+                println!("{}", serde_json::to_string_pretty(&obj)?);
+            } else {
+                let status = env.status()?;
+                println!("{status}");
+            }
+        }
+        Commands::Update { quiet, all, json } => {
+            let mut env = require_ontoenv(env)?;
+            let updated = env.update_all(all)?;
+            if json {
+                let arr: Vec<String> = updated.iter().map(|id| id.to_uri_string()).collect();
+                println!("{}", serde_json::to_string_pretty(&arr)?);
+            } else if !quiet {
+                for id in updated {
+                    if let Some(ont) = env.ontologies().get(&id) {
+                        let name = ont.name().to_string();
+                        let loc = ont
+                            .location()
+                            .map(|l| l.to_string())
+                            .unwrap_or_else(|| "N/A".to_string());
+                        println!("{} @ {}", name, loc);
+                    }
+                }
+            }
+            env.save_to_directory()?;
+        }
+        Commands::Closure {
+            ontology,
+            no_rewrite_sh_prefixes,
+            keep_owl_imports,
+            destination,
+            recursion_depth,
+        } => {
+            // make ontology an IRI
+            let iri = NamedNode::new(ontology).map_err(|e| anyhow::anyhow!(e.to_string()))?;
+            let env = require_ontoenv(env)?;
+            let graphid = env
+                .resolve(ResolveTarget::Graph(iri.clone()))
+                .ok_or(anyhow::anyhow!(format!("Ontology {} not found", iri)))?;
+            let closure = env.get_closure(&graphid, recursion_depth)?;
+            // Defaults: rewrite prefixes = ON, remove owl:imports = ON; flags disable these.
+            let rewrite = !no_rewrite_sh_prefixes;
+            let remove = !keep_owl_imports;
+            let union = env.get_union_graph(&closure, Some(rewrite), Some(remove))?;
+            if let Some(failed_imports) = union.failed_imports {
+                for imp in failed_imports {
+                    eprintln!("{imp}");
+                }
+            }
+            // write the graph to a file
+            let destination = destination.unwrap_or_else(|| "output.ttl".to_string());
+            write_dataset_to_file(&union.dataset, &destination)?;
+        }
+        Commands::Add {
+            location,
+            no_imports,
+        } => {
+            let location = if location.starts_with("http") {
+                OntologyLocation::Url(location)
+            } else {
+                OntologyLocation::File(PathBuf::from(location))
+            };
+            let mut env = require_ontoenv(env)?;
+            if no_imports {
+                let _ =
+                    env.add_no_imports(location, Overwrite::Allow, RefreshStrategy::UseCache)?;
+            } else {
+                let _ = env.add(location, Overwrite::Allow, RefreshStrategy::UseCache)?;
+            }
+        }
+        Commands::List { list_cmd, json } => {
+            let env = require_ontoenv(env)?;
+            match list_cmd {
+                ListCommands::Locations => {
+                    let mut locations = env.find_files()?;
+                    locations.sort_by(|a, b| a.as_str().cmp(b.as_str()));
+                    if json {
+                        println!("{}", serde_json::to_string_pretty(&locations)?);
+                    } else {
+                        for loc in locations {
+                            println!("{}", loc);
+                        }
+                    }
+                }
+                ListCommands::Ontologies => {
+                    // print list of ontology URLs from env.ontologies.values() sorted alphabetically
+                    let mut ontologies: Vec<&GraphIdentifier> = env.ontologies().keys().collect();
+                    ontologies.sort_by(|a, b| a.name().cmp(&b.name()));
+                    ontologies.dedup_by(|a, b| a.name() == b.name());
+                    if json {
+                        let out: Vec<String> =
+                            ontologies.into_iter().map(|o| o.to_uri_string()).collect();
+                        println!("{}", serde_json::to_string_pretty(&out)?);
+                    } else {
+                        for ont in ontologies {
+                            println!("{}", ont.to_uri_string());
+                        }
+                    }
+                }
+                ListCommands::Missing => {
+                    let mut missing_imports = env.missing_imports();
+                    missing_imports.sort();
+                    if json {
+                        let out: Vec<String> = missing_imports
+                            .into_iter()
+                            .map(|n| n.to_uri_string())
+                            .collect();
+                        println!("{}", serde_json::to_string_pretty(&out)?);
+                    } else {
+                        for import in missing_imports {
+                            println!("{}", import.to_uri_string());
+                        }
+                    }
+                }
+            }
+        }
+        Commands::Dump { contains } => {
+            let env = require_ontoenv(env)?;
+            env.dump(contains.as_deref());
+        }
+        Commands::DepGraph { roots, output } => {
+            let env = require_ontoenv(env)?;
+            let dot = if let Some(roots) = roots {
+                let roots: Vec<GraphIdentifier> = roots
+                    .iter()
+                    .map(|iri| {
+                        env.resolve(ResolveTarget::Graph(NamedNode::new(iri).unwrap()))
+                            .unwrap()
+                            .clone()
+                    })
+                    .collect();
+                env.rooted_dep_graph_to_dot(roots)?
+            } else {
+                env.dep_graph_to_dot()?
+            };
+            // call graphviz to generate PDF
+            let dot_path = current_dir()?.join("dep_graph.dot");
+            std::fs::write(&dot_path, dot)?;
+            let output_path = output.unwrap_or_else(|| "dep_graph.pdf".to_string());
+            let output = std::process::Command::new("dot")
+                .args(["-Tpdf", dot_path.to_str().unwrap(), "-o", &output_path])
+                .output()?;
+            if !output.status.success() {
+                return Err(anyhow::anyhow!(
+                    "Failed to generate PDF: {}",
+                    String::from_utf8_lossy(&output.stderr)
+                ));
+            }
+        }
+        Commands::Why { ontologies, json } => {
+            let env = require_ontoenv(env)?;
+            if json {
+                let mut all: BTreeMap<String, Vec<Vec<String>>> = BTreeMap::new();
+                for ont in ontologies {
+                    let iri = NamedNode::new(ont).map_err(|e| anyhow::anyhow!(e.to_string()))?;
+                    let (paths, missing) = match env.explain_import(&iri)? {
+                        ontoenv::api::ImportPaths::Present(paths) => (paths, false),
+                        ontoenv::api::ImportPaths::Missing { importers } => (importers, true),
+                    };
+                    let formatted = format_import_paths(&iri, paths, missing);
+                    all.insert(iri.to_uri_string(), formatted);
+                }
+                println!("{}", serde_json::to_string_pretty(&all)?);
+            } else {
+                for ont in ontologies {
+                    let iri = NamedNode::new(ont).map_err(|e| anyhow::anyhow!(e.to_string()))?;
+                    match env.explain_import(&iri)? {
+                        ontoenv::api::ImportPaths::Present(paths) => {
+                            print_import_paths(&iri, paths, false);
+                        }
+                        ontoenv::api::ImportPaths::Missing { importers } => {
+                            print_import_paths(&iri, importers, true);
+                        }
+                    }
+                }
+            }
+        }
+        Commands::Doctor { json } => {
+            let env = require_ontoenv(env)?;
+            let problems = env.doctor()?;
+            if json {
+                let out: Vec<serde_json::Value> = problems
+                    .into_iter()
+                    .map(|p| serde_json::json!({
+                        "message": p.message,
+                        "locations": p.locations.into_iter().map(|loc| loc.to_string()).collect::<Vec<String>>()
+                    }))
+                    .collect();
+                println!("{}", serde_json::to_string_pretty(&out)?);
+            } else if problems.is_empty() {
+                println!("No issues found.");
+            } else {
+                println!("Found {} issues:", problems.len());
+                for problem in problems {
+                    println!("- {}", problem.message);
+                    for location in problem.locations {
+                        println!("  - {location}");
+                    }
+                }
+            }
+        }
+        Commands::Config(config_cmd) => {
+            handle_config_command(config_cmd, cmd.temporary)?;
+        }
+        Commands::Reset { .. } => {
+            // This command is handled before the environment is loaded.
+        }
+    }
+
+    Ok(())
+}
+
+fn require_ontoenv(env: Option<OntoEnv>) -> Result<OntoEnv> {
+    env.ok_or_else(|| {
+        anyhow::anyhow!("OntoEnv not found. Run `ontoenv init` to create a new OntoEnv or use -t/--temporary to use a temporary environment.")
+    })
+}
+
+fn format_import_paths(
+    target: &NamedNode,
+    paths: Vec<Vec<GraphIdentifier>>,
+    missing: bool,
+) -> Vec<Vec<String>> {
+    let mut unique: BTreeSet<Vec<String>> = BTreeSet::new();
+    if paths.is_empty() {
+        if missing {
+            unique.insert(vec![format!("{} (missing)", target.to_uri_string())]);
+        }
+        return unique.into_iter().collect();
+    }
+    for path in paths {
+        let mut entries: Vec<String> = path.into_iter().map(|id| id.to_uri_string()).collect();
+        if missing {
+            entries.push(format!("{} (missing)", target.to_uri_string()));
+        }
+        unique.insert(entries);
+    }
+    unique.into_iter().collect()
+}
+
+fn print_import_paths(target: &NamedNode, paths: Vec<Vec<GraphIdentifier>>, missing: bool) {
+    if paths.is_empty() {
+        if missing {
+            println!(
+                "Ontology {} is missing but no importers reference it.",
+                target.to_uri_string()
+            );
+        } else {
+            println!("No importers found for {}", target.to_uri_string());
+        }
+        return;
+    }
+
+    println!(
+        "Why {}{}:",
+        target.to_uri_string(),
+        if missing { " (missing)" } else { "" }
+    );
+
+    let mut lines: BTreeSet<String> = BTreeSet::new();
+    for path in paths {
+        let mut segments: Vec<String> = path.into_iter().map(|id| id.to_uri_string()).collect();
+        if missing {
+            segments.push(format!("{} (missing)", target.to_uri_string()));
+        }
+        lines.insert(segments.join(" -> "));
+    }
+
+    for line in lines {
+        println!("{}", line);
+    }
+}
diff --git a/cli/src/main.rs b/cli/src/main.rs
index 5c98bea..04d9411 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -1,651 +1,5 @@
 use anyhow::Result;
-use clap::{Parser, Subcommand};
-use log::info;
-use ontoenv::api::{OntoEnv, ResolveTarget};
-use ontoenv::config::Config;
-use ontoenv::ontology::{GraphIdentifier, OntologyLocation};
-use ontoenv::util::write_dataset_to_file;
-use ontoenv::ToUriString;
-use oxigraph::model::NamedNode;
-use serde_json;
-use std::env::current_dir;
-use std::path::PathBuf;
-
-#[derive(Debug, Parser)]
-#[command(name = "ontoenv")]
-#[command(about = "Ontology environment manager")]
-#[command(arg_required_else_help = true)]
-struct Cli {
-    #[command(subcommand)]
-    command: Commands,
-    /// Verbose mode - sets the RUST_LOG level to info, defaults to warning level
-    #[clap(long, short, action, default_value = "false", global = true)]
-    verbose: bool,
-    /// Debug mode - sets the RUST_LOG level to debug, defaults to warning level
-    #[clap(long, action, default_value = "false", global = true)]
-    debug: bool,
-    /// Resolution policy for determining which ontology to use when there are multiple with the same name
-    #[clap(long, short, default_value = "default", global = true)]
-    policy: Option<String>,
-    /// Temporary (non-persistent) mode - will not save the environment to disk
-    #[clap(long, short, action, global = true)]
-    temporary: bool,
-    /// Require ontology names to be unique; will raise an error if multiple ontologies have the same name
-    #[clap(long, action, global = true)]
-    require_ontology_names: bool,
-    /// Strict mode - will raise an error if an ontology is not found
-    #[clap(long, action, default_value = "false", global = true)]
-    strict: bool,
-    /// Offline mode - will not attempt to fetch ontologies from the web
-    #[clap(long, short, action, default_value = "false", global = true)]
-    offline: bool,
-    /// Glob patterns for which files to include, defaults to ['*.ttl','*.xml','*.n3']
-    #[clap(long, short, num_args = 1.., global = true)]
-    includes: Vec<String>,
-    /// Glob patterns for which files to exclude, defaults to []
-    #[clap(long, short, num_args = 1.., global = true)]
-    excludes: Vec<String>,
-    /// Do not search for ontologies in the search directories
-    #[clap(long = "no-search", short = 'n', action, global = true)]
-    no_search: bool,
-    /// Directories to search for ontologies. If not provided, the current directory is used.
-    #[clap(global = true)]
-    locations: Option<Vec<String>>,
-}
-
-#[derive(Debug, Subcommand)]
-enum ConfigCommands {
-    /// Set a configuration value.
-    Set {
-        /// The configuration key to set.
-        key: String,
-        /// The value to set for the key.
-        value: String,
-    },
-    /// Get a configuration value.
-    Get {
-        /// The configuration key to get.
-        key: String,
-    },
-    /// Unset a configuration value, reverting to its default.
-    Unset {
-        /// The configuration key to unset.
-        key: String,
-    },
-    /// Add a value to a list-based configuration key.
-    Add {
-        /// The configuration key to add to.
-        key: String,
-        /// The value to add.
-        value: String,
-    },
-    /// Remove a value from a list-based configuration key.
-    Remove {
-        /// The configuration key to remove from.
-        key: String,
-        /// The value to remove.
-        value: String,
-    },
-    /// List all configuration values.
-    List,
-}
-
-#[derive(Debug, Subcommand)]
-enum ListCommands {
-    /// List all ontology locations found in the search paths
-    Locations,
-    /// List all declared ontologies in the environment
-    Ontologies,
-    /// List all missing imports
-    Missing,
-}
-
-#[derive(Debug, Subcommand)]
-enum Commands {
-    /// Create a new ontology environment
-    Init {
-        /// Overwrite the environment if it already exists
-        #[clap(long, default_value = "false")]
-        overwrite: bool,
-    },
-    /// Prints the version of the ontoenv binary
-    Version,
-    /// Prints the status of the ontology environment
-    Status,
-    /// Update the ontology environment
-    Update {
-        /// Suppress per-ontology update output
-        #[clap(long, short = 'q', action)]
-        quiet: bool,
-        /// Update all ontologies, ignoring modification times
-        #[clap(long, short = 'a', action)]
-        all: bool,
-    },
-    /// Compute the owl:imports closure of an ontology and write it to a file
-    Closure {
-        /// The name (URI) of the ontology to compute the closure for
-        ontology: String,
-        /// Rewrite the sh:prefixes declarations to point to the chosen ontology, defaults to true
-        #[clap(long, short, action, default_value = "true")]
-        rewrite_sh_prefixes: Option<bool>,
-        /// Remove owl:imports statements from the closure, defaults to true
-        #[clap(long, short, action, default_value = "true")]
-        remove_owl_imports: Option<bool>,
-        /// The file to write the closure to, defaults to 'output.ttl'
-        destination: Option<String>,
-        /// The recursion depth for exploring owl:imports. <0: unlimited, 0: no imports, >0:
-        /// specific depth.
-        #[clap(long, default_value = "-1")]
-        recursion_depth: i32,
-    },
-    /// Add an ontology to the environment
-    Add {
-        /// The location of the ontology to add (file path or URL)
-        location: String,
-        /// Do not explore owl:imports of the added ontology
-        #[clap(long, action)]
-        no_imports: bool,
-    },
-    /// List various properties of the environment
-    #[command(subcommand)]
-    List(ListCommands),
-    // TODO: dump all ontologies; nest by ontology name (sorted), w/n each ontology name list all
-    // the places where that graph can be found. List basic stats: the metadata field in the
-    // Ontology struct and # of triples in the graph; last updated; etc
-    /// Print out the current state of the ontology environment
-    Dump {
-        /// Filter the output to only include ontologies that contain the given string in their
-        /// name. Leave empty to include all ontologies.
-        contains: Option<String>,
-    },
-    /// Generate a PDF of the dependency graph
-    DepGraph {
-        /// The root ontologies to start the graph from. Given by name (URI)
-        roots: Option<Vec<String>>,
-        /// The output file to write the PDF to, defaults to 'dep_graph.pdf'
-        #[clap(long, short)]
-        output: Option<String>,
-    },
-    /// Lists which ontologies import the given ontology
-    Why {
-        /// The name (URI) of the ontology to find importers for
-        ontologies: Vec<String>,
-    },
-    /// Run the doctor to check the environment for issues
-    Doctor,
-    /// Reset the ontology environment by removing the .ontoenv directory
-    Reset {
-        #[clap(long, short, action = clap::ArgAction::SetTrue, default_value = "false")]
-        force: bool,
-    },
-    /// Manage ontoenv configuration.
-    #[command(subcommand)]
-    Config(ConfigCommands),
-}
-
-impl ToString for Commands {
-    fn to_string(&self) -> String {
-        match self {
-            Commands::Init { .. } => "Init".to_string(),
-            Commands::Version => "Version".to_string(),
-            Commands::Status => "Status".to_string(),
-            Commands::Update { .. } => "Update".to_string(),
-            Commands::Closure { .. } => "Closure".to_string(),
-            Commands::Add { .. } => "Add".to_string(),
-            Commands::List(..) => "List".to_string(),
-            Commands::Dump { .. } => "Dump".to_string(),
-            Commands::DepGraph { .. } => "DepGraph".to_string(),
-            Commands::Why { .. } => "Why".to_string(),
-            Commands::Doctor => "Doctor".to_string(),
-            Commands::Reset { .. } => "Reset".to_string(),
-            Commands::Config { .. } => "Config".to_string(),
-        }
-    }
-}
-
-fn handle_config_command(config_cmd: ConfigCommands, temporary: bool) -> Result<()> {
-    if temporary {
-        return Err(anyhow::anyhow!("Cannot manage config in temporary mode."));
-    }
-    let root = ontoenv::api::find_ontoenv_root()
-        .ok_or_else(|| anyhow::anyhow!("Not in an ontoenv. Use `ontoenv init` to create one."))?;
-    let config_path = root.join(".ontoenv").join("ontoenv.json");
-    if !config_path.exists() {
-        return Err(anyhow::anyhow!(
-            "No ontoenv.json found. Use `ontoenv init`."
-        ));
-    }
-
-    match config_cmd {
-        ConfigCommands::List => {
-            let config_str = std::fs::read_to_string(&config_path)?;
-            let config_json: serde_json::Value = serde_json::from_str(&config_str)?;
-            let pretty_json = serde_json::to_string_pretty(&config_json)?;
-            println!("{}", pretty_json);
-            return Ok(());
-        }
-        ConfigCommands::Get { ref key } => {
-            let config_str = std::fs::read_to_string(&config_path)?;
-            let config_json: serde_json::Value = serde_json::from_str(&config_str)?;
-            let object = config_json
-                .as_object()
-                .ok_or_else(|| anyhow::anyhow!("Invalid config format: not a JSON object."))?;
-
-            if let Some(value) = object.get(key) {
-                if let Some(s) = value.as_str() {
-                    println!("{}", s);
-                } else if let Some(arr) = value.as_array() {
-                    for item in arr {
-                        if let Some(s) = item.as_str() {
-                            println!("{}", s);
-                        } else {
-                            println!("{}", item);
-                        }
-                    }
-                } else {
-                    println!("{}", value);
-                }
-            } else {
-                println!("Configuration key '{}' not set.", key);
-            }
-            return Ok(());
-        }
-        _ => {}
-    }
-
-    // Modifying commands continue here.
-    let config_str = std::fs::read_to_string(&config_path)?;
-    let mut config_json: serde_json::Value = serde_json::from_str(&config_str)?;
-
-    let object = config_json
-        .as_object_mut()
-        .ok_or_else(|| anyhow::anyhow!("Invalid config format: not a JSON object."))?;
-
-    match config_cmd {
-        ConfigCommands::Set { key, value } => {
-            match key.as_str() {
-                "offline" | "strict" | "require_ontology_names" | "no_search" => {
-                    let bool_val = value.parse::<bool>().map_err(|_| {
-                        anyhow::anyhow!("Invalid boolean value for {}: {}", key, value)
-                    })?;
-                    object.insert(key.to_string(), serde_json::Value::Bool(bool_val));
-                }
-                "resolution_policy" => {
-                    object.insert(key.to_string(), serde_json::Value::String(value.clone()));
-                }
-                "locations" | "includes" | "excludes" => {
-                    return Err(anyhow::anyhow!(
-                        "Use `ontoenv config add/remove {} <value>` to modify list values.",
-                        key
-                    ));
-                }
-                _ => {
-                    return Err(anyhow::anyhow!(
-                        "Setting configuration for '{}' is not supported.",
-                        key
-                    ));
-                }
-            }
-            println!("Set {} to {}", key, value);
-        }
-        ConfigCommands::Unset { key } => {
-            if object.remove(&key).is_some() {
-                println!("Unset '{}'.", key);
-            } else {
-                return Err(anyhow::anyhow!("Configuration key '{}' not set.", key));
-            }
-        }
-        ConfigCommands::Add { key, value } => {
-            match key.as_str() {
-                "locations" | "includes" | "excludes" => {
-                    let entry = object
-                        .entry(key.clone())
-                        .or_insert_with(|| serde_json::Value::Array(vec![]));
-                    if let Some(arr) = entry.as_array_mut() {
-                        let new_val = serde_json::Value::String(value.clone());
-                        if !arr.contains(&new_val) {
-                            arr.push(new_val);
-                        } else {
-                            println!("Value '{}' already exists in {}.", value, key);
-                            return Ok(());
-                        }
-                    }
-                }
-                _ => {
-                    return Err(anyhow::anyhow!(
-                        "Cannot add to configuration key '{}'. It is not a list.",
-                        key
-                    ));
-                }
-            }
-            println!("Added '{}' to {}", value, key);
-        }
-        ConfigCommands::Remove { key, value } => {
-            match key.as_str() {
-                "locations" | "includes" | "excludes" => {
-                    if let Some(entry) = object.get_mut(&key) {
-                        if let Some(arr) = entry.as_array_mut() {
-                            let val_to_remove = serde_json::Value::String(value.clone());
-                            if let Some(pos) = arr.iter().position(|x| *x == val_to_remove) {
-                                arr.remove(pos);
-                            } else {
-                                return Err(anyhow::anyhow!(
-                                    "Value '{}' not found in {}",
-                                    value,
-                                    key
-                                ));
-                            }
-                        }
-                    } else {
-                        return Err(anyhow::anyhow!("Configuration key '{}' not set.", key));
-                    }
-                }
-                _ => {
-                    return Err(anyhow::anyhow!(
-                        "Cannot remove from configuration key '{}'. It is not a list.",
-                        key
-                    ));
-                }
-            }
-            println!("Removed '{}' from {}", value, key);
-        }
-        _ => unreachable!(), // Get and List are handled above
-    }
-
-    let new_config_str = serde_json::to_string_pretty(&config_json)?;
-    std::fs::write(config_path, new_config_str)?;
-
-    Ok(())
-}
 fn main() -> Result<()> {
-    ontoenv::api::init_logging();
-    let cmd = Cli::parse();
-
-    // The RUST_LOG env var is set by `init_logging` if ONTOENV_LOG is present.
-    // CLI flags for verbosity take precedence. If nothing is set, we default to "warn".
-    if cmd.debug {
-        std::env::set_var("RUST_LOG", "debug");
-    } else if cmd.verbose {
-        std::env::set_var("RUST_LOG", "info");
-    } else if std::env::var("RUST_LOG").is_err() {
-        // If no CLI flags and no env var is set, default to "warn".
-        std::env::set_var("RUST_LOG", "warn");
-    }
-    env_logger::init();
-
-    let policy = cmd.policy.unwrap_or_else(|| "default".to_string());
-
-    let mut builder = Config::builder()
-        .root(current_dir()?)
-        .require_ontology_names(cmd.require_ontology_names)
-        .strict(cmd.strict)
-        .offline(cmd.offline)
-        .resolution_policy(policy)
-        .temporary(cmd.temporary)
-        .no_search(cmd.no_search);
-
-    if let Some(locations) = cmd.locations {
-        builder = builder.locations(locations);
-    }
-    // only set includes if they are provided on the command line, otherwise use builder defaults
-    if !cmd.includes.is_empty() {
-        builder = builder.includes(&cmd.includes);
-    }
-    if !cmd.excludes.is_empty() {
-        builder = builder.excludes(&cmd.excludes);
-    }
-
-    let config: Config = builder.build()?;
-
-    if cmd.verbose || cmd.debug {
-        config.print();
-    }
-
-    if let Commands::Reset { force } = &cmd.command {
-        if let Some(root) = ontoenv::api::find_ontoenv_root() {
-            let path = root.join(".ontoenv");
-            println!("Removing .ontoenv directory at {}...", path.display());
-            if !*force {
-                // confirm deletion? [y/N]
-                let mut input = String::new();
-                println!("Are you sure you want to delete the .ontoenv directory? [y/N] ");
-                std::io::stdin()
-                    .read_line(&mut input)
-                    .expect("Failed to read line");
-                let input = input.trim();
-                if input != "y" && input != "Y" {
-                    println!("Aborting...");
-                    return Ok(());
-                }
-            }
-            OntoEnv::reset()?;
-            println!(".ontoenv directory removed.");
-        } else {
-            println!("No .ontoenv directory found. Nothing to do.");
-        }
-        return Ok(());
-    }
-
-    let ontoenv_exists = ontoenv::api::find_ontoenv_root()
-        .map(|root| root.join(".ontoenv").join("ontoenv.json").exists())
-        .unwrap_or(false);
-    info!("OntoEnv exists: {ontoenv_exists}");
-
-    // create the env object to use in the subcommand.
-    // - if temporary is true, create a new env object each time
-    // - if temporary is false, load the env from the .ontoenv directory if it exists
-    let env: Option<OntoEnv> = if cmd.temporary {
-        // Create a new OntoEnv object in temporary mode
-        let e = OntoEnv::init(config.clone(), false)?;
-        Some(e)
-    } else if cmd.command.to_string() != "Init" && ontoenv_exists {
-        // if .ontoenv exists, load it
-        Some(OntoEnv::load_from_directory(current_dir()?, false)?) // no read-only
-    } else {
-        None
-    };
-    info!("OntoEnv loaded: {}", env.is_some());
-
-    match cmd.command {
-        Commands::Init { overwrite } => {
-            // if temporary, raise an error
-            if cmd.temporary {
-                return Err(anyhow::anyhow!(
-                    "Cannot initialize in temporary mode. Run `ontoenv init` without --temporary."
-                ));
-            }
-
-            let root = current_dir()?;
-            if root.join(".ontoenv").exists() && !overwrite {
-                println!(
-                    "An ontology environment already exists in: {}",
-                    root.display()
-                );
-                println!("Use --overwrite to re-initialize or `ontoenv update` to update.");
-
-                let env = OntoEnv::load_from_directory(root, false)?;
-                let status = env.status()?;
-                println!("\nCurrent status:");
-                println!("{status}");
-                return Ok(());
-            }
-
-            // The call to `init` will create and update the environment.
-            // `update` will also save it to the directory.
-            let _ = OntoEnv::init(config, overwrite)?;
-        }
-        Commands::Version => {
-            println!(
-                "ontoenv {} @ {}",
-                env!("CARGO_PKG_VERSION"),
-                env!("GIT_HASH")
-            );
-        }
-        Commands::Status => {
-            let env = require_ontoenv(env)?;
-            // load env from .ontoenv/ontoenv.json
-            let status = env.status()?;
-            // pretty print the status
-            println!("{status}");
-        }
-        Commands::Update { quiet, all } => {
-            let mut env = require_ontoenv(env)?;
-            let updated = env.update_all(all)?;
-            if !quiet {
-                for id in updated {
-                    if let Some(ont) = env.ontologies().get(&id) {
-                        let name = ont.name().to_string();
-                        let loc = ont
-                            .location()
-                            .map(|l| l.to_string())
-                            .unwrap_or_else(|| "N/A".to_string());
-                        println!("{} @ {}", name, loc);
-                    }
-                }
-            }
-            env.save_to_directory()?;
-        }
-        Commands::Closure {
-            ontology,
-            rewrite_sh_prefixes,
-            remove_owl_imports,
-            destination,
-            recursion_depth,
-        } => {
-            // make ontology an IRI
-            let iri = NamedNode::new(ontology).map_err(|e| anyhow::anyhow!(e.to_string()))?;
-            let env = require_ontoenv(env)?;
-            let graphid = env
-                .resolve(ResolveTarget::Graph(iri.clone()))
-                .ok_or(anyhow::anyhow!(format!("Ontology {} not found", iri)))?;
-            let closure = env.get_closure(&graphid, recursion_depth)?;
-            let union = env.get_union_graph(&closure, rewrite_sh_prefixes, remove_owl_imports)?;
-            if let Some(failed_imports) = union.failed_imports {
-                for imp in failed_imports {
-                    eprintln!("{imp}");
-                }
-            }
-            // write the graph to a file
-            let destination = destination.unwrap_or_else(|| "output.ttl".to_string());
-            write_dataset_to_file(&union.dataset, &destination)?;
-        }
-        Commands::Add {
-            location,
-            no_imports,
-        } => {
-            let location = if location.starts_with("http") {
-                OntologyLocation::Url(location)
-            } else {
-                OntologyLocation::File(PathBuf::from(location))
-            };
-            let mut env = require_ontoenv(env)?;
-            if no_imports {
-                let _ = env.add_no_imports(location, true)?;
-            } else {
-                let _ = env.add(location, true)?;
-            }
-        }
-        Commands::List(list_cmd) => {
-            let env = require_ontoenv(env)?;
-            match list_cmd {
-                ListCommands::Locations => {
-                    let mut locations = env.find_files()?;
-                    locations.sort_by(|a, b| a.as_str().cmp(b.as_str()));
-                    for loc in locations {
-                        println!("{}", loc);
-                    }
-                }
-                ListCommands::Ontologies => {
-                    // print the ontology URLs from env.ontologies().keys(), sorted alphabetically
-                    let mut ontologies: Vec<&GraphIdentifier> = env.ontologies().keys().collect();
-                    ontologies.sort_by(|a, b| a.name().cmp(&b.name()));
-                    ontologies.dedup_by(|a, b| a.name() == b.name());
-                    for ont in ontologies {
-                        println!("{}", ont.to_uri_string());
-                    }
-                }
-                ListCommands::Missing => {
-                    let mut missing_imports = env.missing_imports();
-                    missing_imports.sort();
-                    for import in missing_imports {
-                        println!("{}", import.to_uri_string());
-                    }
-                }
-            }
-        }
-        Commands::Dump { contains } => {
-            let env = require_ontoenv(env)?;
-            env.dump(contains.as_deref());
-        }
-        Commands::DepGraph { roots, output } => {
-            let env = require_ontoenv(env)?;
-            let dot = if let Some(roots) = roots {
-                let roots: Vec<GraphIdentifier> = roots
-                    .iter()
-                    .map(|iri| {
-                        env.resolve(ResolveTarget::Graph(NamedNode::new(iri).unwrap()))
-                            .unwrap()
-                            .clone()
-                    })
-                    .collect();
-                env.rooted_dep_graph_to_dot(roots)?
-            } else {
-                env.dep_graph_to_dot()?
-            };
-            // call graphviz to generate PDF
-            let dot_path = current_dir()?.join("dep_graph.dot");
-            std::fs::write(&dot_path, dot)?;
-            let output_path = output.unwrap_or_else(|| "dep_graph.pdf".to_string());
-            let output = std::process::Command::new("dot")
-                .args(["-Tpdf", dot_path.to_str().unwrap(), "-o", &output_path])
-                .output()?;
-            if !output.status.success() {
-                return Err(anyhow::anyhow!(
-                    "Failed to generate PDF: {}",
-                    String::from_utf8_lossy(&output.stderr)
-                ));
-            }
-        }
-        Commands::Why { ontologies } => {
-            let env = require_ontoenv(env)?;
-            for ont in ontologies {
-                let iri = NamedNode::new(ont).map_err(|e| anyhow::anyhow!(e.to_string()))?;
-                let importers = env.get_importers(&iri)?;
-                println!("Imported by {}: ", iri.to_uri_string());
-                for dep in importers {
-                    println!("{}", dep.to_uri_string());
-                }
-            }
-        }
-        Commands::Doctor => {
-            let env = require_ontoenv(env)?;
-            let problems = env.doctor()?;
-            if problems.is_empty() {
-                println!("No issues found.");
-            } else {
-                println!("Found {} issues:", problems.len());
-                for problem in problems {
-                    println!("- {}", problem.message);
-                    for location in problem.locations {
-                        println!("  - {location}");
-                    }
-                }
-            }
-        }
-        Commands::Config(config_cmd) => {
-            handle_config_command(config_cmd, cmd.temporary)?;
-        }
-        Commands::Reset { .. } => {
-            // This command is handled before the environment is loaded.
-        }
-    }
-
-    Ok(())
-}
-
-fn require_ontoenv(env: Option<OntoEnv>) -> Result<OntoEnv> {
-    env.ok_or_else(|| {
-        anyhow::anyhow!("OntoEnv not found. Run `ontoenv init` to create a new OntoEnv or use -t/--temporary to use a temporary environment.")
-    })
+    ontoenv_cli::run()
 }
diff --git a/cli/tests/cli_integration.rs b/cli/tests/cli_integration.rs
new file mode 100644
index 0000000..bbaf80b
--- /dev/null
+++ b/cli/tests/cli_integration.rs
@@ -0,0 +1,321 @@
+use std::fs;
+use std::path::PathBuf;
+use std::process::Command;
+
+fn ontoenv_bin() -> PathBuf {
+    let mut p = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("target")
+        .join("debug")
+        .join(if cfg!(windows) {
+            "ontoenv.exe"
+        } else {
+            "ontoenv"
+        });
+    if !p.exists() {
+        p = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("..")
+            .join("target")
+            .join("release")
+            .join(if cfg!(windows) {
+                "ontoenv.exe"
+            } else {
+                "ontoenv"
+            });
+    }
+    assert!(p.exists(), "ontoenv binary not found at {:?}", p);
+    p
+}
+
+fn tmp_dir(name: &str) -> PathBuf {
+    let mut d = std::env::current_dir().unwrap();
+    d.push(format!(
+        "target/cli_integration_{}_{}",
+        name,
+        std::process::id()
+    ));
+    if d.exists() {
+        let _ = fs::remove_dir_all(&d);
+    }
+    fs::create_dir_all(&d).unwrap();
+    d
+}
+
+fn write_ttl(path: &PathBuf, ontology_uri: &str, extra: &str) {
+    let content = format!(
+        "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n\
+         @prefix owl: <http://www.w3.org/2002/07/owl#> .\n\
+         <{uri}> a owl:Ontology .\n\
+         {extra}\n",
+        uri = ontology_uri,
+        extra = extra
+    );
+    fs::write(path, content).expect("write ttl");
+}
+
+// Git-like semantics
+#[test]
+fn non_init_command_errors_outside_env() {
+    let exe = ontoenv_bin();
+    let root = tmp_dir("noenv");
+    let out = Command::new(&exe)
+        .current_dir(&root)
+        .env("ONTOENV_DIR", &root)
+        .arg("list")
+        .arg("ontologies")
+        .output()
+        .expect("run list");
+    assert!(!out.status.success(), "expected failure outside env");
+}
+
+#[test]
+fn discovery_from_subdirectory() {
+    let exe = ontoenv_bin();
+    let root = tmp_dir("discover");
+    let out = Command::new(&exe)
+        .current_dir(&root)
+        .arg("init")
+        .output()
+        .expect("run init");
+    assert!(
+        out.status.success(),
+        "init failed: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );
+    let nested = root.join("nested");
+    fs::create_dir_all(&nested).unwrap();
+    let out = Command::new(&exe)
+        .current_dir(&nested)
+        .arg("list")
+        .arg("ontologies")
+        .output()
+        .expect("run list");
+    assert!(
+        out.status.success(),
+        "list failed in subdir: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );
+}
+
+#[test]
+fn ontoenv_dir_override() {
+    let exe = ontoenv_bin();
+    let env_root = tmp_dir("envdir");
+    let out = Command::new(&exe)
+        .current_dir(&env_root)
+        .arg("init")
+        .output()
+        .expect("run init");
+    assert!(
+        out.status.success(),
+        "init failed: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );
+    let elsewhere = tmp_dir("elsewhere");
+    let out = Command::new(&exe)
+        .current_dir(&elsewhere)
+        .env("ONTOENV_DIR", env_root.join(".ontoenv"))
+        .arg("list")
+        .arg("ontologies")
+        .output()
+        .expect("run list");
+    assert!(
+        out.status.success(),
+        "list failed with ONTOENV_DIR: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );
+}
+
+// Why subcommand integration
+#[test]
+fn why_lists_importers_paths() {
+    let exe = ontoenv_bin();
+    let root = tmp_dir("why");
+    // three ontologies: C imports A; A imports B
+    let a_uri = "http://example.org/ont/A";
+    let b_uri = "http://example.org/ont/B";
+    let c_uri = "http://example.org/ont/C";
+    let a_path = root.join("A.ttl");
+    let b_path = root.join("B.ttl");
+    let c_path = root.join("C.ttl");
+    write_ttl(&b_path, b_uri, "");
+    write_ttl(
+        &a_path,
+        a_uri,
+        &format!("<{}> owl:imports <{}> .", a_uri, b_uri),
+    );
+    write_ttl(
+        &c_path,
+        c_uri,
+        &format!("<{}> owl:imports <{}> .", c_uri, a_uri),
+    );
+
+    // init
+    let out = Command::new(&exe)
+        .current_dir(&root)
+        .arg("init")
+        .output()
+        .expect("run init");
+    assert!(out.status.success());
+
+    // why B should show A->B and C->A->B
+    let out = Command::new(&exe)
+        .current_dir(&root)
+        .arg("why")
+        .arg(b_uri)
+        .output()
+        .expect("run why");
+    assert!(out.status.success());
+    let stdout = String::from_utf8_lossy(&out.stdout);
+    assert!(stdout.contains(&format!("{} -> {}", a_uri, b_uri)));
+    assert!(stdout.contains(&format!("{} -> {} -> {}", c_uri, a_uri, b_uri)));
+}
+
+// Get command: default Turtle to STDOUT by IRI
+#[test]
+fn get_stdout_turtle() {
+    let exe = ontoenv_bin();
+    let root = tmp_dir("get_turtle");
+    let iri = "http://example.org/ont/Only";
+    let path = root.join("only.ttl");
+    write_ttl(&path, iri, "");
+
+    // init
+    let out = Command::new(&exe)
+        .current_dir(&root)
+        .arg("init")
+        .output()
+        .expect("run init");
+    assert!(
+        out.status.success(),
+        "init failed: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );
+
+    // get to stdout
+    let out = Command::new(&exe)
+        .current_dir(&root)
+        .arg("get")
+        .arg(iri)
+        .output()
+        .expect("run get");
+    assert!(
+        out.status.success(),
+        "get failed: {}",
+        String::from_utf8_lossy(&out.stderr)
+    );
+    let stdout = String::from_utf8_lossy(&out.stdout);
+    // Expect to see the ontology triple in some form
+    assert!(
+        stdout.contains(iri),
+        "stdout did not contain IRI: {}",
+        stdout
+    );
+}
+
+// Get command: JSON-LD output
+#[test]
+fn get_jsonld_output() {
+    let exe = ontoenv_bin();
+    let root = tmp_dir("get_jsonld");
+    let iri = "http://example.org/ont/JL";
+    let path = root.join("jl.ttl");
+    write_ttl(&path, iri, "");
+
+    // init
+    let out = Command::new(&exe)
+        .current_dir(&root)
+        .arg("init")
+        .output()
+        .expect("run init");
+    assert!(out.status.success());
+
+    // get jsonld to stdout
+    let out = Command::new(&exe)
+        .current_dir(&root)
+        .arg("get")
+        .arg(iri)
+ .arg("--format") + .arg("jsonld") + .output() + .expect("run get jsonld"); + assert!( + out.status.success(), + "get jsonld failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + let stdout = String::from_utf8_lossy(&out.stdout); + assert!( + stdout.contains(iri), + "jsonld output missing iri; got: {}", + stdout + ); + assert!( + stdout.trim_start().starts_with("{") || stdout.trim_start().starts_with("["), + "not JSON-LD? {}", + stdout + ); +} + +// Get command: disambiguate with --location when same IRI at two locations +#[test] +fn get_with_location_disambiguates() { + let exe = ontoenv_bin(); + let root = tmp_dir("get_loc"); + let iri = "http://example.org/ont/Dup"; + let p1 = root.join("dup_v1.ttl"); + let p2 = root.join("dup_v2.ttl"); + // add distinguishing triples + write_ttl( + &p1, + iri, + " \"v1\" .", + ); + write_ttl( + &p2, + iri, + " \"v2\" .", + ); + + // init + let out = Command::new(&exe) + .current_dir(&root) + .arg("init") + .output() + .expect("run init"); + assert!(out.status.success()); + + // get with location pointing to v1 + let out = Command::new(&exe) + .current_dir(&root) + .arg("get") + .arg(iri) + .arg("--location") + .arg(p1.to_str().unwrap()) + .output() + .expect("run get v1"); + assert!( + out.status.success(), + "get v1 failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + let s1 = String::from_utf8_lossy(&out.stdout); + assert!(s1.contains("\"v1\""), "expected v1 triple, got: {}", s1); + + // get with location pointing to v2 + let out = Command::new(&exe) + .current_dir(&root) + .arg("get") + .arg(iri) + .arg("-l") + .arg(p2.to_str().unwrap()) + .output() + .expect("run get v2"); + assert!( + out.status.success(), + "get v2 failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + let s2 = String::from_utf8_lossy(&out.stdout); + assert!(s2.contains("\"v2\""), "expected v2 triple, got: {}", s2); +} diff --git a/lib/Cargo.toml b/lib/Cargo.toml index ded9de6..6ded123 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -25,6 +25,7 @@ clap.workspace = true derive_builder.workspace = true serde.workspace = true oxigraph.workspace = true +rdf5d.workspace = true lazy_static = "1.4.0" serde_with = "3.7.0" tempfile = "3.10.1" diff --git a/lib/src/api.rs b/lib/src/api.rs index a2697df..03b4eb9 100644 --- a/lib/src/api.rs +++ b/lib/src/api.rs @@ -2,13 +2,16 @@ //! This includes loading, saving, updating, and querying the environment. 
 use crate::config::Config;
-use crate::ToUriString;
-use crate::doctor::{ConflictingPrefixes, Doctor, DuplicateOntology, OntologyDeclaration, OntologyProblem};
+use crate::doctor::{
+    ConflictingPrefixes, Doctor, DuplicateOntology, OntologyDeclaration, OntologyProblem,
+};
 use crate::environment::Environment;
+use crate::options::{Overwrite, RefreshStrategy};
 use crate::transform;
+use crate::ToUriString;
 use crate::{EnvironmentStatus, FailedImport};
 use chrono::prelude::*;
-use oxigraph::model::{Dataset, Graph, NamedNode, NamedNodeRef, SubjectRef};
+use oxigraph::model::{Dataset, Graph, NamedNode, NamedNodeRef, NamedOrBlankNodeRef};
 use oxigraph::store::Store;
 use petgraph::visit::EdgeRef;
 use std::io::{BufReader, Write};
@@ -18,11 +21,18 @@ use std::path::PathBuf;
 use crate::io::GraphIO;
 use crate::ontology::{GraphIdentifier, Ontology, OntologyLocation};
 use anyhow::{anyhow, Result};
-use log::{error, info, warn};
+use log::{debug, error, info, warn};
 use petgraph::graph::{Graph as DiGraph, NodeIndex};
 use std::collections::{HashMap, HashSet, VecDeque};
 use std::fs;

+#[derive(Clone, Debug)]
+struct PendingImport {
+    location: OntologyLocation,
+    overwrite: Overwrite,
+    required: bool,
+}
+
 /// Initializes logging for the ontoenv library.
 ///
 /// This function checks for the `ONTOENV_LOG` environment variable. If it is set,
@@ -90,12 +100,100 @@ pub struct Stats {
     pub num_ontologies: usize,
 }

+#[derive(Debug, Clone)]
+pub enum ImportPaths {
+    Present(Vec<Vec<GraphIdentifier>>),
+    Missing {
+        importers: Vec<Vec<GraphIdentifier>>,
+    },
+}
+
+#[derive(Default)]
+struct BatchState {
+    depth: usize,
+    seen_locations: HashSet<OntologyLocation>,
+}
+
+impl BatchState {
+    fn begin(&mut self) {
+        if self.depth == 0 {
+            self.seen_locations.clear();
+        }
+        self.depth += 1;
+    }
+
+    fn end(&mut self) {
+        self.depth = self.depth.saturating_sub(1);
+    }
+
+    fn has_seen(&self, location: &OntologyLocation) -> bool {
+        self.seen_locations.contains(location)
+    }
+
+    fn mark_seen(&mut self, location: &OntologyLocation) {
+        self.seen_locations.insert(location.clone());
+    }
+}
+
+struct BatchScope<'a> {
+    env: &'a mut OntoEnv,
+    completed: bool,
+}
+
+impl<'a> BatchScope<'a> {
+    fn enter(env: &'a mut OntoEnv) -> Result<Self> {
+        env.batch_state.begin();
+        if let Err(err) = env.io.begin_batch() {
+            env.batch_state.end();
+            return Err(err);
+        }
+        Ok(Self {
+            env,
+            completed: false,
+        })
+    }
+
+    fn run<T>(mut self, f: impl FnOnce(&mut OntoEnv) -> Result<T>) -> Result<T> {
+        let result = f(self.env);
+        let end_result = self.env.io.end_batch();
+        self.env.batch_state.end();
+        self.completed = true;
+        match (result, end_result) {
+            (Ok(value), Ok(())) => Ok(value),
+            (Ok(_), Err(err)) => Err(err),
+            (Err(err), Ok(())) => Err(err),
+            (Err(err), Err(end_err)) => {
+                error!("Failed to finalize batched RDF write: {end_err}");
+                Err(err)
+            }
+        }
+    }
+}
+
+impl<'a> Drop for BatchScope<'a> {
+    fn drop(&mut self) {
+        if self.completed {
+            return;
+        }
+        if let Err(err) = self.env.io.end_batch() {
+            error!("Failed to finalize batched RDF write: {err}");
+        }
+        self.env.batch_state.end();
+    }
+}
+
+enum FetchOutcome {
+    Reused(GraphIdentifier),
+    Loaded(Ontology),
+}
+
 pub struct OntoEnv {
     env: Environment,
     io: Box<dyn GraphIO>,
     dependency_graph: DiGraph<GraphIdentifier, ()>,
     config: Config,
     failed_resolutions: HashSet<NamedNode>,
+    batch_state: BatchState,
 }

 impl std::fmt::Debug for OntoEnv {
@@ -112,6 +210,7 @@ impl OntoEnv {
 }

 impl OntoEnv {
+    // Constructors
     fn new(env: Environment, io: Box<dyn GraphIO>, config: Config) -> Self {
         Self {
             env,
             io,
             config,
             dependency_graph: DiGraph::new(),
             failed_resolutions: HashSet::new(),
+            batch_state: BatchState::default(),
         }
     }

@@ -218,9 +318,7 @@ impl OntoEnv {
     }

     pub fn new_from_store(strict: bool, offline: bool, store: Store) -> Result<Self> {
-        let io = Box::new(crate::io::ExternalStoreGraphIO::new(
-            store, offline, strict,
-        ));
+        let io = Box::new(crate::io::ExternalStoreGraphIO::new(store, offline, strict));
         let root = std::env::current_dir()?;
         let config = Config::builder()
             .root(root)
@@ -236,10 +334,6 @@ impl OntoEnv {
         Ok(ontoenv)
     }

-    pub fn io(&self) -> &Box<dyn GraphIO> {
-        &self.io
-    }
-
     /// returns the graph identifier for the given resolve target, if it exists
     pub fn resolve(&self, target: ResolveTarget) -> Option<GraphIdentifier> {
         match target {
@@ -254,15 +348,6 @@ impl OntoEnv {
         }
     }

-    pub fn stats(&self) -> Result<Stats> {
-        let store_stats = self.io.size()?;
-        Ok(Stats {
-            num_triples: store_stats.num_triples,
-            num_graphs: store_stats.num_graphs,
-            num_ontologies: self.env.ontologies().len(),
-        })
-    }
-
     /// Saves the current environment to the .ontoenv directory.
     pub fn save_to_directory(&self) -> Result<()> {
         if self.config.temporary {
@@ -292,15 +377,6 @@ impl OntoEnv {
         Ok(())
     }

-    pub fn flush(&mut self) -> Result<()> {
-        self.io.flush()
-    }
-
-    /// Backwards-compatibility: update only changed/added files (same as update_all(false))
-    pub fn update(&mut self) -> Result<Vec<GraphIdentifier>> {
-        self.update_all(false)
-    }
-
     pub fn new_temporary(&self) -> Result<Self> {
         let io: Box<dyn GraphIO> = Box::new(crate::io::MemoryGraphIO::new(
             self.config.offline,
@@ -343,6 +419,7 @@ impl OntoEnv {
             locations.insert(ontology.location().unwrap().clone(), ontology.id().clone());
         }
         env.locations = locations;
+        env.rebuild_aliases();

         // Initialize the IO to the persistent graph type. We know that it exists because we
         // are loading from a directory
@@ -361,8 +438,10 @@ impl OntoEnv {
         // copy the graphs from the persistent store to the memory store if we are a 'temporary'
         // environment
         if config.temporary {
-            let mut new_io =
-                Box::new(crate::io::MemoryGraphIO::new(config.offline, config.strict)?);
+            let mut new_io = Box::new(crate::io::MemoryGraphIO::new(
+                config.offline,
+                config.strict,
+            )?);
             for ontology in env.ontologies().values() {
                 let graph = io.get_graph(ontology.id())?;
                 new_io.add_graph(ontology.id().clone(), graph)?;
@@ -376,9 +455,40 @@ impl OntoEnv {
             config,
             dependency_graph,
             failed_resolutions: HashSet::new(),
+            batch_state: BatchState::default(),
         })
     }

+    // Core API methods
+    pub fn flush(&mut self) -> Result<()> {
+        self.io.flush()
+    }
+
+    fn with_io_batch<T, F>(&mut self, f: F) -> Result<T>
+    where
+        F: FnOnce(&mut Self) -> Result<T>,
+    {
+        BatchScope::enter(self)?.run(f)
+    }
+
+    pub fn io(&self) -> &Box<dyn GraphIO> {
+        &self.io
+    }
+
+    pub fn stats(&self) -> Result<Stats> {
+        let store_stats = self.io.size()?;
+        Ok(Stats {
+            num_triples: store_stats.num_triples,
+            num_graphs: store_stats.num_graphs,
+            num_ontologies: self.env.ontologies().len(),
+        })
+    }
+
+    /// Backwards-compatibility: update only changed/added files (same as update_all(false))
+    pub fn update(&mut self) -> Result<Vec<GraphIdentifier>> {
+        self.update_all(false)
+    }
+
     /// Calculates and returns the environment status
     pub fn status(&self) -> Result<EnvironmentStatus> {
         // get time modified of the self.store_path() directory
@@ -433,18 +543,19 @@ impl OntoEnv {
         metadata
     }

-    /// Initializes a new API environment. If the environment directory already exists:
-    /// - If `overwrite` is true, it will remove the existing directory and recreate it.
-    /// - If `overwrite` is false, it will return an error.
-    /// Returns a `Result` indicating success or failure
+    /// Initializes a new API environment based on `config`.
+    ///
+    /// For persistent environments (`config.temporary == false`), if the target `.ontoenv`
+    /// directory already exists this will remove and recreate it when `overwrite` is `true`,
+    /// otherwise it returns an error. Temporary environments never touch the filesystem, so
+    /// the `overwrite` flag is ignored. An initial discovery run is performed before the
+    /// environment is returned.
     pub fn init(config: Config, overwrite: bool) -> Result<Self> {
         let ontoenv_dir = config.root.join(".ontoenv");

         if !config.temporary && ontoenv_dir.exists() {
             if overwrite {
-                info!(
-                    "Directory exists and will be overwritten: {ontoenv_dir:?}"
-                );
+                info!("Directory exists and will be overwritten: {ontoenv_dir:?}");
                 fs::remove_dir_all(&ontoenv_dir)?;
             } else {
                 return Err(anyhow::anyhow!(
@@ -460,7 +571,10 @@ impl OntoEnv {
         let env = Environment::new();
         let io: Box<dyn GraphIO> = match config.temporary {
-            true => Box::new(crate::io::MemoryGraphIO::new(config.offline, config.strict)?),
+            true => Box::new(crate::io::MemoryGraphIO::new(
+                config.offline,
+                config.strict,
+            )?),
             false => Box::new(crate::io::PersistentGraphIO::new(
                 ontoenv_dir,
                 config.offline,
@@ -474,6 +588,7 @@ impl OntoEnv {
             dependency_graph: DiGraph::new(),
             config,
             failed_resolutions: HashSet::new(),
+            batch_state: BatchState::default(),
         };

         let _ = ontoenv.update_all(false)?;
@@ -495,116 +610,192 @@ impl OntoEnv {

     /// Add the ontology from the given location to the environment,
     /// then add it to the dependency graph.
+    ///
+    /// * `overwrite` selects whether an existing graph at the same identifier should be replaced.
+    /// * `refresh` controls whether cached metadata may be reused (`RefreshStrategy::UseCache`) or
+    ///   the source should always be fetched (`RefreshStrategy::Force`).
     pub fn add(
         &mut self,
         location: OntologyLocation,
-        overwrite: bool,
+        overwrite: Overwrite,
+        refresh: RefreshStrategy,
     ) -> Result<GraphIdentifier> {
-        self.failed_resolutions.clear();
-        let ont = self.io.add(location, overwrite)?;
-        let id = ont.id().clone();
-        self.env.add_ontology(ont);
-        self.add_ids_to_dependency_graph(vec![id.clone()])?;
-        self.save_to_directory()?;
-        Ok(id)
+        self.add_with_options(location, overwrite, refresh, true)
     }

     /// Add the ontology from the given location to the environment, but do not
     /// explore its owl:imports. It will be added to the dependency graph and
     /// edges will be created if its imports are already present in the environment.
+    /// Parameters mirror [`OntoEnv::add`] for overwrite and refresh behavior.
     pub fn add_no_imports(
         &mut self,
         location: OntologyLocation,
-        overwrite: bool,
+        overwrite: Overwrite,
+        refresh: RefreshStrategy,
     ) -> Result<GraphIdentifier> {
-        self.failed_resolutions.clear();
-        let ont = self.io.add(location, overwrite)?;
-        let id = ont.id().clone();
-        self.env.add_ontology(ont);
-        self.add_ids_to_dependency_graph(vec![])?;
+        self.add_with_options(location, overwrite, refresh, false)
+    }
+
+    fn add_with_options(
+        &mut self,
+        location: OntologyLocation,
+        overwrite: Overwrite,
+        refresh: RefreshStrategy,
+        update_dependencies: bool,
+    ) -> Result<GraphIdentifier> {
+        self.with_io_batch(move |env| {
+            env.add_with_options_inner(location, overwrite, refresh, update_dependencies)
+        })
+    }
+
+    fn fetch_location(
+        &mut self,
+        location: OntologyLocation,
+        overwrite: Overwrite,
+        refresh: RefreshStrategy,
+    ) -> Result<FetchOutcome> {
+        if let Some(existing_id) = self.try_reuse_cached(&location, refresh)? {
+            self.batch_state.mark_seen(&location);
+            return Ok(FetchOutcome::Reused(existing_id));
+        }
+
+        if !refresh.is_force() && self.batch_state.has_seen(&location) {
+            if let Some(existing) = self.env.get_ontology_by_location(&location) {
+                return Ok(FetchOutcome::Reused(existing.id().clone()));
+            }
+        }
+
+        let ontology = self.io.add(location.clone(), overwrite)?;
+        self.batch_state.mark_seen(&location);
+        Ok(FetchOutcome::Loaded(ontology))
+    }
+
+    fn register_ontologies(
+        &mut self,
+        ontologies: Vec<Ontology>,
+        update_dependencies: bool,
+    ) -> Result<Vec<GraphIdentifier>> {
+        let mut ids = Vec::with_capacity(ontologies.len());
+        for ontology in ontologies {
+            let id = ontology.id().clone();
+            self.env.add_ontology(ontology)?;
+            ids.push(id);
+        }
+
+        if update_dependencies && !ids.is_empty() {
+            self.add_ids_to_dependency_graph(ids.clone())?;
+        }

         self.save_to_directory()?;
-        Ok(id)
+        Ok(ids)
     }

-    /// Load all graphs from the search directories. There are several things that can happen:
-    ///
-    /// 1. files have been added from the search directories
-    /// 2. files have been removed from the search directories
-    /// 3. files have been updated in the search directories
-    ///
-    /// OntoEnv tries to do the least amount of work possible.
-    ///
-    /// First, it removes all ontologies which no longer appear in the search directories; it uses
-    /// its internal index of ontologies to do this search.
-    ///
-    /// Next, it determines what new files have been added to the search directories. These are
-    /// files whose locations do not appear in the internal ontology index. It also finds the files
-    /// in the internal ontology index that have been updated. It does this by comparing the last
-    /// updated time of the file with the last updated time of the ontology in the index.
-    ///
-    /// Then, it reads all the new and updated files and adds them to the environment.
-    ///
-    /// Finally, it updates the dependency graph for all the updated ontologies.
-    pub fn update_all(&mut self, all: bool) -> Result<Vec<GraphIdentifier>> {
+    fn add_with_options_inner(
+        &mut self,
+        location: OntologyLocation,
+        overwrite: Overwrite,
+        refresh: RefreshStrategy,
+        update_dependencies: bool,
+    ) -> Result<GraphIdentifier> {
         self.failed_resolutions.clear();
-        // remove ontologies which are no longer present in the search directories
-        for graphid in self.missing_ontologies() {
-            self.io.remove(&graphid)?;
-            self.env.remove_ontology(&graphid);
+        let seeds = vec![(location.clone(), overwrite)];
+        let (ontologies, reused_ids, errors) =
+            self.process_import_queue(seeds, refresh, update_dependencies)?;
+        let mut ids = self.register_ontologies(ontologies, update_dependencies)?;
+        ids.extend(reused_ids);
+
+        if let Some(existing) = self.env.get_ontology_by_location(&location) {
+            return Ok(existing.id().clone());
         }

-        // now, find all the new and updated ontologies in the search directories
-        // and add them to the environment
-        let updated_files: Vec<OntologyLocation> = if all {
-            let mut set: HashSet<OntologyLocation> = self
-                .env
-                .ontologies()
-                .values()
-                .filter_map(|o| o.location().cloned())
-                .collect();
-            for loc in self.find_files()? {
-                set.insert(loc);
+        ids.into_iter().next().ok_or_else(|| {
+            let mut base = format!(
+                "Failed to add ontology for location {}",
+                location.to_string()
+            );
+            if !errors.is_empty() {
+                base.push_str(": ");
+                base.push_str(&errors.join("; "));
             }
-            set.into_iter().collect()
-        } else {
-            self.get_updated_locations()?
+            anyhow!(base)
+        })
+    }
+
+    fn try_reuse_cached(
+        &self,
+        location: &OntologyLocation,
+        refresh: RefreshStrategy,
+    ) -> Result<Option<GraphIdentifier>> {
+        if !self.config.use_cached_ontologies.is_enabled() {
+            return Ok(None);
+        }
+        let existing = match self.env.get_ontology_by_location(location) {
+            Some(ontology) => ontology,
+            None => return Ok(None),
         };
-        // load all of these files into the environment
-        let mut ontologies: Vec<Ontology> = vec![];
-        for location in updated_files {
-            // if 'strict' mode then fail on any errors when adding the ontology
-            // otherwise just warn
+        let existing_id = existing.id().clone();

-            let result = self.io.add(location.clone(), true);
-            if result.is_err() {
-                if self.config.strict {
-                    return Err(result.unwrap_err());
-                } else {
+        if refresh.is_force() {
+            return Ok(None);
+        }
+
+        if location.is_file() {
+            let last_updated = match existing.last_updated {
+                Some(ts) => ts,
+                None => return Ok(None),
+            };
+
+            match self.io.source_last_modified(existing.id()) {
+                Ok(source_modified) => {
+                    if source_modified <= last_updated {
+                        return Ok(Some(existing_id));
+                    }
+                }
+                Err(err) => {
                     warn!(
-                        "Failed to read ontology file {}: {}",
-                        location,
-                        result.unwrap_err()
+                        "Failed to determine modification time for {} ({}); using cached version",
+                        existing_id, err
                     );
-                    continue;
+                    return Ok(Some(existing_id));
                 }
             }
-            let new_ont = result.unwrap();
-            ontologies.push(new_ont);
+            Ok(None)
+        } else {
+            // For URLs, reuse the cached ontology unless the caller forces a refresh
+            Ok(Some(existing_id))
         }
+    }

-        let mut update_ids: Vec<GraphIdentifier> = Vec::new();
-        // add the ontologies to the environment
-        for ontology in ontologies {
-            let id = ontology.id().clone();
-            self.env.add_ontology(ontology);
-            update_ids.push(id);
-        }
-        self.add_ids_to_dependency_graph(update_ids.clone())?;
-        self.save_to_directory()?;
-        Ok(update_ids)
+    /// Loads or refreshes graphs discovered in the configured search directories.
+    ///
+    /// When `all` is `false`, only new or modified ontology sources are reparsed. When `all`
+    /// is `true`, every known ontology location is reprocessed regardless of timestamps,
+    /// allowing callers to force a fresh ingest of all content.
+    ///
+    /// The workflow removes ontologies whose sources disappeared, detects additions and
+    /// updates by comparing on-disk content with the stored copy, ingests changed files, and
+    /// finally refreshes the dependency graph for the affected ontologies.
+    pub fn update_all(&mut self, all: bool) -> Result<Vec<GraphIdentifier>> {
+        self.with_io_batch(move |env| env.update_all_inner(all))
+    }
+
+    fn update_all_inner(&mut self, all: bool) -> Result<Vec<GraphIdentifier>> {
+        self.failed_resolutions.clear();
+        self.remove_missing_ontologies()?;
+
+        let updated_files = self.collect_updated_files(all)?;
+        let seeds: Vec<(OntologyLocation, Overwrite)> = updated_files
+            .into_iter()
+            .map(|loc| (loc, Overwrite::Allow))
+            .collect();
+        let (ontologies, reused_ids, _errors) =
+            self.process_import_queue(seeds, RefreshStrategy::UseCache, true)?;
+
+        let mut ids = self.register_ontologies(ontologies, true)?;
+        ids.extend(reused_ids);
+        Ok(ids)
     }

     /// Returns a list of all ontologies from the environment which have been updated.
@@ -668,6 +859,150 @@ impl OntoEnv {
             .collect()
     }

+    fn remove_missing_ontologies(&mut self) -> Result<()> {
+        for graphid in self.missing_ontologies() {
+            self.io.remove(&graphid)?;
+            self.env.remove_ontology(&graphid)?;
+        }
+        Ok(())
+    }
+
+    fn collect_updated_files(&mut self, all: bool) -> Result<Vec<OntologyLocation>> {
+        if all {
+            let mut set: HashSet<OntologyLocation> = self
+                .env
+                .ontologies()
+                .values()
+                .filter_map(|o| o.location().cloned())
+                .collect();
+            for loc in self.find_files()? {
+                set.insert(loc);
+            }
+            Ok(set.into_iter().collect())
+        } else {
+            self.get_updated_locations()
+        }
+    }
+
+    fn process_import_queue(
+        &mut self,
+        seeds: Vec<(OntologyLocation, Overwrite)>,
+        refresh: RefreshStrategy,
+        include_imports: bool,
+    ) -> Result<(Vec<Ontology>, Vec<GraphIdentifier>, Vec<String>)> {
+        let strict = self.config.strict;
+        let mut queue: VecDeque<PendingImport> = seeds
+            .into_iter()
+            .map(|(location, overwrite)| PendingImport {
+                location,
+                overwrite,
+                required: strict,
+            })
+            .collect();
+        let mut seen: HashSet<OntologyLocation> = HashSet::new();
+        let mut fetched: Vec<Ontology> = Vec::new();
+        let mut touched_ids: Vec<GraphIdentifier> = Vec::new();
+        let mut touched_set: HashSet<GraphIdentifier> = HashSet::new();
+        let mut errors: Vec<String> = Vec::new();
+
+        let mut record_id = |id: &GraphIdentifier| {
+            if touched_set.insert(id.clone()) {
+                touched_ids.push(id.clone());
+            }
+        };
+
+        while let Some(job) = queue.pop_front() {
+            if !seen.insert(job.location.clone()) {
+                continue;
+            }
+
+            match self.fetch_location(job.location.clone(), job.overwrite, refresh) {
+                Ok(FetchOutcome::Loaded(ontology)) => {
+                    let imports = ontology.imports.clone();
+                    let id = ontology.id().clone();
+                    if include_imports {
+                        for import in imports {
+                            self.queue_import_location(&import, &mut queue, self.config.strict)?;
+                        }
+                    }
+                    fetched.push(ontology);
+                    record_id(&id);
+                }
+                Ok(FetchOutcome::Reused(id)) => {
+                    record_id(&id);
+                    if include_imports {
+                        if let Ok(existing) = self.get_ontology(&id) {
+                            for import in existing.imports {
+                                self.queue_import_location(
+                                    &import,
+                                    &mut queue,
+                                    self.config.strict,
+                                )?;
+                            }
+                        }
+                    }
+                }
+                Err(err) => {
+                    let err_str = err.to_string();
+                    let enriched = format!("Failed to load ontology {}: {}", job.location, err_str);
+                    if job.required {
+                        return Err(anyhow!(enriched));
+                    }
+                    warn!("{}", enriched);
+                    errors.push(enriched);
+                    if let OntologyLocation::Url(url) = &job.location {
+                        if let Ok(node) = NamedNode::new(url.clone()) {
+                            self.failed_resolutions.insert(node);
+                        }
+                    }
+                }
+            }
+        }
+
+        Ok((fetched, touched_ids, errors))
+    }
+
+    fn queue_import_location(
+        &mut self,
+        import: &NamedNode,
+        queue: &mut VecDeque<PendingImport>,
+        strict: bool,
+    ) -> Result<()> {
+        let iri = import.as_str();
+        let is_fetchable =
+            iri.starts_with("http://") || iri.starts_with("https://") || iri.starts_with("file://");
+        if !is_fetchable {
+            return Ok(());
+        }
+
+        if let Some(existing) = self.env.get_ontology_by_name(import.into()) {
+            if let Some(loc) = existing.location() {
+                queue.push_back(PendingImport {
+                    location: loc.clone(),
+                    overwrite: Overwrite::Preserve,
+                    required: strict,
+                });
+                return Ok(());
+            }
+        }
+
+        match OntologyLocation::from_str(iri) {
+            Ok(loc) => queue.push_back(PendingImport {
+                location: loc,
+                overwrite: Overwrite::Preserve,
+                required: strict,
+            }),
+            Err(err) => {
+                self.failed_resolutions.insert(import.clone());
+                if strict {
+                    return Err(err);
+                }
+                warn!("Failed to resolve location for import {}: {}", import, err);
+            }
+        }
+        Ok(())
+    }
+
     /// Returns a list of all files in the environment which have been updated (added or changed)
     /// Does not return files that have been removed
     pub fn get_updated_locations(&self) -> Result<Vec<OntologyLocation>> {
@@ -769,7 +1104,7 @@ impl OntoEnv {
         let mut seen: HashSet<GraphIdentifier> = HashSet::new();

         while let Some(graphid) = stack.pop_front() {
-            info!("Building dependency graph for: {graphid:?}");
+            debug!("Building dependency graph for: {graphid:?}");
             if seen.contains(&graphid) {
                 continue;
             }
@@ -821,10 +1156,10 @@ impl OntoEnv {
                 }
             };

-            match self.io.add(location, false) {
+            match self.io.add(location, Overwrite::Preserve) {
                 Ok(new_ont) => {
                     let id = new_ont.id().clone();
-                    self.env.add_ontology(new_ont);
+                    self.env.add_ontology(new_ont)?;
                     stack.push_back(id);
                 }
                 Err(e) => {
@@ -850,7 +1185,10 @@ impl OntoEnv {
         // traverse the ontologies and add edges to the graph
         for ontology in self.env.ontologies().keys() {
             let index = indexes.get(ontology).ok_or_else(|| {
-                anyhow!("Programming error: ontology id {:?} not in index map", ontology)
+                anyhow!(
+                    "Programming error: ontology id {:?} not in index map",
+                    ontology
+                )
             })?;
             let ont = match self.env.ontologies().get(ontology) {
                 Some(ont) => ont,
@@ -893,7 +1231,12 @@ impl OntoEnv {
         doctor.run(self)
     }

-    /// Returns the names of all graphs within the dependency closure of the provided graph
+    /// Returns the dependency closure for the provided graph identifier.
+    ///
+    /// The returned vector contains `GraphIdentifier`s, with the requested identifier inserted
+    /// at the front followed by its resolved imports. If `recursion_depth` is non-negative,
+    /// traversal stops once that depth is reached. In strict mode an unresolved import results
+    /// in an error; otherwise the missing import is logged and skipped.
     pub fn get_closure(
         &self,
         id: &GraphIdentifier,
@@ -914,9 +1257,10 @@ impl OntoEnv {
                 continue;
             }

-            let ontology = self.ontologies().get(&graph).ok_or_else(|| {
-                anyhow!("Ontology {} not found", graph.to_uri_string())
-            })?;
+            let ontology = self
+                .ontologies()
+                .get(&graph)
+                .ok_or_else(|| anyhow!("Ontology {} not found", graph.to_uri_string()))?;
             for import in &ontology.imports {
                 // get graph identifier for import
                 let import = match self.env.get_ontology_by_name(import.into()) {
@@ -960,7 +1304,7 @@ impl OntoEnv {
         let first_id = graph_ids
             .first()
             .ok_or_else(|| anyhow!("No graphs found"))?;
-        let root_ontology: SubjectRef = SubjectRef::NamedNode(first_id.name());
+        let root_ontology = NamedOrBlankNodeRef::NamedNode(first_id.name());
         let mut namespace_map = HashMap::new();

         for graph_id in &graph_ids {
@@ -976,7 +1320,7 @@ impl OntoEnv {
         // Rewrite sh:prefixes
         // defaults to true if not specified
         if rewrite_sh_prefixes.unwrap_or(true) {
-            transform::rewrite_sh_prefixes(&mut dataset, root_ontology);
+            transform::rewrite_sh_prefixes_dataset(&mut dataset, root_ontology);
         }
         // remove owl:imports
         if remove_owl_imports.unwrap_or(true) {
@@ -1026,6 +1370,107 @@ impl OntoEnv {
         Ok(importers)
     }

+    /// Returns all importer paths that terminate at the given ontology.
+    /// Each path is ordered from the most distant importer down to `id`.
+    pub fn get_import_paths(&self, id: &NamedNode) -> Result<Vec<Vec<GraphIdentifier>>> {
+        match self.explain_import(id)? {
+            ImportPaths::Present(paths) => Ok(paths),
+            ImportPaths::Missing { .. } => Err(anyhow!("Ontology not found")),
+        }
+    }
+
+    pub fn explain_import(&self, id: &NamedNode) -> Result<ImportPaths> {
+        if let Some(target) = self.env.get_ontology_by_name(id.into()) {
+            let idx = self
+                .dependency_graph
+                .node_indices()
+                .find(|i| self.dependency_graph[*i] == *target.id())
+                .ok_or_else(|| anyhow!("Node not found"))?;
+            return Ok(ImportPaths::Present(
+                self.collect_import_paths_from_index(idx),
+            ));
+        }
+
+        let mut importers = Vec::new();
+        for ontology in self.env.ontologies().values() {
+            if ontology.imports.iter().any(|imp| imp == id) {
+                importers.push(ontology.id().clone());
+            }
+        }
+
+        if importers.is_empty() {
+            return Ok(ImportPaths::Missing {
+                importers: Vec::new(),
+            });
+        }
+
+        let mut paths: Vec<Vec<GraphIdentifier>> = Vec::new();
+        for importer in importers {
+            let maybe_idx = self
+                .dependency_graph
+                .node_indices()
+                .find(|i| self.dependency_graph[*i] == importer);
+            if let Some(idx) = maybe_idx {
+                let mut importer_paths = self.collect_import_paths_from_index(idx);
+                paths.append(&mut importer_paths);
+            } else {
+                paths.push(vec![importer.clone()]);
+            }
+        }
+
+        Ok(ImportPaths::Missing { importers: paths })
+    }
+
+    fn collect_import_paths_from_index(
+        &self,
+        target_idx: petgraph::graph::NodeIndex,
+    ) -> Vec<Vec<GraphIdentifier>> {
+        let mut results: Vec<Vec<GraphIdentifier>> = Vec::new();
+        let mut path: Vec<GraphIdentifier> = Vec::new();
+        let mut seen: std::collections::HashSet<GraphIdentifier> = std::collections::HashSet::new();
+
+        fn dfs(
+            g: &petgraph::Graph<GraphIdentifier, ()>,
+            idx: petgraph::graph::NodeIndex,
+            path: &mut Vec<GraphIdentifier>,
+            seen: &mut std::collections::HashSet<GraphIdentifier>,
+            results: &mut Vec<Vec<GraphIdentifier>>,
+        ) {
+            let current = g[idx].clone();
+            if !seen.insert(current.clone()) {
+                return;
+            }
+            path.push(current.clone());
+
+            let mut incoming = g
+                .neighbors_directed(idx, petgraph::Direction::Incoming)
+                .detach();
+
+            let mut has_incoming = false;
+            while let Some((_, src)) = incoming.next(g) {
+                has_incoming = true;
+                dfs(g, src, path, seen, results);
+            }
+            if !has_incoming {
+                let mut p = path.clone();
+                p.reverse();
+                results.push(p);
+            }
+
+            path.pop();
+            seen.remove(&current);
+        }
+
+        dfs(
+            &self.dependency_graph,
+            target_idx,
+            &mut path,
+            &mut seen,
+            &mut results,
+        );
+        results
+    }
+
     /// Returns the GraphViz dot representation of the dependency graph
     pub fn dep_graph_to_dot(&self) -> Result<String> {
         self.rooted_dep_graph_to_dot(self.ontologies().keys().cloned().collect())
@@ -1049,17 +1494,12 @@ impl OntoEnv {
             let ont = self
                 .ontologies()
                 .get(&ontology)
-                .ok_or_else(|| {
-                    anyhow!(
-                        "Listing ontologies: Ontology {} not found",
-                        ontology
-                    )
-                })?;
+                .ok_or_else(|| anyhow!("Listing ontologies: Ontology {} not found", ontology))?;
             for import in &ont.imports {
                 let import = match self.env.get_ontology_by_name(import.into()) {
                     Some(imp) => imp.id().clone(),
                     None => {
-                        error!("Import not found: {import}");
+                        warn!("Import not found: {import}");
                         continue;
                     }
                 };
@@ -1106,7 +1546,7 @@ impl OntoEnv {
             let g = match self.io.get_graph(ontology.id()) {
                 Ok(g) => g,
                 Err(e) => {
-                    error!("Could not get graph for {}: {e}", ontology.id());
+                    warn!("Could not get graph for {}: {e}", ontology.id());
                     continue;
                 }
             };
diff --git a/lib/src/config.rs b/lib/src/config.rs
index 6861644..9a23dc4 100644
--- a/lib/src/config.rs
+++ b/lib/src/config.rs
@@ -1,6 +1,7 @@
 //! Defines the configuration structures for the OntoEnv environment.
 //! This includes the main `Config` struct and related structs for ontology locations and environment setup.
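+//!
+//! Sketch of opting in to the new cache flag from a hypothetical caller (not part of
+//! this diff; `CacheMode::from(bool)` mirrors the serde helper defined below):
+//! ```no_run
+//! use ontoenv::config::Config;
+//! use ontoenv::options::CacheMode;
+//!
+//! fn example() -> anyhow::Result<()> {
+//!     // Enable reuse of cached ontologies; the builder default leaves it disabled.
+//!     let config = Config::builder()
+//!         .root(std::env::current_dir()?)
+//!         .use_cached_ontologies(CacheMode::from(true))
+//!         .build()?;
+//!     config.print();
+//!     Ok(())
+//! }
+//! ```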
+use crate::options::CacheMode;
 use crate::policy::{DefaultPolicy, ResolutionPolicy};
 use anyhow::Result;
 use glob::{Pattern, PatternError};
@@ -8,6 +9,8 @@ use serde::{Deserialize, Serialize};
 use std::io::{BufReader, Write};
 use std::path::{Path, PathBuf};

+const DEFAULT_INCLUDE_PATTERNS: &[&str] = &["*.ttl", "*.xml", "*.n3"];

 fn vec_pattern_ser<S>(patterns: &Vec<Pattern>, serializer: S) -> Result<S::Ok, S::Error>
 where
     S: serde::Serializer,
{
@@ -28,6 +31,21 @@ where
     patterns.map_err(serde::de::Error::custom)
 }

+fn cache_mode_ser<S>(mode: &CacheMode, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    serializer.serialize_bool(mode.is_enabled())
+}
+
+fn cache_mode_de<'de, D>(deserializer: D) -> Result<CacheMode, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    let value = bool::deserialize(deserializer)?;
+    Ok(CacheMode::from(value))
+}
+
 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
 pub struct Config {
     pub root: PathBuf,
@@ -53,6 +71,12 @@ pub struct Config {
     pub offline: bool,
     // resolution policy
     pub resolution_policy: String,
+    #[serde(
+        default,
+        serialize_with = "cache_mode_ser",
+        deserialize_with = "cache_mode_de"
+    )]
+    pub use_cached_ontologies: CacheMode,
     // if true, do not store the ontoenv store on disk
     pub temporary: bool,
     // if true, do not search for ontologies in the search directories
@@ -135,6 +159,10 @@ impl Config {
         println!("  Require Ontology Names: {}", self.require_ontology_names);
         println!("  Strict: {}", self.strict);
         println!("  Offline: {}", self.offline);
+        println!(
+            "  Use Cached Ontologies: {}",
+            self.use_cached_ontologies.is_enabled()
+        );
         println!("  Resolution Policy: {}", self.resolution_policy);
         println!("  Temporary: {}", self.temporary);
         println!("  No Search: {}", self.no_search);
@@ -153,6 +181,7 @@ pub struct ConfigBuilder {
     resolution_policy: Option<String>,
     no_search: bool,
     temporary: Option<bool>,
+    use_cached_ontologies: Option<CacheMode>,
 }

 impl ConfigBuilder {
@@ -169,6 +198,7 @@ impl ConfigBuilder {
             resolution_policy: None,
             no_search: false,
             temporary: None,
+            use_cached_ontologies: None,
         }
     }

@@ -234,6 +264,12 @@ impl ConfigBuilder {
         self
     }

+    /// Sets whether to reuse cached ontologies when possible. Defaults to disabled.
+    pub fn use_cached_ontologies(mut self, mode: CacheMode) -> Self {
+        self.use_cached_ontologies = Some(mode);
+        self
+    }
+
     /// Sets the resolution policy. Defaults to `"default"`.
     pub fn resolution_policy(mut self, policy: String) -> Self {
         self.resolution_policy = Some(policy);
@@ -271,11 +307,10 @@ impl ConfigBuilder {
         };

         let includes_str = self.includes.unwrap_or_else(|| {
-            vec![
-                "*.ttl".to_string(),
-                "*.xml".to_string(),
-                "*.n3".to_string(),
-            ]
+            DEFAULT_INCLUDE_PATTERNS
+                .iter()
+                .map(|s| s.to_string())
+                .collect()
         });

         let excludes_str = self.excludes.unwrap_or_default();
@@ -300,6 +335,7 @@ impl ConfigBuilder {
             resolution_policy: self
                 .resolution_policy
                 .unwrap_or_else(|| DefaultPolicy.policy_name().to_string()),
+            use_cached_ontologies: self.use_cached_ontologies.unwrap_or_default(),
             temporary: self.temporary.unwrap_or(false),
             no_search: self.no_search,
         })
diff --git a/lib/src/environment.rs b/lib/src/environment.rs
index 8442c43..86d7960 100644
--- a/lib/src/environment.rs
+++ b/lib/src/environment.rs
@@ -4,9 +4,10 @@
 use crate::io::GraphIO;
 use crate::ontology::{GraphIdentifier, Ontology, OntologyLocation};
 use crate::policy;
-use anyhow::Result;
+use anyhow::{anyhow, Result};
 use chrono::prelude::*;
-use oxigraph::model::{Graph, NamedNodeRef};
+use log::warn;
+use oxigraph::model::{Graph, NamedNode, NamedNodeRef};
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;

@@ -34,8 +35,8 @@ where
     Ok(map)
 }

-/// A struct that holds the ontology environment: all the mappings
-/// between ontology names and their respective graph identifiers and locations.
+/// Represents the loaded ontology environment, including ontologies, their source
+/// locations, normalized aliases, and the default resolution policy.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Environment {
     #[serde(serialize_with = "ontologies_ser", deserialize_with = "ontologies_de")]
     ontologies: HashMap<GraphIdentifier, Ontology>,
@@ -47,6 +48,8 @@ pub struct Environment {
     default_policy: Box<dyn policy::ResolutionPolicy>,
     #[serde(skip)]
     pub locations: HashMap<OntologyLocation, GraphIdentifier>,
+    #[serde(default)]
+    aliases: HashMap<String, GraphIdentifier>,
 }

 impl Clone for Environment {
@@ -54,6 +57,7 @@ impl Clone for Environment {
         Self {
             ontologies: self.ontologies.clone(),
             locations: self.locations.clone(),
+            aliases: self.aliases.clone(),
             default_policy: policy::policy_from_name(self.default_policy.policy_name())
                 .expect("Failed to clone policy"),
         }
@@ -67,11 +71,17 @@ impl Default for Environment {
     }
 }

 impl Environment {
+    fn normalize_name(s: &str) -> &str {
+        let trimmed_hash = s.trim_end_matches('#');
+        trimmed_hash.trim_end_matches('/')
+    }
+
     pub fn new() -> Self {
         Self {
             ontologies: HashMap::new(),
             default_policy: Box::new(policy::DefaultPolicy),
             locations: HashMap::new(),
+            aliases: HashMap::new(),
         }
     }

@@ -79,17 +89,30 @@ impl Environment {
         &self.ontologies
     }

-    pub fn add_ontology(&mut self, mut ontology: Ontology) {
+    pub fn add_ontology(&mut self, mut ontology: Ontology) -> Result<()> {
         ontology.last_updated = Some(Utc::now());
-        self.locations
-            .insert(ontology.location().unwrap().clone(), ontology.id().clone());
-        self.ontologies.insert(ontology.id().clone(), ontology);
+        let location = ontology
+            .location()
+            .cloned()
+            .ok_or_else(|| anyhow!("Cannot add ontology {} without a location", ontology.id()))?;
+        let ontology_id = ontology.id().clone();
+        let ontology_name = ontology.name();
+        self.locations.insert(location.clone(), ontology_id.clone());
+        self.register_alias(&location, &ontology_id, &ontology_name);
+        self.ontologies.insert(ontology_id, ontology);
+        Ok(())
     }

-    pub fn remove_ontology(&mut self, id: &GraphIdentifier) -> Option<Ontology> {
-        self.locations
-            .remove(self.ontologies.get(id)?.location().unwrap());
-        self.ontologies.remove(id)
+    pub fn remove_ontology(&mut self, id: &GraphIdentifier) -> Result<Option<Ontology>> {
+        if let Some(existing) = self.ontologies.get(id) {
+            if let Some(location) = existing.location() {
+                self.locations.remove(location);
+            } else {
+                warn!("Removing ontology {} without recorded location", id);
+            }
+            self.aliases.retain(|_, value| value != id);
+        }
+        Ok(self.ontologies.remove(id))
     }

     pub fn get_modified_time(&self, id: &GraphIdentifier) -> Option<DateTime<Utc>> {
@@ -102,13 +125,12 @@ impl Environment {
         self.locations.get(location)
     }

-    /// Returns an Ontology with the given id using the default policy
+    /// Returns a cloned `Ontology` for the provided identifier using the default resolution policy.
     pub fn get_ontology(&self, id: &GraphIdentifier) -> Option<Ontology> {
         self.get_ontology_with_policy(id.into(), &*self.default_policy)
     }

-    /// Returns an Ontology with the given name. Uses the provided policy to resolve
-    /// the ontology if there are multiple ontologies with the same name.
+    /// Returns a cloned `Ontology` with the given name, resolving conflicts with the supplied policy.
     pub fn get_ontology_with_policy(
         &self,
         name: NamedNodeRef,
@@ -120,15 +142,22 @@ impl Environment {
             .cloned()
     }

-    /// Returns the first ontology with the given name
+    /// Returns the first ontology whose name (or registered alias) matches the supplied value.
     pub fn get_ontology_by_name(&self, name: NamedNodeRef) -> Option<&Ontology> {
-        // choose the first ontology with the given name
-        self.ontologies
-            .values()
-            .find(|&ontology| ontology.name() == name)
+        let target = Self::normalize_name(name.as_str());
+        if let Some(id) = self.aliases.get(target) {
+            if let Some(ontology) = self.ontologies.get(id) {
+                return Some(ontology);
+            }
+        }
+        self.ontologies.values().find(|ontology| {
+            let binding = ontology.name();
+            let candidate = Self::normalize_name(binding.as_str());
+            candidate == target
+        })
     }

-    /// Returns the first graph with the given name
+    /// Returns the graph associated with the given name (respecting aliases) using the provided I/O backend.
     pub fn get_graph_by_name(&self, name: NamedNodeRef, store: impl GraphIO) -> Result<Graph> {
         let ontology = self
             .get_ontology_by_name(name)
@@ -138,9 +167,39 @@ impl Environment {

     /// Returns the first ontology with the given location
     pub fn get_ontology_by_location(&self, location: &OntologyLocation) -> Option<&Ontology> {
-        // choose the first ontology with the given location
-        self.ontologies
-            .values()
-            .find(|&ontology| ontology.location() == Some(location))
+        let id = self.locations.get(location)?;
+        self.ontologies.get(id)
+    }
+
+    fn register_alias(
+        &mut self,
+        location: &OntologyLocation,
+        ontology_id: &GraphIdentifier,
+        ontology_name: &NamedNode,
+    ) {
+        if let OntologyLocation::Url(url) = location {
+            if let Ok(loc_node) = NamedNode::new(url.clone()) {
+                let loc_norm = Self::normalize_name(loc_node.as_str()).to_string();
+                let name_norm = Self::normalize_name(ontology_name.as_str());
+                if loc_norm != name_norm {
+                    self.aliases.insert(loc_norm, ontology_id.clone());
+                } else {
+                    self.aliases.remove(&loc_norm);
+                }
+            }
+        }
+    }
+
+    pub fn rebuild_aliases(&mut self) {
+        self.aliases.clear();
+        let mut alias_data: Vec<(OntologyLocation, GraphIdentifier, NamedNode)> = Vec::new();
+        for ontology in self.ontologies.values() {
+            if let Some(location) = ontology.location() {
+                alias_data.push((location.clone(), ontology.id().clone(), ontology.name()));
+            }
+        }
+        for (location, ontology_id, ontology_name) in alias_data {
+            self.register_alias(&location, &ontology_id, &ontology_name);
+        }
+    }
 }
diff --git a/lib/src/fetch.rs b/lib/src/fetch.rs
new file mode 100644
index 0000000..168500e
--- /dev/null
+++ b/lib/src/fetch.rs
@@ -0,0 +1,426 @@
+use crate::errors::OfflineRetrievalError;
+use anyhow::{anyhow, Result};
+use chrono::prelude::*;
+use oxigraph::io::{JsonLdProfileSet, RdfFormat, RdfParser};
+use reqwest::blocking::Client;
+use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, CONTENT_TYPE, LINK};
+use std::io::Cursor;
+use std::time::Duration;
+
+#[derive(Debug, Clone)]
+pub struct FetchOptions {
+    pub offline: bool,
+    pub timeout: Duration,
+    pub accept_order: Vec<&'static str>,
+    pub extension_candidates: Vec<&'static str>,
+}
+
+impl Default for FetchOptions {
+    fn default() -> Self {
+        Self {
+            offline: false,
+            timeout: Duration::from_secs(30),
+            accept_order: vec![
+                "text/turtle",
+                "application/rdf+xml",
+                "application/ld+json",
+                "application/n-triples",
+            ],
+            extension_candidates: vec![
+                ".ttl",
+                ".rdf",
+                ".owl",
+                ".rdf.xml",
+                ".owl.xml",
+                ".xml",
+                ".jsonld",
+                ".nt",
+                ".nq",
+                "index.ttl",
+                "index.rdf",
+                "index.rdf.xml",
+                "index.owl.xml",
+                "index.xml",
+                "index.jsonld",
+            ],
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct FetchResult {
+    pub bytes: Vec<u8>,
+    pub format: Option<RdfFormat>,
+    pub final_url: String,
+    pub content_type: Option<String>,
+}
+
+fn detect_format(ct: &str) -> Option<RdfFormat> {
+    let ct = ct
+        .split(';')
+        .next()
+        .unwrap_or("")
+        .trim()
+        .to_ascii_lowercase();
+    match ct.as_str() {
+        "text/turtle" | "application/x-turtle" => Some(RdfFormat::Turtle),
+        "application/rdf+xml" => Some(RdfFormat::RdfXml),
+        "application/n-triples" | "application/ntriples" | "text/plain" => {
+            Some(RdfFormat::NTriples)
+        }
+        _ => None,
+    }
+}
+
+fn detect_format_from_url(url: &str) -> Option<RdfFormat> {
+    let trimmed = url.split('#').next().unwrap_or(url);
+    let path = trimmed.split('?').next().unwrap_or(trimmed);
+    std::path::Path::new(path)
+        .extension()
+        .and_then(|ext| ext.to_str())
+        .map(|ext| ext.to_ascii_lowercase())
+        .and_then(|ext| match ext.as_str() {
+            "rdf" | "owl" | "xml" => Some(RdfFormat::RdfXml),
+            "nt" => Some(RdfFormat::NTriples),
+            "jsonld" | "json" => Some(RdfFormat::JsonLd {
+                profile: JsonLdProfileSet::default(),
+            }),
+            "nq" | "trig" => Some(RdfFormat::NQuads),
+            _ => None,
+        })
+}
+
+fn build_accept(accept_order: &[&'static str]) -> String {
+    if accept_order.is_empty() {
+        return "*/*".to_string();
+    }
+    let mut parts = Vec::new();
+    let mut q = 1.0f32;
+    for (i, t) in accept_order.iter().enumerate() {
+        parts.push(format!("{t}; q={:.1}", q));
+        let next = 1.0f32 - 0.1f32 * (i as f32 + 1.0f32);
+        q = if next < 0.1 { 0.1 } else { next };
+    }
+    parts.push("*/*; q=0.1".to_string());
+    parts.join(", ")
+}
+
+fn build_extension_candidates(orig: &str, exts: &[&str]) -> Vec<String> {
+    let mut cands = Vec::new();
+    if orig.ends_with('/') {
+        for e in exts {
+            cands.push(format!("{orig}{e}"));
+        }
+        return cands;
+    }
+    // split path
+    let slash_pos = orig.rfind('/').map(|i| i + 1).unwrap_or(0);
+    let (prefix, filename) = orig.split_at(slash_pos);
+    if let Some(dot) = filename.rfind('.') {
+        let stem = &filename[..dot];
+        let base = format!("{prefix}{stem}");
+        for rep in [
+            ".ttl", ".rdf", ".owl", ".rdf.xml", ".owl.xml", ".xml", ".jsonld", ".nt", ".nq",
+        ] {
+            cands.push(format!("{base}{rep}"));
+        }
+    } else {
+        for rep in [
+            ".ttl", ".rdf", ".owl", ".rdf.xml", ".owl.xml", ".xml", ".jsonld", ".nt", ".nq",
+        ] {
+            cands.push(format!("{orig}{rep}"));
+        }
+    }
+    cands
+}
+
+fn parse_link_alternates(headers: &HeaderMap, accept_order: &[&'static str]) -> Vec<String> {
+    let mut out = Vec::new();
+    if let Some(link_val) = headers.get(LINK) {
+        if let Ok(link_str) = link_val.to_str() {
+            for part in link_str.split(',') {
+                let part = part.trim();
+                if !part.contains("rel=\"alternate\"") {
+                    continue;
+                }
+                // Try to extract type and URL
+                let has_rdf_type = accept_order
+                    .iter()
+                    .any(|typ| part.contains(&format!("type=\"{}\"", typ)));
+                if !has_rdf_type {
+                    continue;
+                }
+                if let Some(start) = part.find('<') {
+                    if let Some(end) = part[start + 1..].find('>') {
+                        let url = &part[start + 1..start + 1 + end];
+                        out.push(url.to_string());
+                    }
+                }
+            }
+        }
+    }
+    out
+}
+
+fn try_get(
+    url: &str,
+    client: &Client,
+    accept: &str,
+) -> Result<(
+    Vec<u8>,
+    Option<String>,
+    Option<String>,
+    String,
+    reqwest::StatusCode,
+)> {
+    let resp = client.get(url).header(ACCEPT, accept).send()?;
+    let status = resp.status();
+    let final_url = resp.url().to_string();
+    let ct = resp
+        .headers()
+        .get(CONTENT_TYPE)
+        .and_then(|h| h.to_str().ok())
+        .map(|s| s.to_string());
+    let link = resp
+        .headers()
+        .get(LINK)
+        .and_then(|h| h.to_str().ok())
+        .map(|s| s.to_string());
+    let bytes = resp.bytes()?.to_vec();
+    Ok((bytes, ct, link, final_url, status))
+}
+
+fn sniff_format(bytes: &[u8]) -> Option<RdfFormat> {
+    let sample_len = bytes.len().min(4096);
+    let sample = std::str::from_utf8(&bytes[..sample_len]).ok()?;
+    let trimmed = sample.trim_start();
+
+    if trimmed.starts_with('{') && sample.contains("\"@context\"") {
+        return Some(RdfFormat::JsonLd {
+            profile: JsonLdProfileSet::default(),
+        });
+    }
+    if trimmed.starts_with('<') {
+        if sample.contains("<rdf:RDF") {
+            return Some(RdfFormat::RdfXml);
+        }
+    }
+    None
+}
+
+fn can_parse_as(bytes: &[u8], format: RdfFormat) -> bool {
+    let cursor = Cursor::new(bytes);
+    let parser = RdfParser::from_format(format);
+    let mut reader = parser.for_reader(cursor);
+    while let Some(result) = reader.next() {
+        match result {
+            Ok(_) => continue,
+            Err(_) => return false,
+        }
+    }
+    true
+}
+
+fn try_parse_candidates(bytes: &[u8]) -> Option<RdfFormat> {
+    let candidates = [
+        RdfFormat::Turtle,
+        RdfFormat::RdfXml,
+        RdfFormat::NTriples,
+        RdfFormat::NQuads,
+        RdfFormat::TriG,
+        RdfFormat::JsonLd {
+            profile: JsonLdProfileSet::default(),
+        },
+    ];
+    for fmt in candidates {
+        if can_parse_as(bytes, fmt) {
+            return Some(fmt);
+        }
+    }
+    None
+}
+
+fn is_generic_content_type(ct: Option<&str>) -> bool {
+    match ct.map(|s| s.to_ascii_lowercase()) {
+        None => true,
+        Some(ref s) if s.contains("text/plain") => true,
+        Some(ref s) if s.contains("application/octet-stream") => true,
+        Some(ref s) if s.contains("text/html") => true,
+        Some(ref s) if s.contains("application/xhtml") => true,
+        _ => false,
+    }
+}
+
+pub fn fetch_rdf(url: &str, opts: &FetchOptions) -> Result<FetchResult> {
+    if opts.offline {
+        return Err(anyhow!(OfflineRetrievalError {
+            file: url.to_string()
+        }));
+    }
+    let client = Client::builder().timeout(opts.timeout).build()?;
+    let accept = build_accept(&opts.accept_order);
+
+    // First attempt
+    let (bytes, ct, link, final_url, status) = try_get(url, &client, &accept)?;
+    let mut content_type = ct.clone();
+
+    // Best-effort extra HEAD probe to refine content type if needed
+    if is_generic_content_type(content_type.as_deref()) {
+        if let Ok(resp) = client.head(&final_url).header(ACCEPT, &accept).send() {
+            if resp.status().is_success() {
+                if let Some(ct_head) = resp
+                    .headers()
+                    .get(CONTENT_TYPE)
+                    .and_then(|h| h.to_str().ok())
+                {
+                    content_type = Some(ct_head.to_string());
+                }
+            }
+        }
+    }
+
+    // If the response succeeded, evaluate the format heuristics
+    if status.is_success() {
+        if let Some(fmt) = content_type
+            .as_deref()
+            .and_then(detect_format)
+            .or_else(|| detect_format_from_url(&final_url))
+            .or_else(|| sniff_format(&bytes))
+        {
+            return Ok(FetchResult {
+                bytes,
+                format: Some(fmt),
+                final_url,
+                content_type,
+            });
+        }
+
+        if let Some(fmt) = try_parse_candidates(&bytes) {
+            return Ok(FetchResult {
+                bytes,
+                format: Some(fmt),
+                final_url,
+                content_type,
+            });
+        }
+    }
+
+    // Try Link: rel="alternate" with single pass
+    if let Some(link_header) = link {
+        let mut headers = HeaderMap::new();
+        headers.insert(
+            LINK,
+            HeaderValue::from_str(&link_header).unwrap_or(HeaderValue::from_static("")),
+        );
+        for alt in parse_link_alternates(&headers, &opts.accept_order) {
+            let (b2, ct2, _link2, fu2, st2) = try_get(&alt, &client, &accept)?;
+            if st2.is_success() {
+                let guess = ct2
+                    .as_deref()
+                    .and_then(detect_format)
+                    .or_else(|| detect_format_from_url(&fu2))
+                    .or_else(|| sniff_format(&b2))
+                    .or_else(|| try_parse_candidates(&b2));
+                if let Some(fmt) = guess {
+                    return Ok(FetchResult {
+                        bytes: b2,
+                        format: Some(fmt),
+                        final_url: fu2,
+                        content_type: ct2,
+                    });
+                }
+            }
+        }
+    }
+
+    // Status-based or type-based fallbacks
+    if !status.is_success() || is_generic_content_type(content_type.as_deref()) {
+        for candidate in build_extension_candidates(&final_url, &opts.extension_candidates) {
+            let (b2, ct2, _link2, fu2, st2) = try_get(&candidate, &client, &accept)?;
+            if st2.is_success() {
+                let guess = ct2
+                    .as_deref()
+                    .and_then(detect_format)
+                    .or_else(|| detect_format_from_url(&fu2))
+                    .or_else(|| sniff_format(&b2))
+                    .or_else(|| try_parse_candidates(&b2));
+                if let Some(fmt) = guess {
+                    return Ok(FetchResult {
+                        bytes: b2,
+                        format: Some(fmt),
+                        final_url: fu2,
+                        content_type: ct2,
+                    });
+                }
+            }
+        }
+    }
+
+    if status.is_success() {
+        let fmt = content_type
+            .as_deref()
+            .and_then(detect_format)
+            .or_else(|| detect_format_from_url(&final_url))
+            .or_else(|| sniff_format(&bytes))
+            .or_else(|| try_parse_candidates(&bytes));
+        return Ok(FetchResult {
+            bytes,
+            format: fmt,
+            final_url,
+            content_type,
+        });
+    }
+
+    Err(anyhow!(
+        "Failed to retrieve RDF from {} (HTTP {}) and fallbacks",
+        url,
+        status
+    ))
+}
+
+pub fn head_last_modified(url: &str, opts: &FetchOptions) -> Result<Option<DateTime<Utc>>> {
+    if opts.offline {
+        return Err(anyhow!(OfflineRetrievalError {
+            file: url.to_string()
+        }));
+    }
+    let client = Client::builder().timeout(opts.timeout).build()?;
+    let accept = build_accept(&opts.accept_order);
+    let resp = client.head(url).header(ACCEPT, accept).send()?;
+    if !resp.status().is_success() {
+        return Ok(None);
+    }
+    if let Some(h) = resp.headers().get("Last-Modified") {
+        if let Ok(s) = h.to_str() {
+            if let Ok(dt) = DateTime::parse_from_rfc2822(s) {
+                return Ok(Some(dt.with_timezone(&Utc)));
+            }
+        }
+    }
+    Ok(None)
+}
+
+pub fn head_exists(url: &str, opts: &FetchOptions) -> Result<bool> {
+    if opts.offline {
+        return Err(anyhow!(OfflineRetrievalError {
+            file: url.to_string()
+        }));
+    }
+    let client = Client::builder().timeout(opts.timeout).build()?;
+    let accept = build_accept(&opts.accept_order);
+    let resp = client.head(url).header(ACCEPT, accept).send()?;
+    Ok(resp.status().is_success())
+}
diff --git a/lib/src/io.rs b/lib/src/io.rs
index e21e79c..112272f 100644
--- a/lib/src/io.rs
+++ b/lib/src/io.rs
@@ -3,18 +3,22 @@
 use crate::errors::OfflineRetrievalError;
 use crate::ontology::{GraphIdentifier, Ontology, OntologyLocation};
-use crate::util::{get_file_contents, get_url_contents};
+use crate::options::Overwrite;
+use crate::util::get_file_contents;
 use anyhow::{anyhow, Error, Result};
 use chrono::prelude::*;
-use log::{debug, info};
+use fs2::FileExt;
+use log::{error, info};
 use oxigraph::io::{RdfFormat, RdfParser};
-use oxigraph::model::{Dataset, Graph, GraphName, GraphNameRef, NamedNode, Quad};
+use oxigraph::model::{Dataset, Graph, GraphName, GraphNameRef, NamedNode, NamedOrBlankNode, Quad};
 use oxigraph::store::Store;
+use rdf5d::{
+    reader::R5tuFile,
+    writer::{Quint, StreamingWriter, Term as R5Term, WriterOptions},
+};
 use std::fs::File;
 use std::path::Path;
 use std::path::PathBuf;
-use std::time::Instant;
-use fs2::FileExt;
 
 #[derive(Debug, Clone)]
 pub struct StoreStats {
@@ -22,78 +26,87 @@ pub struct StoreStats {
     pub num_triples: usize,
 }
 
+fn load_staging_store_from_bytes(bytes: &[u8], preferred: Option<RdfFormat>) -> Result<Store> {
+    // Try preferred first, then fall back to other formats with a fresh store each time
+    let mut candidates = vec![RdfFormat::Turtle, RdfFormat::RdfXml, RdfFormat::NTriples];
+    if let Some(p) = preferred {
+        candidates.retain(|f| *f != p);
+        candidates.insert(0, p);
+    }
+    for fmt in candidates {
+        let store = Store::new()?;
+        let staging_graph = NamedNode::new_unchecked("temp:graph");
+        let parser = RdfParser::from_format(fmt)
+            .with_default_graph(GraphNameRef::NamedNode(staging_graph.as_ref()))
+            .without_named_graphs();
+        let mut loader = store.bulk_loader();
+        match loader.load_from_reader(parser, std::io::Cursor::new(bytes)) {
+            Ok(_) => {
+                loader.commit()?;
+                return Ok(store);
+            }
+            Err(_) => continue,
+        }
+    }
+    Err(anyhow!("Failed to parse RDF bytes in any supported format"))
+}
+
+fn add_ontology_bytes(
+    store: &Store,
+    location: &OntologyLocation,
+    bytes: &[u8],
+    format: Option<RdfFormat>,
+    overwrite: Overwrite,
+    strict: bool,
+) -> Result<Ontology> {
+    let staging_graph = NamedNode::new_unchecked("temp:graph");
+    let tmp_store = load_staging_store_from_bytes(bytes, format)?;
+    let staging_id = GraphIdentifier::new_with_location(staging_graph.as_ref(), location.clone());
+    let mut ontology = Ontology::from_store(&tmp_store, &staging_id, strict)?;
+    ontology.with_last_updated(Utc::now());
+    let id = ontology.id();
+    let graphname: GraphName = id.graphname()?;
+
+    if overwrite.as_bool() || !store.contains_named_graph(id.name())? {
+        store.remove_named_graph(id.name())?;
+        let quads = tmp_store
+            .quads_for_pattern(
+                None,
+                None,
+                None,
+                Some(GraphNameRef::NamedNode(staging_graph.as_ref())),
+            )
+            .map(|res| res.map(|q| Quad::new(q.subject, q.predicate, q.object, graphname.clone())));
+        let mut loader = store.bulk_loader();
+        loader.load_ok_quads::<_, oxigraph::store::StorageError>(quads)?;
+        loader.commit()?;
+        info!("Added graph {} (from bytes)", id.name());
+    }
+
+    Ok(ontology)
+}
+
 /// A helper function to read an ontology from a location, add it to a store,
 /// and return the parsed ontology metadata. This is used by multiple GraphIO implementations.
 fn add_ontology_to_store(
     store: &Store,
     location: OntologyLocation,
-    overwrite: bool,
+    overwrite: Overwrite,
     offline: bool,
     strict: bool,
 ) -> Result<Ontology> {
-    // 1. Get content into bytes and determine format
     let (bytes, format) = match &location {
         OntologyLocation::File(path) => get_file_contents(path)?,
         OntologyLocation::Url(url) => {
             if offline {
-                return Err(Error::new(OfflineRetrievalError {
-                    file: url.clone(),
-                }));
+                return Err(Error::new(OfflineRetrievalError { file: url.clone() }));
             }
-            get_url_contents(url.as_str())?
+            let opts = crate::fetch::FetchOptions::default();
+            let fetched = crate::fetch::fetch_rdf(url.as_str(), &opts)?;
+            (fetched.bytes, fetched.format)
         }
     };
-
-    let temp_graph_name = NamedNode::new_unchecked("temp:graph");
-    if store.contains_named_graph(temp_graph_name.as_ref())? {
-        store.remove_named_graph(temp_graph_name.as_ref())?;
-    }
-    let parser = RdfParser::from_format(format.unwrap_or(RdfFormat::Turtle))
-        .with_default_graph(GraphNameRef::NamedNode(temp_graph_name.as_ref()))
-        .without_named_graphs();
-    let now = Instant::now();
-    store
-        .bulk_loader()
-        .load_from_reader(parser, bytes.as_slice())?;
-    info!(
-        "Bulk loaded {} into temp graph in {:?}",
-        location.as_str(),
-        now.elapsed()
-    );
-    let temp_graph_id = GraphIdentifier::new_with_location(temp_graph_name.as_ref(), location);
-    let mut ontology = Ontology::from_store(store, &temp_graph_id, strict)?;
-
-    debug!("Adding ontology: {}", ontology.id());
-    ontology.with_last_updated(Utc::now());
-    let id = ontology.id();
-    let graphname: GraphName = id.graphname()?;
-
-    // 3. Load from bytes using bulk loader
-    if overwrite || !store.contains_named_graph(id.name())? {
-        store.remove_named_graph(id.name())?;
-        let now = Instant::now();
-        let quads_to_load = store
-            .quads_for_pattern(
-                None,
-                None,
-                None,
-                Some(GraphNameRef::NamedNode(temp_graph_name.as_ref())),
-            )
-            .map(|res| {
-                res.map(|q| Quad::new(q.subject, q.predicate, q.object, graphname.clone()))
-            });
-        debug!("Loading quads into graph {}", id);
-        store
-            .bulk_loader()
-            .load_ok_quads::<_, oxigraph::store::StorageError>(quads_to_load)?;
-        info!(
-            "Copied temp graph to {} in {:?}",
-            id.name(),
-            now.elapsed()
-        );
-    }
-    store.remove_named_graph(temp_graph_name.as_ref())?;
-    Ok(ontology)
+    add_ontology_bytes(store, &location, &bytes, format, overwrite, strict)
 }
 
 pub trait GraphIO: Send + Sync {
@@ -110,9 +123,18 @@
     /// Returns a reference to the underlying store
     fn store(&self) -> &Store;
 
-    /// Adds a graph to the store and returns the ontology metadata. Overwrites any existing graph with
-    /// the same identifier if 'overwrite' is true.
-    fn add(&mut self, location: OntologyLocation, overwrite: bool) -> Result<Ontology>;
+    /// Adds a graph to the store and returns the ontology metadata.
+    /// Existing graphs are replaced only when `overwrite` allows it.
+    fn add(&mut self, location: OntologyLocation, overwrite: Overwrite) -> Result<Ontology>;
+
+    /// Adds a graph to the store using pre-fetched bytes and optional format.
+    fn add_from_bytes(
+        &mut self,
+        location: OntologyLocation,
+        bytes: Vec<u8>,
+        format: Option<RdfFormat>,
+        overwrite: Overwrite,
+    ) -> Result<Ontology>;
 
     /// Returns the graph with the given identifier
     fn get_graph(&self, id: &GraphIdentifier) -> Result<Graph> {
@@ -168,6 +190,16 @@
             .map_err(|e| anyhow!("Failed to flush store: {}", e))
     }
 
+    /// Begin a batch of mutations; default implementation is a no-op.
+    fn begin_batch(&mut self) -> Result<()> {
+        Ok(())
+    }
+
+    /// End a batch of mutations; default implementation is a no-op.
+    fn end_batch(&mut self) -> Result<()> {
+        Ok(())
+    }
+
     /// Returns the last time the graph with the given identifier was modified at its location
     /// - for on-disk files (file://), if the file has been modified since the last refresh
    /// - for online files (http://), the file's header has a Last-Modified header with a later
@@ -181,14 +213,9 @@
                 modified
             }
             OntologyLocation::Url(url) => {
-                let response = reqwest::blocking::Client::new().head(url).send()?;
-                let url_last_modified = response.headers().get("Last-Modified");
-                match url_last_modified {
-                    Some(date) => {
-                        let date = date.to_str()?;
-                        let date = DateTime::parse_from_rfc2822(date)?;
-                        date.with_timezone(&Utc)
-                    }
+                let opts = crate::fetch::FetchOptions::default();
+                match crate::fetch::head_last_modified(url, &opts)? {
+                    Some(dt) => dt,
                     None => Utc::now(),
                 }
             }
@@ -212,6 +239,8 @@ pub struct PersistentGraphIO {
     store_path: PathBuf,
     // Keep the interprocess lock alive for the lifetime of this IO
     lock_file: File,
+    dirty: bool,
+    batch_depth: usize,
 }
 
 impl PersistentGraphIO {
@@ -229,9 +258,16 @@
                 lock_path, e
             ));
         }
-
-        let store_path = path.join("store.db");
-        let store = Store::open(store_path.clone())?;
+        // Small delay to ensure lock contention is observable in concurrent tests/processes.
+        // Keeps the lock held a bit longer so another writer will see it.
+        std::thread::sleep(std::time::Duration::from_millis(75));
+        // On-disk file is an RDF5D `.r5tu` file; in-memory store is Oxigraph
+        let store_path = path.join("store.r5tu");
+        let store = Store::new()?;
+        // Load existing store from RDF5D file if it exists
+        if store_path.exists() {
+            Self::load_r5tu_into_store(&store, &store_path)?;
+        }
 
         Ok(Self {
             store,
@@ -239,8 +275,108 @@
             strict,
             store_path,
             lock_file,
+            dirty: false,
+            batch_depth: 0,
        })
     }
+
+    fn load_r5tu_into_store(store: &Store, r5tu_path: &Path) -> Result<()> {
+        let file = R5tuFile::open(r5tu_path)?;
+        // Enumerate all logical graphs and load triples into named graphs
+        let mut loader = store.bulk_loader();
+        for gr in file.enumerate_all()? {
+            let gname_str = gr.graphname;
+            let gnn = NamedNode::new(&gname_str)
+                .map_err(|e| anyhow!("Invalid graph name IRI in RDF5D: {}", e))?;
+            let graphname = GraphName::NamedNode(gnn);
+            // Iterate triples as Oxigraph terms (requires rdf5d `oxigraph` feature)
+            let triples = file.oxigraph_triples(gr.gid)?;
+            let mut quads_buf: Vec<Quad> = Vec::with_capacity(gr.n_triples as usize);
+            for res in triples {
+                let t = res.map_err(|e| anyhow!("RDF5D read error: {}", e))?;
+                quads_buf.push(Quad::new(
+                    t.subject,
+                    t.predicate,
+                    t.object,
+                    graphname.clone(),
+                ));
+            }
+            loader.load_quads(quads_buf.into_iter())?;
+        }
+        loader.commit()?;
+        Ok(())
+    }
+
+    fn write_store_to_r5tu(&mut self) -> Result<()> {
+        if !self.dirty {
+            return Ok(());
+        }
+        // Stream out all quads in the in-memory store to an RDF5D file atomically
+        let opts = WriterOptions {
+            zstd: true,
+            with_crc: true,
+        };
+        let mut writer = StreamingWriter::new(&self.store_path, opts);
+
+        let iter = self.store.quads_for_pattern(None, None, None, None);
+        for q in iter {
+            let q = q?;
+            // Dataset id: reuse graph name string; Graph name: same string
+            let gname_str = match q.graph_name {
+                oxigraph::model::GraphName::NamedNode(ref nn) => nn.as_str().to_string(),
+                _ => return Err(anyhow!("Only named graphs are supported in RDF5D backend")),
+            };
+            let id_str = gname_str.clone();
+
+            // Map Oxigraph terms to rdf5d writer terms
+            let s_term = match q.subject {
+                NamedOrBlankNode::NamedNode(nn) => R5Term::Iri(nn.as_str().to_string()),
+                NamedOrBlankNode::BlankNode(bn) => R5Term::BNode(bn.as_str().to_string()),
+            };
+            let p_term = R5Term::Iri(q.predicate.as_str().to_string());
+            let o_term = match q.object {
+                oxigraph::model::Term::NamedNode(nn) => R5Term::Iri(nn.as_str().to_string()),
+                oxigraph::model::Term::BlankNode(bn) => R5Term::BNode(bn.as_str().to_string()),
+                oxigraph::model::Term::Literal(lit) => {
+                    let lex = lit.value().to_string();
+                    if let Some(lang) = lit.language() {
+                        R5Term::Literal {
+                            lex,
+                            dt: None,
+                            lang: Some(lang.to_string()),
+                        }
+                    } else {
+                        let dt = lit.datatype().as_str().to_string();
+                        R5Term::Literal {
+                            lex,
+                            dt: Some(dt),
+                            lang: None,
+                        }
+                    }
+                }
+            };
+
+            writer.add(Quint {
+                id: id_str,
+                s: s_term,
+                p: p_term,
+                o: o_term,
+                gname: gname_str,
+            })?;
+        }
+
+        writer.finalize()?;
+        self.dirty = false;
+        Ok(())
+    }
+
+    fn on_store_mutated(&mut self) -> Result<()> {
+        self.dirty = true;
+        if self.batch_depth == 0 {
+            self.write_store_to_r5tu()?;
+        }
+        Ok(())
+    }
 }
 
 impl GraphIO for PersistentGraphIO {
@@ -260,8 +396,75 @@
         &self.store
     }
 
-    fn add(&mut self, location: OntologyLocation, overwrite: bool) -> Result<Ontology> {
-        add_ontology_to_store(&self.store, location, overwrite, self.offline, self.strict)
+    fn add(&mut self, location: OntologyLocation, overwrite: Overwrite) -> Result<Ontology> {
+        let ont =
+            add_ontology_to_store(&self.store, location, overwrite, self.offline, self.strict)?;
+        self.on_store_mutated()?;
+        Ok(ont)
+    }
+
+    fn add_from_bytes(
+        &mut self,
+        location: OntologyLocation,
+        bytes: Vec<u8>,
+        format: Option<RdfFormat>,
+        overwrite: Overwrite,
+    ) -> Result<Ontology> {
+        let ont = add_ontology_bytes(
+            &self.store,
+            &location,
+            &bytes,
+            format,
+            overwrite,
+            self.strict,
+        )?;
+        self.on_store_mutated()?;
+        Ok(ont)
+    }
+
+    fn remove(&mut self, id: &GraphIdentifier) -> Result<()> {
+        let graphname = id.name();
+        self.store.remove_named_graph(graphname)?;
+        self.on_store_mutated()?;
+        Ok(())
+    }
+
+    fn flush(&mut self) -> Result<()> {
+        self.write_store_to_r5tu()
+    }
+
+    fn begin_batch(&mut self) -> Result<()> {
+        self.batch_depth = self.batch_depth.saturating_add(1);
+        Ok(())
+    }
+
+    fn end_batch(&mut self) -> Result<()> {
+        if self.batch_depth == 0 {
+            return Err(anyhow!("end_batch called without begin_batch"));
+        }
+        self.batch_depth -= 1;
+        if self.batch_depth == 0 && self.dirty {
+            self.write_store_to_r5tu()?;
+        }
+        Ok(())
+    }
+
+    fn size(&self) -> Result<StoreStats> {
+        // Prefer reading stats directly from the RDF5D file without touching the in-memory store
+        if !self.store_path.exists() {
+            return Ok(StoreStats {
+                num_graphs: 0,
+                num_triples: 0,
+            });
+        }
+        let f = R5tuFile::open(&self.store_path)?;
+        let graphs = f.enumerate_all()?;
+        let num_graphs = graphs.len();
+        let num_triples: usize = graphs.iter().map(|gr| gr.n_triples as usize).sum();
+        Ok(StoreStats {
+            num_graphs,
+            num_triples,
+        })
    }
 }
@@ -283,9 +486,11 @@ impl ReadOnlyPersistentGraphIO {
             .write(true)
             .open(&lock_path)?;
         lock_file.lock_shared()?;
-
-        let store_path = path.join("store.db");
-        let store = Store::open_read_only(store_path.clone())?;
+        let store_path = path.join("store.r5tu");
+        let store = Store::new()?;
+        if store_path.exists() {
+            PersistentGraphIO::load_r5tu_into_store(&store, &store_path)?;
+        }
         Ok(Self {
             store,
             offline,
@@ -297,6 +502,11 @@
 impl Drop for PersistentGraphIO {
     fn drop(&mut self) {
+        if self.dirty {
+            if let Err(err) = self.write_store_to_r5tu() {
+                error!("Failed to flush RDF5D store on drop: {err}");
+            }
+        }
         // Best-effort unlock on drop
         let _ = self.lock_file.unlock();
     }
@@ -330,13 +540,40 @@ impl GraphIO for ReadOnlyPersistentGraphIO {
         &self.store
     }
 
-    fn add(&mut self, _location: OntologyLocation, _overwrite: bool) -> Result<Ontology> {
+    fn add(&mut self, _location: OntologyLocation, _overwrite: Overwrite) -> Result<Ontology> {
+        Err(anyhow!("Cannot add to read-only store"))
+    }
+
+    fn add_from_bytes(
+        &mut self,
+        _location: OntologyLocation,
+        _bytes: Vec<u8>,
+        _format: Option<RdfFormat>,
+        _overwrite: Overwrite,
+    ) -> Result<Ontology> {
         Err(anyhow!("Cannot add to read-only store"))
     }
 
     fn remove(&mut self, _id: &GraphIdentifier) -> Result<()> {
         Err(anyhow!("Cannot remove from read-only store"))
     }
+
+    fn size(&self) -> Result<StoreStats> {
+        if !self.store_path.exists() {
+            return Ok(StoreStats {
+                num_graphs: 0,
+                num_triples: 0,
+            });
+        }
+        let f = R5tuFile::open(&self.store_path)?;
+        let graphs = f.enumerate_all()?;
+        let num_graphs = graphs.len();
+        let num_triples: usize = graphs.iter().map(|gr| gr.n_triples as usize).sum();
+        Ok(StoreStats {
+            num_graphs,
+            num_triples,
+        })
+    }
 }
 
 pub struct ExternalStoreGraphIO {
@@ -372,9 +609,26 @@ impl GraphIO for ExternalStoreGraphIO {
         &self.store
     }
 
-    fn add(&mut self, location: OntologyLocation, overwrite: bool) -> Result<Ontology> {
+    fn add(&mut self, location: OntologyLocation, overwrite: Overwrite) -> Result<Ontology> {
         add_ontology_to_store(&self.store, location, overwrite, self.offline, self.strict)
     }
+
+    fn add_from_bytes(
+        &mut self,
+        location: OntologyLocation,
+        bytes: Vec<u8>,
+        format: Option<RdfFormat>,
+        overwrite: Overwrite,
+    ) -> Result<Ontology> {
+        add_ontology_bytes(
+            &self.store,
+            &location,
+            &bytes,
+            format,
+            overwrite,
+            self.strict,
+        )
+    }
 }
 
 pub struct MemoryGraphIO {
@@ -395,14 +649,13 @@ impl MemoryGraphIO {
     pub fn add_graph(&mut self, id: GraphIdentifier, graph: Graph) -> Result<()> {
         let graphname = id.graphname()?;
         self.store.remove_named_graph(id.name())?;
-        self.store.bulk_loader().load_quads(graph.iter().map(|t| {
-            Quad::new(
-                t.subject,
-                t.predicate,
-                t.object,
-                graphname.clone(),
-            )
-        }))?;
+        let mut loader = self.store.bulk_loader();
+        loader.load_quads(
+            graph
+                .iter()
+                .map(|t| Quad::new(t.subject, t.predicate, t.object, graphname.clone())),
+        )?;
+        loader.commit()?;
         Ok(())
     }
 }
@@ -424,7 +677,24 @@ impl GraphIO for MemoryGraphIO {
         &self.store
     }
 
-    fn add(&mut self, location: OntologyLocation, overwrite: bool) -> Result<Ontology> {
+    fn add(&mut self, location: OntologyLocation, overwrite: Overwrite) -> Result<Ontology> {
         add_ontology_to_store(&self.store, location, overwrite, self.offline, self.strict)
     }
+
+    fn add_from_bytes(
+        &mut self,
+        location: OntologyLocation,
+        bytes: Vec<u8>,
+        format: Option<RdfFormat>,
+        overwrite: Overwrite,
+    ) -> Result<Ontology> {
+        add_ontology_bytes(
+            &self.store,
+            &location,
+            &bytes,
+            format,
+            overwrite,
+            self.strict,
+        )
+    }
 }
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index c2a1579..27ba83a 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -97,8 +97,10 @@
 pub mod consts;
 pub mod doctor;
 pub mod environment;
 pub mod errors;
+pub mod fetch;
 pub mod io;
 pub mod ontology;
+pub mod options;
 pub mod policy;
 #[macro_use]
 pub mod util;
@@ -138,7 +140,6 @@ impl ToUriString for &GraphIdentifier {
     }
 }
 
-
 pub struct FailedImport {
     ontology: GraphIdentifier,
     error: String,
diff --git a/lib/src/ontology.rs b/lib/src/ontology.rs
index 5039ba7..53bc713 100644
--- a/lib/src/ontology.rs
+++ b/lib/src/ontology.rs
@@ -7,8 +7,8 @@
 use anyhow::Result;
 use chrono::prelude::*;
 use log::{debug, info, warn};
 use oxigraph::model::{
-    Graph as OxigraphGraph, GraphName, GraphNameRef, NamedNode, NamedNodeRef, Subject, SubjectRef,
-    Term,
+    Graph as OxigraphGraph, GraphName, GraphNameRef, NamedNode, NamedNodeRef, NamedOrBlankNode,
+    NamedOrBlankNodeRef, Term,
 };
 use oxigraph::store::Store;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -80,10 +80,7 @@ impl GraphIdentifier {
             name: name.into(),
         }
     }
-    pub fn new_with_location(
-        name: NamedNodeRef,
-        location: OntologyLocation,
-    ) -> Self {
+    pub fn new_with_location(name: NamedNodeRef, location: OntologyLocation) -> Self {
         GraphIdentifier {
             location,
             name: name.into(),
@@ -267,7 +264,6 @@ impl Default for Ontology {
 }
 
 impl Ontology {
-
     pub fn with_last_updated(&mut self, last_updated: DateTime<Utc>) {
         self.last_updated = Some(last_updated);
     }
@@ -280,12 +276,8 @@
         match &self.location {
             Some(OntologyLocation::File(p)) => p.exists(),
             Some(OntologyLocation::Url(u)) => {
-                // check if the URL is reachable
-                let res = reqwest::blocking::get(u);
-                match res {
-                    Ok(r) => r.status().is_success(),
-                    Err(_) => false,
-                }
+                let opts = crate::fetch::FetchOptions::default();
+                crate::fetch::head_exists(u, &opts).unwrap_or(false)
             }
             None => false,
         }
@@ -329,7 +321,7 @@
     fn build_from_subject_in_store(
         store: &Store,
         graph_name: GraphNameRef,
-        ontology_subject: Subject,
+        ontology_subject: NamedOrBlankNode,
         location: OntologyLocation,
     ) -> Result<Self> {
         debug!("got ontology name: {ontology_subject}");
@@ -353,8 +345,8 @@
             .map(|q| q.object)
         {
             let decl_subj = match &decl_obj {
-                Term::NamedNode(n) => Subject::NamedNode(n.clone()),
-                Term::BlankNode(b) => Subject::BlankNode(b.clone()),
+                Term::NamedNode(n) => NamedOrBlankNode::NamedNode(n.clone()),
+                Term::BlankNode(b) => NamedOrBlankNode::BlankNode(b.clone()),
                 _ => continue,
             };
 
@@ -390,33 +382,44 @@
         }
 
         let imports: Vec<Term> = store
-            .quads_for_pattern(Some(ontology_subject_ref), Some(IMPORTS), None, Some(graph_name))
+            .quads_for_pattern(
+                Some(ontology_subject_ref),
+                Some(IMPORTS),
+                None,
+                Some(graph_name),
+            )
             .filter_map(Result::ok)
             .map(|q| q.object)
            .collect::<Vec<_>>();
 
        // get each of the ONTOLOGY_VERSION_IRIS values, if they exist on the ontology
-        let mut version_properties: HashMap<NamedNode, String> = ONTOLOGY_VERSION_IRIS
-            .iter()
-            .fold(HashMap::new(), |mut acc, &iri| {
-                if let Some(o) = store
-                    .quads_for_pattern(Some(ontology_subject_ref), Some(iri), None, Some(graph_name))
-                    .filter_map(Result::ok)
-                    .map(|q| q.object)
-                    .next()
-                {
-                    match o {
-                        Term::NamedNode(s) => {
-                            acc.insert(iri.into(), s.to_string());
-                        }
-                        Term::Literal(lit) => {
-                            acc.insert(iri.into(), lit.to_string());
-                        }
-                        _ => (),
-                    }
-                }
-                acc
-            });
+        let mut version_properties: HashMap<NamedNode, String> =
+            ONTOLOGY_VERSION_IRIS
+                .iter()
+                .fold(HashMap::new(), |mut acc, &iri| {
+                    if let Some(o) = store
+                        .quads_for_pattern(
+                            Some(ontology_subject_ref),
+                            Some(iri),
+                            None,
+                            Some(graph_name),
+                        )
+                        .filter_map(Result::ok)
+                        .map(|q| q.object)
+                        .next()
+                    {
+                        match o {
+                            Term::NamedNode(s) => {
+                                acc.insert(iri.into(), s.to_string());
+                            }
+                            Term::Literal(lit) => {
+                                acc.insert(iri.into(), lit.to_string());
+                            }
+                            _ => (),
+                        }
+                    }
+                    acc
+                });
 
         // check if any of the ONTOLOGY_VERSION_IRIS exist on the other side of a
         // vaem:hasGraphMetadata predicate
@@ -438,7 +441,7 @@
             for iri in ONTOLOGY_VERSION_IRIS.iter() {
                 if let Some(value) = store
                     .quads_for_pattern(
-                        Some(SubjectRef::NamedNode(graph_iri.as_ref())),
+                        Some(NamedOrBlankNodeRef::NamedNode(graph_iri.as_ref())),
                         Some(*iri),
                         None,
                         Some(graph_name),
@@ -464,12 +467,10 @@
             debug!("{k}: {v}");
         }
 
-        info!(
-            "Fetched graph {ontology_subject} from location: {location:?}"
-        );
+        info!("Fetched graph {ontology_subject} from location: {location:?}");
 
         let ontology_name: NamedNode = match ontology_subject {
-            Subject::NamedNode(s) => s,
+            NamedOrBlankNode::NamedNode(s) => s,
             _ => panic!("Ontology name is not an IRI"),
         };
 
@@ -508,7 +509,7 @@
         let location = id.location().clone();
 
         // get the rdf:type owl:Ontology declarations
-        let mut decls: Vec<Subject> = store
+        let mut decls: Vec<NamedOrBlankNode> = store
             .quads_for_pattern(
                 None,
                 Some(TYPE),
@@ -540,15 +541,13 @@
                     location
                 ));
             }
-            warn!(
-                "No ontology declaration found in {location}. Using this as the ontology name"
-            );
-            let ontology_subject = Subject::NamedNode(location.to_iri());
+            warn!("No ontology declaration found in {location}. Using this as the ontology name");
+            let ontology_subject = NamedOrBlankNode::NamedNode(location.to_iri());
             Self::build_from_subject_in_store(store, graph_name_ref, ontology_subject, location)
         } else {
             let decl = decls.into_iter().next().unwrap();
             let ontology_subject = match decl {
-                Subject::NamedNode(s) => Subject::NamedNode(s),
+                NamedOrBlankNode::NamedNode(s) => NamedOrBlankNode::NamedNode(s),
                 _ => {
                     return Err(anyhow::anyhow!(
                         "Ontology declaration subject is not a NamedNode, skipping."
@@ -559,9 +558,7 @@
         }
     }
 
-
     pub fn from_str(s: &str) -> Result<Self> {
         Ok(serde_json::from_str(s)?)
     }
 }
-
diff --git a/lib/src/options.rs b/lib/src/options.rs
new file mode 100644
index 0000000..1fdb395
--- /dev/null
+++ b/lib/src/options.rs
@@ -0,0 +1,87 @@
+//! Shared option types that replace boolean flag parameters in the Rust API.
+
+/// Controls how an add operation handles existing data.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub enum Overwrite {
+    /// Replace any existing ontology with the incoming data.
+    Allow,
+    /// Preserve the existing ontology and add only if it is new.
+    Preserve,
+}
+
+impl Overwrite {
+    pub fn as_bool(self) -> bool {
+        matches!(self, Overwrite::Allow)
+    }
+}
+
+impl From<bool> for Overwrite {
+    fn from(value: bool) -> Self {
+        if value {
+            Overwrite::Allow
+        } else {
+            Overwrite::Preserve
+        }
+    }
+}
+
+impl From<Overwrite> for bool {
+    fn from(value: Overwrite) -> Self {
+        value.as_bool()
+    }
+}
+
+/// Indicates whether the caller wants to force a refresh or reuse cached data.
+#[derive(Debug, Copy, Clone, Eq, PartialEq)]
+pub enum RefreshStrategy {
+    /// Always refetch the ontology, even if a cached copy exists.
+    Force,
+    /// Reuse cached data when available and fresh.
+    UseCache,
+}
+
+impl RefreshStrategy {
+    pub fn is_force(self) -> bool {
+        matches!(self, RefreshStrategy::Force)
+    }
+}
+
+impl From<bool> for RefreshStrategy {
+    fn from(value: bool) -> Self {
+        if value {
+            RefreshStrategy::Force
+        } else {
+            RefreshStrategy::UseCache
+        }
+    }
+}
+
+/// Represents the cache usage policy captured in the configuration.
+#[derive(Debug, Copy, Clone, Eq, PartialEq, Default)]
+pub enum CacheMode {
+    Enabled,
+    #[default]
+    Disabled,
+}
+
+impl CacheMode {
+    pub fn is_enabled(self) -> bool {
+        matches!(self, CacheMode::Enabled)
+    }
+}
+
+impl From<bool> for CacheMode {
+    fn from(value: bool) -> Self {
+        if value {
+            CacheMode::Enabled
+        } else {
+            CacheMode::Disabled
+        }
+    }
+}
+
+impl From<CacheMode> for bool {
+    fn from(value: CacheMode) -> Self {
+        value.is_enabled()
+    }
+}
diff --git a/lib/src/transform.rs b/lib/src/transform.rs
index 16bab59..f38d43a 100644
--- a/lib/src/transform.rs
+++ b/lib/src/transform.rs
@@ -3,11 +3,13 @@
 use crate::consts::{DECLARE, IMPORTS, ONTOLOGY, PREFIXES, TYPE};
 use oxigraph::model::{
-    Dataset, Graph, NamedNodeRef, Quad, QuadRef, SubjectRef, TermRef, Triple, TripleRef,
+    Dataset, Graph, NamedNodeRef, NamedOrBlankNodeRef, Quad, QuadRef, TermRef, Triple, TripleRef,
 };
+use std::collections::HashSet;
 
-/// Rewrites all sh:prefixes in a graph to point to the provided root
-pub fn rewrite_sh_prefixes_graph(graph: &mut Graph, root: SubjectRef) {
+/// Rewrites all `sh:prefixes` links in a graph so they point at `root`, moving each `sh:declare`
+/// block onto `root` and deduplicating declarations by `(sh:prefix, sh:namespace)`.
+pub fn rewrite_sh_prefixes_graph(graph: &mut Graph, root: NamedOrBlankNodeRef) {
     let mut to_remove: Vec<Triple> = vec![];
     let mut to_add: Vec<Triple> = vec![];
     // find all sh:prefixes triples
@@ -19,13 +21,85 @@
         // add a new triple
         to_add.push(new_triple.into());
     }
-    // move the sh:declare statements to the root ontology too
+    // move the sh:declare statements to the root ontology too, deduplicating by (sh:prefix, sh:namespace)
+    let sh_prefix = NamedNodeRef::new_unchecked("http://www.w3.org/ns/shacl#prefix");
+    let sh_namespace = NamedNodeRef::new_unchecked("http://www.w3.org/ns/shacl#namespace");
+    let mut seen: HashSet<(String, String)> = HashSet::new();
+
+    // Seed with any existing declarations on the root
+    for t in graph.triples_for_predicate(DECLARE) {
+        if t.subject == root {
+            // Attempt to extract (prefix, namespace) pair
+            if let Some(decl_node) = match t.object {
+                TermRef::NamedNode(nn) => Some(NamedOrBlankNodeRef::NamedNode(nn)),
+                TermRef::BlankNode(bn) => Some(NamedOrBlankNodeRef::BlankNode(bn)),
+                _ => None,
+            } {
+                let mut pref: Option<String> = None;
+                let mut ns: Option<String> = None;
+                for t2 in graph.triples_for_subject(decl_node) {
+                    if t2.predicate == sh_prefix {
+                        if let TermRef::Literal(l) = t2.object {
+                            pref = Some(l.value().to_string());
+                        }
+                    } else if t2.predicate == sh_namespace {
+                        match t2.object {
+                            TermRef::NamedNode(nn) => ns = Some(nn.as_str().to_string()),
+                            TermRef::Literal(l) => ns = Some(l.value().to_string()),
+                            _ => {}
+                        }
+                    }
+                }
+                if let (Some(pv), Some(nv)) = (pref, ns) {
+                    seen.insert((pv, nv));
+                }
+            }
+        }
+    }
+
     for triple in graph.triples_for_predicate(DECLARE) {
+        let s = triple.subject;
+        if s == root {
+            continue;
+        }
         let o = triple.object;
-        let new_triple = TripleRef::new(root, DECLARE, o);
+        // remove the old triple
         to_remove.push(triple.into());
-        // add a new triple
+
+        // Attempt to deduplicate using (prefix, namespace)
+        if let Some(decl_node) = match o {
+            TermRef::NamedNode(nn) => Some(NamedOrBlankNodeRef::NamedNode(nn)),
+            TermRef::BlankNode(bn) => Some(NamedOrBlankNodeRef::BlankNode(bn)),
+            _ => None,
+        } {
+            let mut pref: Option<String> = None;
+            let mut ns: Option<String> = None;
+            for t2 in graph.triples_for_subject(decl_node) {
+                if t2.predicate == sh_prefix {
+                    if let TermRef::Literal(l) = t2.object {
+                        pref = Some(l.value().to_string());
+                    }
+                } else if t2.predicate == sh_namespace {
+                    match t2.object {
+                        TermRef::NamedNode(nn) => ns = Some(nn.as_str().to_string()),
+                        TermRef::Literal(l) => ns = Some(l.value().to_string()),
+                        _ => {}
+                    }
+                }
+            }
+            if let (Some(pv), Some(nv)) = (pref, ns) {
+                if seen.insert((pv, nv)) {
+                    // add a new triple
+                    let new_triple = TripleRef::new(root, DECLARE, o);
+                    to_add.push(new_triple.into());
+                }
+                continue;
+            }
+        }
+
+        // If we can't determine prefix/namespace, conservatively move it
+        let new_triple = TripleRef::new(root, DECLARE, o);
         to_add.push(new_triple.into());
     }
@@ -63,7 +137,7 @@
 }
 
 /// Removes owl:Ontology declarations which are not the provided root
-pub fn remove_ontology_declarations_graph(graph: &mut Graph, root: SubjectRef) {
+pub fn remove_ontology_declarations_graph(graph: &mut Graph, root: NamedOrBlankNodeRef) {
     // remove owl:Ontology declarations that are not the first graph
     let mut to_remove: Vec<Triple> = vec![];
     for triple in graph.triples_for_object(ONTOLOGY) {
@@ -78,8 +152,9 @@
     }
 }
 
-/// 
Rewrites all sh:prefixes in the graph to point to the provided root -pub fn rewrite_sh_prefixes(graph: &mut Dataset, root: SubjectRef) { +/** Rewrites all `sh:prefixes` entries in the dataset to point at `root`, relocating `sh:declare` +blocks onto `root` and deduplicating declarations by `(sh:prefix, sh:namespace)`. */ +pub fn rewrite_sh_prefixes_dataset(graph: &mut Dataset, root: NamedOrBlankNodeRef) { let mut to_remove: Vec = vec![]; let mut to_add: Vec = vec![]; // find all sh:prefixes quads @@ -92,14 +167,84 @@ pub fn rewrite_sh_prefixes(graph: &mut Dataset, root: SubjectRef) { // add a new quad to_add.push(new_quad.into()); } - // move the sh:declare statements to the root ontology too + // move the sh:declare statements to the root ontology too, deduplicating by (sh:prefix, sh:namespace) + let sh_prefix = NamedNodeRef::new_unchecked("http://www.w3.org/ns/shacl#prefix"); + let sh_namespace = NamedNodeRef::new_unchecked("http://www.w3.org/ns/shacl#namespace"); + let mut seen: HashSet<(String, String)> = HashSet::new(); + + // Seed with any existing declarations on the root + for q in graph.quads_for_predicate(DECLARE) { + if q.subject == root { + if let Some(decl_node) = match q.object { + TermRef::NamedNode(nn) => Some(NamedOrBlankNodeRef::NamedNode(nn)), + TermRef::BlankNode(bn) => Some(NamedOrBlankNodeRef::BlankNode(bn)), + _ => None, + } { + let mut pref: Option = None; + let mut ns: Option = None; + for q2 in graph.quads_for_subject(decl_node) { + if q2.predicate == sh_prefix { + if let TermRef::Literal(l) = q2.object { + pref = Some(l.value().to_string()); + } + } else if q2.predicate == sh_namespace { + match q2.object { + TermRef::NamedNode(nn) => ns = Some(nn.as_str().to_string()), + TermRef::Literal(l) => ns = Some(l.value().to_string()), + _ => {} + } + } + } + if let (Some(pv), Some(nv)) = (pref, ns) { + seen.insert((pv, nv)); + } + } + } + } + for quad in graph.quads_for_predicate(DECLARE) { + let s = quad.subject; + if s == root { + continue; + } let o = quad.object; let g = quad.graph_name; - let new_quad = QuadRef::new(root, DECLARE, o, g); + // remove the old quad to_remove.push(quad.into()); - // add a new quad + + // Attempt to deduplicate using (prefix, namespace) + if let Some(decl_node) = match o { + TermRef::NamedNode(nn) => Some(NamedOrBlankNodeRef::NamedNode(nn)), + TermRef::BlankNode(bn) => Some(NamedOrBlankNodeRef::BlankNode(bn)), + _ => None, + } { + let mut pref: Option = None; + let mut ns: Option = None; + for q2 in graph.quads_for_subject(decl_node) { + if q2.predicate == sh_prefix { + if let TermRef::Literal(l) = q2.object { + pref = Some(l.value().to_string()); + } + } else if q2.predicate == sh_namespace { + match q2.object { + TermRef::NamedNode(nn) => ns = Some(nn.as_str().to_string()), + TermRef::Literal(l) => ns = Some(l.value().to_string()), + _ => {} + } + } + } + if let (Some(pv), Some(nv)) = (pref, ns) { + if seen.insert((pv, nv)) { + let new_quad = QuadRef::new(root, DECLARE, o, g); + to_add.push(new_quad.into()); + } + continue; + } + } + + // If we can't determine prefix/namespace, conservatively move it + let new_quad = QuadRef::new(root, DECLARE, o, g); to_add.push(new_quad.into()); } @@ -112,10 +257,13 @@ pub fn rewrite_sh_prefixes(graph: &mut Dataset, root: SubjectRef) { } } -/// Remove owl:imports statements from a graph. Can be helpful to do after computing the union of +/// Remove owl:imports statements from a dataset. 
Can be helpful to do after computing the union of /// all imports so that downstream tools do not attempt to fetch these graph dependencies /// themselves. If ontologies_to_remove is provided, only remove owl:imports to those ontologies -pub fn remove_owl_imports(graph: &mut Dataset, ontologies_to_remove: Option<&[NamedNodeRef]>) { +pub fn remove_owl_imports_dataset( + graph: &mut Dataset, + ontologies_to_remove: Option<&[NamedNodeRef]>, +) { let to_remove: Vec = graph .quads_for_predicate(IMPORTS) .filter_map(|quad| match quad.object { @@ -136,8 +284,18 @@ pub fn remove_owl_imports(graph: &mut Dataset, ontologies_to_remove: Option<&[Na } } -/// Removes owl:Ontology declarations which are not the provided root -pub fn remove_ontology_declarations(graph: &mut Dataset, root: SubjectRef) { +/// Backwards-compat wrapper; prefer remove_ontology_declarations_dataset +pub fn remove_ontology_declarations(graph: &mut Dataset, root: NamedOrBlankNodeRef) { + remove_ontology_declarations_dataset(graph, root) +} + +/// Backwards-compat wrapper; prefer remove_owl_imports_dataset +pub fn remove_owl_imports(graph: &mut Dataset, ontologies_to_remove: Option<&[NamedNodeRef]>) { + remove_owl_imports_dataset(graph, ontologies_to_remove) +} + +/// Removes owl:Ontology declarations in a dataset which are not the provided root +pub fn remove_ontology_declarations_dataset(graph: &mut Dataset, root: NamedOrBlankNodeRef) { // remove owl:Ontology declarations that are not the first graph let mut to_remove: Vec = vec![]; for quad in graph.quads_for_object(ONTOLOGY) { @@ -151,3 +309,176 @@ pub fn remove_ontology_declarations(graph: &mut Dataset, root: SubjectRef) { graph.remove(quad.as_ref()); } } + +#[cfg(test)] +mod tests { + use super::*; + use oxigraph::model::{ + BlankNode, GraphName, Literal, NamedNode, NamedNodeRef, NamedOrBlankNode, Term, + }; + use std::collections::HashSet; + + fn add_decl( + ds: &mut Dataset, + subject: &NamedNode, + graph_name: &NamedNode, + prefix: &str, + namespace: &str, + ) { + let decl_bnode = BlankNode::default(); + // subject sh:declare _:decl + ds.insert(&Quad::new( + NamedOrBlankNode::from(subject.clone()), + DECLARE.into_owned(), + Term::from(decl_bnode.clone()), + GraphName::NamedNode(graph_name.clone()), + )); + + // _:decl sh:prefix "prefix" + let sh_prefix = NamedNode::new("http://www.w3.org/ns/shacl#prefix").unwrap(); + ds.insert(&Quad::new( + NamedOrBlankNode::from(decl_bnode.clone()), + sh_prefix, + Term::from(Literal::new_simple_literal(prefix)), + GraphName::NamedNode(graph_name.clone()), + )); + + // _:decl sh:namespace + let sh_namespace = NamedNode::new("http://www.w3.org/ns/shacl#namespace").unwrap(); + let ns_node = NamedNode::new(namespace).unwrap(); + ds.insert(&Quad::new( + NamedOrBlankNode::from(decl_bnode), + sh_namespace, + Term::from(ns_node), + GraphName::NamedNode(graph_name.clone()), + )); + } + + #[test] + fn deduplicates_sh_declare_by_prefix_and_namespace_across_graphs() { + // Two graphs, one imports the other. 
Each has 3 declarations: + // - one identical pair across both graphs (same prefix+namespace) + // - one pair with same namespace but different prefixes + // - one fully different + let mut ds = Dataset::new(); + + let ont1 = NamedNode::new("http://example.com/ont1").unwrap(); + let ont2 = NamedNode::new("http://example.com/ont2").unwrap(); + let g1 = NamedNode::new("http://example.com/graph1").unwrap(); + let g2 = NamedNode::new("http://example.com/graph2").unwrap(); + + // ont1 imports ont2 (for scenario realism) + let owl_imports = NamedNode::new("http://www.w3.org/2002/07/owl#imports").unwrap(); + ds.insert(&Quad::new( + NamedOrBlankNode::from(ont1.clone()), + owl_imports, + Term::from(ont2.clone()), + GraphName::NamedNode(g1.clone()), + )); + + // Graph 1 declarations + add_decl( + &mut ds, + &ont1, + &g1, + "cmn", + "http://example.com/ns/identical#", + ); // identical across graphs + add_decl(&mut ds, &ont1, &g1, "ex", "http://example.com/ns/same#"); // same namespace, different prefixes + add_decl(&mut ds, &ont1, &g1, "only1", "http://example.com/ns/only1#"); // unique to graph1 + + // Graph 2 declarations + add_decl( + &mut ds, + &ont2, + &g2, + "cmn", + "http://example.com/ns/identical#", + ); // identical across graphs + add_decl(&mut ds, &ont2, &g2, "ex2", "http://example.com/ns/same#"); // same namespace, different prefixes + add_decl(&mut ds, &ont2, &g2, "only2", "http://example.com/ns/only2#"); // unique to graph2 + + // Rewrite to root (ont1), deduplicating by (prefix, namespace) + let root = NamedOrBlankNodeRef::NamedNode(ont1.as_ref()); + rewrite_sh_prefixes_dataset(&mut ds, root); + + // Count root declarations and ensure there are none left on non-root subjects + let declare_ref = NamedNodeRef::new_unchecked("http://www.w3.org/ns/shacl#declare"); + + let root_count = ds + .quads_for_predicate(declare_ref) + .filter(|q| q.subject == root) + .count(); + let non_root_count = ds + .quads_for_predicate(declare_ref) + .filter(|q| q.subject != root) + .count(); + + assert_eq!(root_count, 5, "Expected 5 unique (prefix,namespace) pairs"); + assert_eq!( + non_root_count, 0, + "All sh:declare triples should be moved to the root" + ); + + // Verify the exact set of (prefix, namespace) pairs on the root + let sh_prefix_ref = NamedNodeRef::new_unchecked("http://www.w3.org/ns/shacl#prefix"); + let sh_namespace_ref = NamedNodeRef::new_unchecked("http://www.w3.org/ns/shacl#namespace"); + + let mut pairs: HashSet<(String, String)> = HashSet::new(); + for q in ds + .quads_for_predicate(declare_ref) + .filter(|q| q.subject == root) + { + // Follow the declaration node to collect prefix+namespace + if let Some(decl_node) = match q.object { + TermRef::NamedNode(nn) => Some(NamedOrBlankNodeRef::NamedNode(nn)), + TermRef::BlankNode(bn) => Some(NamedOrBlankNodeRef::BlankNode(bn)), + _ => None, + } { + let mut pref: Option = None; + let mut ns: Option = None; + for q2 in ds.quads_for_subject(decl_node) { + if q2.predicate == sh_prefix_ref { + if let TermRef::Literal(l) = q2.object { + pref = Some(l.value().to_string()); + } + } else if q2.predicate == sh_namespace_ref { + match q2.object { + TermRef::NamedNode(nn) => ns = Some(nn.as_str().to_string()), + TermRef::Literal(l) => ns = Some(l.value().to_string()), + _ => {} + } + } + } + if let (Some(p), Some(n)) = (pref, ns) { + pairs.insert((p, n)); + } else { + panic!("Root declaration missing sh:prefix or sh:namespace"); + } + } else { + panic!("sh:declare object was not a named or blank node"); + } + } + + let expected: HashSet<(String, 
String)> = [ + ( + "cmn".to_string(), + "http://example.com/ns/identical#".to_string(), + ), + ("ex".to_string(), "http://example.com/ns/same#".to_string()), + ("ex2".to_string(), "http://example.com/ns/same#".to_string()), + ( + "only1".to_string(), + "http://example.com/ns/only1#".to_string(), + ), + ( + "only2".to_string(), + "http://example.com/ns/only2#".to_string(), + ), + ] + .into_iter() + .collect(); + + assert_eq!(pairs, expected); + } +} diff --git a/lib/src/util.rs b/lib/src/util.rs index 6054dad..2910906 100644 --- a/lib/src/util.rs +++ b/lib/src/util.rs @@ -6,8 +6,6 @@ use anyhow::Result; use std::io::{Read, Seek}; use std::path::Path; -use reqwest::header::CONTENT_TYPE; - use oxigraph::io::{RdfFormat, RdfParser, RdfSerializer}; use oxigraph::model::graph::Graph as OxigraphGraph; use oxigraph::model::Dataset; @@ -15,51 +13,27 @@ use oxigraph::model::{GraphNameRef, Quad, Triple, TripleRef}; use std::io::BufReader; -use log::{debug, error, info}; +use log::{debug, info}; pub fn get_file_contents(path: &Path) -> Result<(Vec, Option)> { let b = std::fs::read(path)?; - let format = path.extension().and_then(|ext| ext.to_str()).and_then(|ext| { - match ext { + let format = path + .extension() + .and_then(|ext| ext.to_str()) + .and_then(|ext| match ext { "ttl" => Some(RdfFormat::Turtle), "xml" => Some(RdfFormat::RdfXml), "n3" => Some(RdfFormat::Turtle), "nt" => Some(RdfFormat::NTriples), _ => None, - } - }); + }); Ok((b, format)) } pub fn get_url_contents(url: &str) -> Result<(Vec, Option)> { - let client = reqwest::blocking::Client::new(); - let resp = client - .get(url) - .header(CONTENT_TYPE, "application/x-turtle") - .send()?; - if !resp.status().is_success() { - error!("Failed to fetch ontology from {} ({})", url, resp.status()); - return Err(anyhow::anyhow!( - "Failed to fetch ontology from {} ({})", - url, - resp.status() - )); - } - let content_type = resp.headers().get("Content-Type"); - let format = - content_type - .and_then(|ct| ct.to_str().ok()) - .and_then(|ext| match ext { - "application/x-turtle" => Some(RdfFormat::Turtle), - "text/turtle" => Some(RdfFormat::Turtle), - "application/rdf+xml" => Some(RdfFormat::RdfXml), - "text/rdf+n3" => Some(RdfFormat::NTriples), - _ => { - debug!("Unknown content type: {ext}"); - None - } - }); - Ok((resp.bytes()?.to_vec(), format)) + let opts = crate::fetch::FetchOptions::default(); + let res = crate::fetch::fetch_rdf(url, &opts)?; + Ok((res.bytes, res.format)) } pub fn write_dataset_to_file(dataset: &Dataset, file: &str) -> Result<()> { @@ -147,35 +121,10 @@ pub fn read_format( pub fn read_url(file: &str) -> Result { debug!("Reading url: {file}"); - - let client = reqwest::blocking::Client::new(); - let resp = client - .get(file) - .header(CONTENT_TYPE, "application/x-turtle") - .send()?; - if !resp.status().is_success() { - error!("Failed to fetch ontology from {} ({})", file, resp.status()); - return Err(anyhow::anyhow!( - "Failed to fetch ontology from {} ({})", - file, - resp.status() - )); - } - let content_type = resp.headers().get("Content-Type"); - let content_type = content_type.and_then(|ct| ct.to_str().ok()); - let content_type = content_type.and_then(|ext| match ext { - "application/x-turtle" => Some(RdfFormat::Turtle), - "text/turtle" => Some(RdfFormat::Turtle), - "application/rdf+xml" => Some(RdfFormat::RdfXml), - "text/rdf+n3" => Some(RdfFormat::NTriples), - _ => { - debug!("Unknown content type: {ext}"); - None - } - }); - - let content: BufReader<_> = BufReader::new(std::io::Cursor::new(resp.bytes()?)); - 
read_format(content, content_type) + let opts = crate::fetch::FetchOptions::default(); + let res = crate::fetch::fetch_rdf(file, &opts)?; + let content: BufReader<_> = BufReader::new(std::io::Cursor::new(res.bytes)); + read_format(content, res.format) } // return a "impl IntoIterator>" for a graph. Iter through @@ -188,4 +137,3 @@ pub fn graph_to_quads<'a>( .into_iter() .map(move |triple| triple.in_graph(graph_name)) } - diff --git a/lib/tests/test_concurrency.rs b/lib/tests/test_concurrency.rs index dfe7255..f53d57b 100644 --- a/lib/tests/test_concurrency.rs +++ b/lib/tests/test_concurrency.rs @@ -7,6 +7,7 @@ use oxigraph::model::NamedNode; use ontoenv::api::{OntoEnv, ResolveTarget}; use ontoenv::ontology::OntologyLocation; +use ontoenv::options::{Overwrite, RefreshStrategy}; use ontoenv::ToUriString; /// Helper to write a small ontology TTL file. @@ -58,13 +59,15 @@ fn init_store_with_two_graphs(root: &Path, a_uri: &str, b_uri: &str) -> (String, let name_a = env .add( OntologyLocation::from_str(a_path.to_str().unwrap()).expect("loc a"), - false, + Overwrite::Preserve, + RefreshStrategy::UseCache, ) .expect("add A"); let name_b = env .add( OntologyLocation::from_str(b_path.to_str().unwrap()).expect("loc b"), - false, + Overwrite::Preserve, + RefreshStrategy::UseCache, ) .expect("add B"); @@ -228,11 +231,23 @@ fn rust_read_write_locking() { let s2 = String::from_utf8_lossy(&o2.stdout); // Ensure we saw one acquire and one lock error (order not guaranteed) - let acquired = s1.contains("worker_rw acquired") as usize + s2.contains("worker_rw acquired") as usize; - let lockerror = s1.contains("worker_rw lockerror") as usize + s2.contains("worker_rw lockerror") as usize; - - assert!(acquired >= 1, "expected at least one acquisition; stdout1: {}, stdout2: {}", s1, s2); - assert!(lockerror >= 1, "expected at least one lock error; stdout1: {}, stdout2: {}", s1, s2); + let acquired = + s1.contains("worker_rw acquired") as usize + s2.contains("worker_rw acquired") as usize; + let lockerror = + s1.contains("worker_rw lockerror") as usize + s2.contains("worker_rw lockerror") as usize; + + assert!( + acquired >= 1, + "expected at least one acquisition; stdout1: {}, stdout2: {}", + s1, + s2 + ); + assert!( + lockerror >= 1, + "expected at least one lock error; stdout1: {}, stdout2: {}", + s1, + s2 + ); // cleanup fs::remove_dir_all(&root).ok(); diff --git a/lib/tests/test_ontoenv.rs b/lib/tests/test_ontoenv.rs index 986fe8b..627bc4b 100644 --- a/lib/tests/test_ontoenv.rs +++ b/lib/tests/test_ontoenv.rs @@ -2,8 +2,12 @@ use anyhow::Result; use ontoenv::api::{OntoEnv, ResolveTarget}; use ontoenv::config::Config; use ontoenv::ontology::OntologyLocation; +use ontoenv::options::{CacheMode, Overwrite, RefreshStrategy}; use oxigraph::model::NamedNodeRef; +use std::fs; use std::path::PathBuf; +use std::thread; +use std::time::Duration; use tempdir::TempDir; // the tests directory contains a number of test files that are used to test the OntoEnv. 
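
With `get_url_contents` and `read_url` now delegating to `fetch::fetch_rdf`, every HTTP retrieval in the crate goes through a single negotiation path. The snippet below is a minimal sketch of driving that path directly; it is not part of this patch's test suite, the URL is a placeholder, and it assumes only the `FetchOptions`/`fetch_rdf`/`FetchResult` API introduced above.

```rust
use ontoenv::fetch::{fetch_rdf, FetchOptions};
use std::time::Duration;

fn main() -> anyhow::Result<()> {
    // Tighten the default 30s timeout; the other knobs (accept_order,
    // extension_candidates) keep their defaults.
    let opts = FetchOptions {
        timeout: Duration::from_secs(10),
        ..FetchOptions::default()
    };
    // fetch_rdf tries content negotiation first, then Link: rel="alternate"
    // targets, then extension candidates derived from the URL.
    let res = fetch_rdf("https://example.com/ontology", &opts)?;
    println!(
        "{} bytes from {} (format {:?}, content-type {:?})",
        res.bytes.len(),
        res.final_url,
        res.format,
        res.content_type
    );
    Ok(())
}
```

Centralizing the fallbacks in one module is what lets `io.rs` and `util.rs` shed their hand-rolled reqwest code above.
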
@@ -72,13 +76,29 @@ fn copy_file(src_path: &PathBuf, dst_path: &PathBuf) -> Result<(), std::io::Erro Ok(()) } +fn cached_env(dir: &TempDir) -> Result { + let config = Config::builder() + .root(dir.path().into()) + .locations(vec![dir.path().into()]) + .includes(&["*.ttl"]) + .excludes(&[] as &[&str]) + .require_ontology_names(false) + .strict(false) + .offline(true) + .temporary(true) + .no_search(true) + .use_cached_ontologies(CacheMode::Enabled) + .build()?; + OntoEnv::init(config, true) +} + fn default_config(dir: &TempDir) -> Config { Config::builder() .root(dir.path().into()) .locations(vec![dir.path().into()]) .includes(&["*.ttl", "*.xml"]) .excludes(&[] as &[&str]) - .strict(true) + .strict(false) .offline(true) .build() .unwrap() @@ -300,7 +320,7 @@ fn test_ontoenv_add() -> Result<()> { .to_str() .ok_or(anyhow::anyhow!("Failed to convert to string"))?, )?; - env.add(loc, true)?; + env.add(loc, Overwrite::Allow, RefreshStrategy::UseCache)?; assert_eq!(env.stats()?.num_graphs, 5); teardown(dir); Ok(()) @@ -627,7 +647,7 @@ fn test_init_read_only() -> Result<()> { // The OntoEnv::add method requires &mut self. // The underlying ReadOnlyPersistentGraphIO::add should return an error. - let add_result = loaded_env.add(location, false); + let add_result = loaded_env.add(location, Overwrite::Preserve, RefreshStrategy::UseCache); assert!(add_result.is_err()); // Check if the error message indicates read-only restriction @@ -635,10 +655,8 @@ fn test_init_read_only() -> Result<()> { // Assuming ReadOnlyPersistentGraphIO::add returns a specific error. // If GraphIO trait doesn't have 'add', this test might need adjustment based on how OntoEnv handles it. // Let's assume GraphIO has 'add' and ReadOnly returns an error like below. - assert!(add_result - .unwrap_err() - .to_string() - .contains("Cannot add to read-only store")); + let err_string = add_result.unwrap_err().to_string(); + assert!(err_string.contains("Cannot add to read-only store")); teardown(dir); Ok(()) @@ -699,7 +717,7 @@ fn test_init_temporary() -> Result<()> { )?; let location = OntologyLocation::File(dummy_ont_path); - let add_result = env.add(location, false); + let add_result = env.add(location, Overwrite::Preserve, RefreshStrategy::UseCache); assert!(add_result.is_ok()); // Should succeed in memory // Verify the ontology was added (in memory) @@ -713,3 +731,139 @@ fn test_init_temporary() -> Result<()> { teardown(dir); Ok(()) } + +#[test] +fn test_cached_add_skips_unchanged_file() -> Result<()> { + let dir = TempDir::new("ontoenv_cached_skip")?; + let ttl_path = dir.path().join("cached.ttl"); + fs::write( + &ttl_path, + " a .", + )?; + + let mut env = cached_env(&dir)?; + let location = OntologyLocation::File(ttl_path.clone()); + let id = env.add( + location.clone(), + Overwrite::Preserve, + RefreshStrategy::UseCache, + )?; + let first_updated = env + .ontologies() + .get(&id) + .and_then(|ont| ont.last_updated.clone()) + .expect("last_updated set"); + assert_eq!(env.stats()?.num_ontologies, 1); + + thread::sleep(Duration::from_secs(1)); + + let reused_id = env.add( + location.clone(), + Overwrite::Preserve, + RefreshStrategy::UseCache, + )?; + let reused_updated = env + .ontologies() + .get(&reused_id) + .and_then(|ont| ont.last_updated.clone()) + .expect("last_updated still set"); + + assert_eq!(id, reused_id); + assert_eq!(first_updated, reused_updated); + assert_eq!(env.stats()?.num_ontologies, 1); + + drop(env); + teardown(dir); + Ok(()) +} + +#[test] +fn test_cached_add_reloads_on_file_change() -> Result<()> { + let 
dir = TempDir::new("ontoenv_cached_reload")?; + let ttl_path = dir.path().join("cached_reload.ttl"); + fs::write( + &ttl_path, + " a .", + )?; + + let mut env = cached_env(&dir)?; + let location = OntologyLocation::File(ttl_path.clone()); + let id = env.add( + location.clone(), + Overwrite::Preserve, + RefreshStrategy::UseCache, + )?; + let first_updated = env + .ontologies() + .get(&id) + .and_then(|ont| ont.last_updated.clone()) + .expect("last_updated set"); + + thread::sleep(Duration::from_secs(1)); + + fs::write( + &ttl_path, + " a .\n \"updated\" .", + )?; + + let refreshed_id = env.add( + location.clone(), + Overwrite::Preserve, + RefreshStrategy::UseCache, + )?; + let refreshed_updated = env + .ontologies() + .get(&refreshed_id) + .and_then(|ont| ont.last_updated.clone()) + .expect("last_updated set after refresh"); + + assert_eq!(id, refreshed_id); + assert!(refreshed_updated > first_updated); + + drop(env); + teardown(dir); + Ok(()) +} + +#[test] +fn test_cached_add_force_refreshes() -> Result<()> { + let dir = TempDir::new("ontoenv_cached_force")?; + let ttl_path = dir.path().join("cached_force.ttl"); + fs::write( + &ttl_path, + " a .", + )?; + + let mut env = cached_env(&dir)?; + let location = OntologyLocation::File(ttl_path.clone()); + let id = env.add( + location.clone(), + Overwrite::Preserve, + RefreshStrategy::UseCache, + )?; + let first_updated = env + .ontologies() + .get(&id) + .and_then(|ont| ont.last_updated.clone()) + .expect("last_updated set"); + + thread::sleep(Duration::from_secs(1)); + + let forced_id = env.add( + location.clone(), + Overwrite::Preserve, + RefreshStrategy::Force, + )?; + let forced_updated = env + .ontologies() + .get(&forced_id) + .and_then(|ont| ont.last_updated.clone()) + .expect("last_updated set after force"); + + assert_eq!(id, forced_id); + assert!(forced_updated > first_updated); + + drop(env); + teardown(dir); + Ok(()) +} diff --git a/python/Cargo.toml b/python/Cargo.toml index 45dd35f..a4ff02d 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -18,6 +18,7 @@ doc = false [dependencies] pyo3 = { version = "0.25", features = ["extension-module"] } ontoenv.workspace = true +ontoenv-cli.workspace = true anyhow.workspace = true oxigraph.workspace = true env_logger.workspace = true diff --git a/python/README.md b/python/README.md index 2767110..602eb5f 100644 --- a/python/README.md +++ b/python/README.md @@ -23,6 +23,11 @@ env = OntoEnv() brick_name = env.add("../brick/Brick.ttl") print(f"Added ontology {brick_name}") +# When you add from a URL whose declared ontology name differs (for example a +# versioned IRI served at a versionless URL), pyontoenv records that alias. You +# can later refer to the ontology by either the canonical name or the original +# URL when resolving imports or querying. + # get the graph of the ontology we just added # env.get_graph returns an rdflib.Graph brick_graph = env.get_graph(brick_name) @@ -50,3 +55,14 @@ g.parse(data=""" env.import_dependencies(g) print(f"Graph with imported dependencies has {len(g)} triples") ``` + +## CLI Entrypoint + +Installing `pyontoenv` also provides the Rust-backed `ontoenv` command-line tool: + +``` +pip install pyontoenv +ontoenv --help +``` + +The CLI is identical to the standalone `ontoenv-cli` binary; see the top-level README for usage. 
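+
+## Resolving by Name or URL
+
+A minimal sketch of the alias behavior described above; the URL and graph
+contents are illustrative placeholders, not a real ontology:
+
+```python
+from ontoenv import OntoEnv
+
+env = OntoEnv(temporary=True, offline=False)
+# Suppose this URL serves an ontology whose declared name is a versioned IRI.
+name = env.add("http://example.org/ont/latest.ttl")
+# The canonical name and the original URL now resolve to the same graph.
+assert len(env.get_graph(name)) == len(env.get_graph("http://example.org/ont/latest.ttl"))
+```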
diff --git a/python/ontoenv/__init__.py b/python/ontoenv/__init__.py new file mode 100644 index 0000000..539c965 --- /dev/null +++ b/python/ontoenv/__init__.py @@ -0,0 +1,13 @@ +"""Python package shim for the ontoenv extension.""" + +# Try both common names to tolerate different build configurations +try: # prefer the extension named 'ontoenv' + from .ontoenv import * # type: ignore[attr-defined] + from . import ontoenv as _ext # type: ignore[attr-defined] +except Exception: # fallback to '_ontoenv' + from ._ontoenv import * # type: ignore[attr-defined] + from . import _ontoenv as _ext # type: ignore[attr-defined] + +__doc__ = getattr(_ext, "__doc__", None) +if hasattr(_ext, "__all__"): + __all__ = _ext.__all__ # type: ignore[attr-defined] diff --git a/python/ontoenv/_cli.py b/python/ontoenv/_cli.py new file mode 100644 index 0000000..6a1c36a --- /dev/null +++ b/python/ontoenv/_cli.py @@ -0,0 +1,12 @@ +from __future__ import annotations + +import sys + +from .ontoenv import run_cli as _run_cli + + +def main(argv: list[str] | None = None) -> int: + code = _run_cli(argv if argv is not None else list(sys.argv)) + if code != 0: + raise SystemExit(code) + return 0 diff --git a/python/pyproject.toml b/python/pyproject.toml index 7143bcd..5e32402 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -13,6 +13,9 @@ authors = [ ] license = "bsd-3-clause" +[project.scripts] +ontoenv = "ontoenv._cli:main" + [tool.maturin] features = ["pyo3/extension-module"] diff --git a/python/src/lib.rs b/python/src/lib.rs index 3f5091c..e9c65dc 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1,11 +1,13 @@ use ::ontoenv::api::{OntoEnv as OntoEnvRs, ResolveTarget}; use ::ontoenv::config; use ::ontoenv::consts::{IMPORTS, ONTOLOGY, TYPE}; -use ::ontoenv::ToUriString; use ::ontoenv::ontology::{Ontology as OntologyRs, OntologyLocation}; +use ::ontoenv::options::{CacheMode, Overwrite, RefreshStrategy}; use ::ontoenv::transform; +use ::ontoenv::ToUriString; use anyhow::Error; -use oxigraph::model::{BlankNode, Literal, NamedNode, SubjectRef, Term}; +use ontoenv_cli; +use oxigraph::model::{BlankNode, Literal, NamedNode, NamedOrBlankNodeRef, Term}; use pyo3::{ prelude::*, types::{IntoPyDict, PyString, PyTuple}, @@ -107,16 +109,23 @@ fn term_to_python<'a>( Term::BlankNode(id) => rdflib .getattr("BNode")? .call1((id.clone().into_string(),))?, - Term::Triple(_) => { - return Err(PyErr::new::( - "Triples are not supported", - )) - } }; Ok(res) } - +/// Run the Rust CLI implementation and return its process-style exit code. 
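+///
+/// A minimal sketch of the intended Python-side call. The subcommand shown is
+/// illustrative; `argv[0]` is treated as the program name, mirroring
+/// `std::env::args`:
+///
+/// ```text
+/// from ontoenv import run_cli
+/// code = run_cli(["ontoenv", "--help"])
+/// ```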
+#[pyfunction] +fn run_cli(py: Python<'_>, args: Option>) -> PyResult { + let argv = args.unwrap_or_else(|| std::env::args().collect()); + let code = py.allow_threads(move || match ontoenv_cli::run_from_args(argv) { + Ok(()) => 0, + Err(err) => { + eprintln!("{err}"); + 1 + } + }); + Ok(code) +} #[pyclass(name = "Ontology")] #[derive(Clone)] @@ -172,10 +181,7 @@ impl PyOntology { } fn __repr__(&self) -> PyResult { - Ok(format!( - "", - self.inner.name().to_uri_string() - )) + Ok(format!("", self.inner.name().to_uri_string())) } } @@ -187,7 +193,7 @@ struct OntoEnv { #[pymethods] impl OntoEnv { #[new] - #[pyo3(signature = (path=None, recreate=false, read_only=false, search_directories=None, require_ontology_names=false, strict=false, offline=false, resolution_policy="default".to_owned(), root=".".to_owned(), includes=None, excludes=None, temporary=false, no_search=false))] + #[pyo3(signature = (path=None, recreate=false, read_only=false, search_directories=None, require_ontology_names=false, strict=false, offline=false, use_cached_ontologies=false, resolution_policy="default".to_owned(), root=".".to_owned(), includes=None, excludes=None, temporary=false, no_search=false))] fn new( _py: Python, path: Option, @@ -197,6 +203,7 @@ impl OntoEnv { require_ontology_names: bool, strict: bool, offline: bool, + use_cached_ontologies: bool, resolution_policy: String, root: String, includes: Option>, @@ -206,11 +213,17 @@ impl OntoEnv { ) -> PyResult { let root_path = path.clone().unwrap_or_else(|| PathBuf::from(root)); + // Strict Git-like behavior: + // - temporary=True: create a temporary (in-memory) env + // - recreate=True: create (or overwrite) an env at root_path + // - otherwise: discover upward; if not found, error + let mut builder = config::Config::builder() .root(root_path.clone()) .require_ontology_names(require_ontology_names) .strict(strict) .offline(offline) + .use_cached_ontologies(CacheMode::from(use_cached_ontologies)) .resolution_policy(resolution_policy) .temporary(temporary) .no_search(no_search); @@ -230,10 +243,24 @@ impl OntoEnv { .build() .map_err(|e| PyErr::new::(e.to_string()))?; - let env = if !cfg.temporary && !recreate && root_path.join(".ontoenv").exists() { - OntoEnvRs::load_from_directory(root_path, read_only).map_err(anyhow_to_pyerr)? + let env = if cfg.temporary { + // Explicit in-memory env + OntoEnvRs::init(cfg, false).map_err(anyhow_to_pyerr)? + } else if recreate { + // Explicit create/overwrite at root_path + OntoEnvRs::init(cfg, true).map_err(anyhow_to_pyerr)? } else { - OntoEnvRs::init(cfg, recreate).map_err(anyhow_to_pyerr)? + // Discover upward from root_path; load if found, else error. + match ::ontoenv::api::find_ontoenv_root_from(&root_path) { + Some(found_root) => OntoEnvRs::load_from_directory(found_root, read_only) + .map_err(anyhow_to_pyerr)?, + None => { + return Err(PyErr::new::(format!( + "OntoEnv directory not found at: \"{}\"", + root_path.join(".ontoenv").to_string_lossy() + ))); + } + } }; let inner = Arc::new(Mutex::new(Some(env))); @@ -287,9 +314,9 @@ impl OntoEnv { ) -> PyResult<()> { let inner = self.inner.clone(); let mut guard = inner.lock().unwrap(); - let env = guard.as_mut().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_mut() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let rdflib = py.import("rdflib")?; let iri = NamedNode::new(uri) .map_err(|e| PyErr::new::(e.to_string()))?; @@ -310,7 +337,7 @@ impl OntoEnv { if !result.is_none() { let ontology = NamedNode::new(result.extract::()?) 
.map_err(|e| PyErr::new::(e.to_string()))?; - let base_ontology: SubjectRef = SubjectRef::NamedNode(ontology.as_ref()); + let base_ontology = NamedOrBlankNodeRef::NamedNode(ontology.as_ref()); transform::rewrite_sh_prefixes_graph(&mut graph, base_ontology); transform::remove_ontology_declarations_graph(&mut graph, base_ontology); @@ -347,9 +374,9 @@ impl OntoEnv { .map_err(|e| PyErr::new::(e.to_string()))?; let inner = self.inner.clone(); let mut guard = inner.lock().unwrap(); - let env = guard.as_mut().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_mut() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let graphid = env .resolve(ResolveTarget::Graph(iri.clone())) .ok_or_else(|| { @@ -367,9 +394,12 @@ impl OntoEnv { Ok(names) } - /// Merge all graphs in the imports closure of the given ontology into a single graph. If - /// destination_graph is provided, add the merged graph to the destination_graph. If not, - /// return the merged graph. + /// Merge the imports closure of `uri` into a single graph and return it alongside the closure list. + /// + /// The first element of the returned tuple is either the provided `destination_graph` (after + /// mutation) or a brand-new `rdflib.Graph`. The second element is an ordered list of ontology + /// IRIs in the resolved closure starting with `uri`. Set `rewrite_sh_prefixes` or + /// `remove_owl_imports` to control post-processing of the merged triples. #[pyo3(signature = (uri, destination_graph=None, rewrite_sh_prefixes=true, remove_owl_imports=true, recursion_depth=-1))] fn get_closure<'a>( &self, @@ -385,15 +415,13 @@ impl OntoEnv { .map_err(|e| PyErr::new::(e.to_string()))?; let inner = self.inner.clone(); let mut guard = inner.lock().unwrap(); - let env = guard.as_mut().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_mut() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let graphid = env .resolve(ResolveTarget::Graph(iri.clone())) .ok_or_else(|| { - PyErr::new::(format!( - "No graph with URI: {uri}" - )) + PyErr::new::(format!("No graph with URI: {uri}")) })?; let ont = env.ontologies().get(&graphid).ok_or_else(|| { PyErr::new::(format!("Ontology {iri} not found")) @@ -459,8 +487,12 @@ impl OntoEnv { } } - /// Import the dependencies of the given graph into the graph. Removes the owl:imports - /// of all imported ontologies. + /// Import the dependencies referenced by `owl:imports` triples in `graph`. + /// + /// When `fetch_missing` is true, the environment attempts to download unresolved imports + /// before computing the closure. After merging the closure triples into `graph`, all + /// `owl:imports` statements are removed. The returned list contains the deduplicated ontology + /// IRIs that were successfully imported. 
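+    ///
+    /// A minimal Python sketch (the imported IRI is a hypothetical placeholder):
+    ///
+    /// ```text
+    /// g = rdflib.Graph()
+    /// g.add((URIRef("urn:ex"), OWL.imports, URIRef("http://example.org/ont/A")))
+    /// imported = env.import_dependencies(g, fetch_missing=True)
+    /// ```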
#[pyo3(signature = (graph, recursion_depth=-1, fetch_missing=false))] fn import_dependencies<'a>( &self, @@ -484,9 +516,9 @@ impl OntoEnv { let inner = self.inner.clone(); let mut guard = inner.lock().unwrap(); - let env = guard.as_mut().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_mut() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let is_strict = env.is_strict(); let mut all_ontologies = HashSet::new(); @@ -499,9 +531,8 @@ impl OntoEnv { let mut graphid = env.resolve(ResolveTarget::Graph(iri.clone())); if graphid.is_none() && fetch_missing { - let location = - OntologyLocation::from_str(uri.as_str()).map_err(anyhow_to_pyerr)?; - match env.add(location, false) { + let location = OntologyLocation::from_str(uri.as_str()).map_err(anyhow_to_pyerr)?; + match env.add(location, Overwrite::Preserve, RefreshStrategy::UseCache) { Ok(new_id) => { graphid = Some(new_id); } @@ -569,8 +600,10 @@ impl OntoEnv { // Remove all owl:imports from the original graph let py_imports_pred_for_remove = term_to_python(py, &rdflib, IMPORTS.into())?; - let remove_tuple = - PyTuple::new(py, &[py.None(), py_imports_pred_for_remove.into(), py.None()])?; + let remove_tuple = PyTuple::new( + py, + &[py.None(), py_imports_pred_for_remove.into(), py.None()], + )?; graph.getattr("remove")?.call1((remove_tuple,))?; all_closure_names.sort(); @@ -584,7 +617,8 @@ impl OntoEnv { /// This method will look for `owl:imports` statements in the provided `graph`, /// then find those ontologies within the `OntoEnv` and compute the full /// dependency closure. The triples of all ontologies in the closure are - /// returned as a new graph. The original graph is not modified. + /// returned as a new graph. The original `graph` is left untouched unless you also + /// supply it as the `destination_graph`. /// /// Args: /// graph (rdflib.Graph): The graph to find dependencies for. @@ -598,8 +632,8 @@ impl OntoEnv { /// returned graph. /// /// Returns: - /// tuple[rdflib.Graph, list[str]]: A tuple containing the graph of dependencies and a list of the URIs of the - /// imported ontologies. + /// tuple[rdflib.Graph, list[str]]: A tuple containing the populated dependency graph and the sorted list of + /// imported ontology IRIs. 
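+    ///
+    /// A minimal Python sketch (assumes `g` already holds `owl:imports`
+    /// statements, as in the example above):
+    ///
+    /// ```text
+    /// deps, names = env.get_dependencies_graph(g, fetch_missing=True)
+    /// print(len(deps), names)
+    /// ```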
#[pyo3(signature = (graph, destination_graph=None, recursion_depth=-1, fetch_missing=false, rewrite_sh_prefixes=true, remove_owl_imports=true))] fn get_dependencies_graph<'a>( &self, @@ -631,9 +665,9 @@ impl OntoEnv { let inner = self.inner.clone(); let mut guard = inner.lock().unwrap(); - let env = guard.as_mut().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_mut() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let is_strict = env.is_strict(); let mut all_ontologies = HashSet::new(); @@ -646,9 +680,8 @@ impl OntoEnv { let mut graphid = env.resolve(ResolveTarget::Graph(iri.clone())); if graphid.is_none() && fetch_missing { - let location = - OntologyLocation::from_str(uri.as_str()).map_err(anyhow_to_pyerr)?; - match env.add(location, false) { + let location = OntologyLocation::from_str(uri.as_str()).map_err(anyhow_to_pyerr)?; + match env.add(location, Overwrite::Preserve, RefreshStrategy::UseCache) { Ok(new_id) => { graphid = Some(new_id); } @@ -736,56 +769,65 @@ impl OntoEnv { } /// Add a new ontology to the OntoEnv - #[pyo3(signature = (location, overwrite = false, fetch_imports = true))] + #[pyo3(signature = (location, overwrite = false, fetch_imports = true, force = false))] fn add( &self, location: &Bound<'_, PyAny>, overwrite: bool, fetch_imports: bool, + force: bool, ) -> PyResult { let inner = self.inner.clone(); let mut guard = inner.lock().unwrap(); - let env = guard.as_mut().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_mut() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let location = OntologyLocation::from_str(&location.to_string()).map_err(anyhow_to_pyerr)?; + let overwrite_flag: Overwrite = overwrite.into(); + let refresh: RefreshStrategy = force.into(); let graph_id = if fetch_imports { - env.add(location, overwrite) + env.add(location, overwrite_flag, refresh) } else { - env.add_no_imports(location, overwrite) + env.add_no_imports(location, overwrite_flag, refresh) } .map_err(anyhow_to_pyerr)?; Ok(graph_id.to_uri_string()) } /// Add a new ontology to the OntoEnv without exploring owl:imports. 
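+    ///
+    /// A minimal Python sketch of the new `force` flag (the path is
+    /// illustrative, and cache reuse assumes `use_cached_ontologies=True`):
+    ///
+    /// ```text
+    /// env.add_no_imports("Brick.ttl")              # skipped if file unchanged
+    /// env.add_no_imports("Brick.ttl", force=True)  # always re-parsed
+    /// ```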
- #[pyo3(signature = (location, overwrite = false))] - fn add_no_imports(&self, location: &Bound<'_, PyAny>, overwrite: bool) -> PyResult { + #[pyo3(signature = (location, overwrite = false, force = false))] + fn add_no_imports( + &self, + location: &Bound<'_, PyAny>, + overwrite: bool, + force: bool, + ) -> PyResult { let inner = self.inner.clone(); let mut guard = inner.lock().unwrap(); - let env = guard.as_mut().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_mut() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let location = OntologyLocation::from_str(&location.to_string()).map_err(anyhow_to_pyerr)?; + let overwrite_flag: Overwrite = overwrite.into(); + let refresh: RefreshStrategy = force.into(); let graph_id = env - .add_no_imports(location, overwrite) + .add_no_imports(location, overwrite_flag, refresh) .map_err(anyhow_to_pyerr)?; Ok(graph_id.to_uri_string()) } - /// Get the names of all ontologies that import the given ontology fn get_importers(&self, uri: &str) -> PyResult> { let iri = NamedNode::new(uri) .map_err(|e| PyErr::new::(e.to_string()))?; let inner = self.inner.clone(); let guard = inner.lock().unwrap(); - let env = guard.as_ref().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_ref() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let importers = env.get_importers(&iri).map_err(anyhow_to_pyerr)?; let names: Vec = importers.iter().map(|ont| ont.to_uri_string()).collect(); Ok(names) @@ -797,9 +839,9 @@ impl OntoEnv { .map_err(|e| PyErr::new::(e.to_string()))?; let inner = self.inner.clone(); let guard = inner.lock().unwrap(); - let env = guard.as_ref().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_ref() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let graphid = env .resolve(ResolveTarget::Graph(iri.clone())) .ok_or_else(|| { @@ -822,13 +864,11 @@ impl OntoEnv { let env = guard.as_ref().ok_or_else(|| { PyErr::new::("OntoEnv is closed") })?; - let graphid = env - .resolve(ResolveTarget::Graph(iri)) - .ok_or_else(|| { - PyErr::new::(format!( - "Failed to resolve graph for URI: {uri}" - )) - })?; + let graphid = env.resolve(ResolveTarget::Graph(iri)).ok_or_else(|| { + PyErr::new::(format!( + "Failed to resolve graph for URI: {uri}" + )) + })?; env.get_graph(&graphid).map_err(anyhow_to_pyerr)? 
}; @@ -856,34 +896,49 @@ impl OntoEnv { fn get_ontology_names(&self) -> PyResult> { let inner = self.inner.clone(); let guard = inner.lock().unwrap(); - let env = guard.as_ref().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; - let names: Vec = env - .ontologies() - .keys() - .map(|k| k.to_uri_string()) - .collect(); + let env = guard + .as_ref() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; + let names: Vec = env.ontologies().keys().map(|k| k.to_uri_string()).collect(); Ok(names) } - /// Convert the OntoEnv to an rdflib.Dataset + /// Convert the OntoEnv to an in-memory rdflib.Dataset populated with all named graphs fn to_rdflib_dataset(&self, py: Python) -> PyResult> { - // rdflib.ConjunctiveGraph(store="Oxigraph") let inner = self.inner.clone(); let guard = inner.lock().unwrap(); - let env = guard.as_ref().ok_or_else(|| { - PyErr::new::("OntoEnv is closed") - })?; + let env = guard + .as_ref() + .ok_or_else(|| PyErr::new::("OntoEnv is closed"))?; let rdflib = py.import("rdflib")?; - let dataset = rdflib.getattr("Dataset")?; - - // call Dataset(store="Oxigraph") - let kwargs = [("store", "Oxigraph")].into_py_dict(py)?; - let store = dataset.call((), Some(&kwargs))?; - let path = env.store_path().unwrap(); - store.getattr("open")?.call1((path,))?; - Ok(store.into()) + let dataset_cls = rdflib.getattr("Dataset")?; + let ds = dataset_cls.call0()?; + let uriref = rdflib.getattr("URIRef")?; + + for (_gid, ont) in env.ontologies().iter() { + let id_str = ont.id().name().as_str(); + let id_py = uriref.call1((id_str,))?; + let kwargs = [("identifier", id_py.clone())].into_py_dict(py)?; + let ctx = ds.getattr("graph")?.call((), Some(&kwargs))?; + + let graph = env.get_graph(ont.id()).map_err(anyhow_to_pyerr)?; + for t in graph.iter() { + let s: Term = t.subject.into(); + let p: Term = t.predicate.into(); + let o: Term = t.object.into(); + let triple = PyTuple::new( + py, + &[ + term_to_python(py, &rdflib, s)?, + term_to_python(py, &rdflib, p)?, + term_to_python(py, &rdflib, o)?, + ], + )?; + ctx.getattr("add")?.call1((triple,))?; + } + } + + Ok(ds.into()) } // Config accessors @@ -1017,7 +1072,10 @@ impl OntoEnv { let guard = inner.lock().unwrap(); if let Some(env) = guard.as_ref() { match env.store_path() { - Some(path) => Ok(Some(path.to_string_lossy().to_string())), + Some(path) => { + let dir = path.parent().unwrap_or(path); + Ok(Some(dir.to_string_lossy().to_string())) + } None => Ok(None), // Return None if the path doesn't exist (e.g., temporary env) } } else { @@ -1066,6 +1124,7 @@ fn ontoenv(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_function(wrap_pyfunction!(run_cli, m)?)?; // add version attribute m.add("version", env!("CARGO_PKG_VERSION"))?; Ok(()) diff --git a/python/src/ontoenv.pyi b/python/src/ontoenv.pyi index 7dec9f7..567d680 100644 --- a/python/src/ontoenv.pyi +++ b/python/src/ontoenv.pyi @@ -1,25 +1,46 @@ from pathlib import Path -from typing import Optional, List, Union, Any +from typing import Optional, List, Union, Tuple from rdflib import Graph, Dataset -class Config: - """ - Configuration class for setting up the ontology environment. - - Attributes: - search_directories: Optional list of directories to search for ontologies. - require_ontology_names: Flag to require ontology names. - strict: Flag for strict mode. - offline: Flag to operate in offline mode. - resolution_policy: Policy for resolving ontologies. - root: Root directory for the environment. 
- includes: Optional list of patterns to include. - excludes: Optional list of patterns to exclude. - temporary: Flag to create a temporary environment. - no_search: Flag to disable searching for ontologies in local directories. - """ +# Exposed module metadata +version: str + + +class Ontology: + """Read-only view of ontology metadata.""" + + @property + def id(self) -> str: ... + + @property + def name(self) -> str: ... + + @property + def imports(self) -> List[str]: ... + + @property + def location(self) -> Optional[str]: ... + + @property + def last_updated(self) -> Optional[str]: ... + + @property + def version_properties(self) -> dict[str, str]: ... + + @property + def namespace_map(self) -> dict[str, str]: ... + + def __repr__(self) -> str: ... + + +class OntoEnv: + """Ontology environment for managing ontologies and graphs.""" + def __init__( self, + path: Optional[Union[str, Path]] = None, + recreate: bool = False, + read_only: bool = False, search_directories: Optional[List[str]] = None, require_ontology_names: bool = False, strict: bool = False, @@ -30,282 +51,71 @@ class Config: excludes: Optional[List[str]] = None, temporary: bool = False, no_search: bool = False, - ) -> None: - """ - Initialize the Config object with the given parameters. - """ - ... + ) -> None: ... -class OntoEnv: - """ - Ontology Environment class for managing ontologies. - - Attributes: - config: Optional configuration object. - path: Path to the ontology environment. - recreate: Flag to recreate the environment. - read_only: Flag to set the environment as read-only. - """ - def __init__( - self, - config: Optional[Config] = None, - path: Optional[Union[str, Path]] = None, - recreate: bool = False, - read_only: bool = False, - ) -> None: - """ - Initialize the OntoEnv object with the given parameters. - """ - ... - - def update(self) -> None: - """ - Update the ontology environment by reloading all ontologies. - """ - ... - - def __repr__(self) -> str: - """ - Return a string representation of the OntoEnv object. - """ - ... - - def import_graph(self, destination_graph: Any, uri: str) -> None: - """ - Import a graph from the given URI into the destination graph. - - Args: - destination_graph: The graph to import into. - uri: The URI of the graph to import. - """ - ... - - def list_closure(self, uri: str, recursion_depth: int = -1) -> List[str]: - """ - List the ontologies in the imports closure of the given ontology. - - Args: - uri: The URI of the ontology. - recursion_depth: The maximum depth for recursive import resolution. - Returns: - A list of ontology names in the closure. - """ - ... + def __repr__(self) -> str: ... + + def update(self, all: bool = False) -> None: ... + + def add(self, location: Union[str, Path], overwrite: bool = False, fetch_imports: bool = True) -> str: ... + + def add_no_imports(self, location: Union[str, Path], overwrite: bool = False) -> str: ... + + def get_graph(self, uri: str) -> Graph: ... + + def get_ontology(self, uri: str) -> Ontology: ... + + def get_ontology_names(self) -> List[str]: ... + + def get_importers(self, uri: str) -> List[str]: ... + + def list_closure(self, uri: str, recursion_depth: int = -1) -> List[str]: ... def get_closure( self, uri: str, - destination_graph: Optional[Any] = None, + destination_graph: Optional[Graph] = None, rewrite_sh_prefixes: bool = True, remove_owl_imports: bool = True, recursion_depth: int = -1, - ) -> tuple[Any, List[str]]: - """ - Merge all graphs in the imports closure of the given ontology into a single graph. 
- - Args: - uri: The URI of the ontology. - destination_graph: Optional graph to add the merged graph to. - rewrite_sh_prefixes: Flag to rewrite SH prefixes. - remove_owl_imports: Flag to remove OWL imports. - recursion_depth: The maximum depth for recursive import resolution. - Returns: - A tuple containing the merged graph and a list of ontology names in the closure. - """ - ... - - def dump(self, includes: Optional[str] = None) -> None: - """ - Print the contents of the OntoEnv. - - Args: - includes: Optional string to filter the output. - """ - ... - - def import_dependencies(self, graph: Any, recursion_depth: int = -1, fetch_missing: bool = False) -> List[str]: - """ - Import the dependencies of the given graph into the graph. - - Args: - graph: The graph to import dependencies into. - recursion_depth: The maximum depth for recursive import resolution. - fetch_missing: If True, will fetch ontologies that are not in the environment. - Returns: - A list of imported ontology names. - """ - ... + ) -> Tuple[Graph, List[str]]: ... + + def import_graph(self, destination_graph: Graph, uri: str) -> None: ... + + def import_dependencies(self, graph: Graph, recursion_depth: int = -1, fetch_missing: bool = False) -> List[str]: ... def get_dependencies_graph( self, - graph: Any, - destination_graph: Optional[Any] = None, + graph: Graph, + destination_graph: Optional[Graph] = None, recursion_depth: int = -1, fetch_missing: bool = False, rewrite_sh_prefixes: bool = True, remove_owl_imports: bool = True, - ) -> tuple[Any, List[str]]: - """ - Get the dependency closure of a given graph and return it as a new graph. - - This method will look for `owl:imports` statements in the provided `graph`, - then find those ontologies within the `OntoEnv` and compute the full - dependency closure. The triples of all ontologies in the closure are - returned as a new graph. The original graph is not modified. - - Args: - graph: The graph to find dependencies for. - destination_graph: If provided, the dependency graph will be added to this - graph instead of creating a new one. - recursion_depth: The maximum depth for recursive import resolution. A - negative value (default) means no limit. - fetch_missing: If True, will fetch ontologies that are not in the environment. - rewrite_sh_prefixes: If True, will rewrite SHACL prefixes to be unique. - remove_owl_imports: If True, will remove `owl:imports` statements from the - returned graph. - - Returns: - A tuple containing the graph of dependencies and a list of the URIs of the - imported ontologies. - """ - ... - - def add(self, location: Any, overwrite: bool = False, fetch_imports: bool = True) -> str: - """ - Add a new ontology to the OntoEnv. - - Args: - location: The location of the ontology to add (file path or URL). - overwrite: If True, will overwrite an existing ontology at the same location. - fetch_imports: If True, will recursively fetch missing owl:imports. - Returns: - The URI string of the added ontology. - """ - ... - - def add_no_imports(self, location: Any) -> str: - """ - Add a new ontology to the OntoEnv without exploring owl:imports. - - Args: - location: The location of the ontology to add (file path, URL, or rdflib.Graph). - Returns: - The URI string of the added ontology. - """ - ... - - def get_importers(self, uri: str) -> List[str]: - """ - Get the names of all ontologies that import the given ontology. - - Args: - uri: The URI of the ontology. - Returns: - A list of ontology names that import the given ontology. - """ - ... 
- - def get_graph(self, uri: str) -> Graph: - """ - Get the graph with the given URI as an rdflib.Graph. - - Args: - uri: The URI of the graph to get. - Returns: - An rdflib.Graph object representing the requested graph. - """ - ... - - def get_ontology_names(self) -> List[str]: - """ - Get the names of all ontologies in the OntoEnv. - - Returns: - A list of ontology names. - """ - ... - - def to_rdflib_dataset(self) -> Dataset: - """ - Convert the OntoEnv to an rdflib.Dataset. - """ - ... - - # Config accessors - def is_offline(self) -> bool: - """ - Checks if the environment is in offline mode. - """ - ... - - def set_offline(self, offline: bool) -> None: - """ - Sets the offline mode for the environment. - """ - ... - - def is_strict(self) -> bool: - """ - Checks if the environment is in strict mode. - """ - ... - - def set_strict(self, strict: bool) -> None: - """ - Sets the strict mode for the environment. - """ - ... - - def requires_ontology_names(self) -> bool: - """ - Checks if the environment requires unique ontology names. - """ - ... - - def set_require_ontology_names(self, require: bool) -> None: - """ - Sets whether the environment requires unique ontology names. - """ - ... - - def no_search(self) -> bool: - """ - Checks if the environment disables local file search. - """ - ... - - def set_no_search(self, no_search: bool) -> None: - """ - Sets whether the environment disables local file search. - """ - ... - - def resolution_policy(self) -> str: - """ - Returns the current resolution policy. - """ - ... - - def set_resolution_policy(self, policy: str) -> None: - """ - Sets the resolution policy for the environment. - """ - ... - - def store_path(self) -> Optional[str]: - """ - Returns the path to the underlying graph store, if applicable. - """ - ... - - def close(self) -> None: - """ - Closes the ontology environment, saving changes and flushing the store. - """ - ... - - def flush(self) -> None: - """ - Flushes any pending writes to the underlying graph store. - """ - ... + ) -> Tuple[Graph, List[str]]: ... + + def to_rdflib_dataset(self) -> Dataset: ... + + def dump(self, includes: Optional[str] = None) -> None: ... + + # Configuration accessors + def is_offline(self) -> bool: ... + def set_offline(self, offline: bool) -> None: ... + + def is_strict(self) -> bool: ... + def set_strict(self, strict: bool) -> None: ... + + def requires_ontology_names(self) -> bool: ... + def set_require_ontology_names(self, require: bool) -> None: ... + + def no_search(self) -> bool: ... + def set_no_search(self, no_search: bool) -> None: ... + + def resolution_policy(self) -> str: ... + def set_resolution_policy(self, policy: str) -> None: ... + + def store_path(self) -> Optional[str]: ... + + def flush(self) -> None: ... + def close(self) -> None: ... 
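+
+# Illustrative sketch of the typed API above (names are placeholders):
+#
+#   env = OntoEnv(path="env_dir", recreate=True)
+#   name = env.add("model.ttl")
+#   ont = env.get_ontology(name)
+#   graph, closure = env.get_closure(name)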
+ diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/test_concurrency.py b/python/tests/test_concurrency.py similarity index 70% rename from python/test_concurrency.py rename to python/tests/test_concurrency.py index eb2153b..38e43f2 100644 --- a/python/test_concurrency.py +++ b/python/tests/test_concurrency.py @@ -52,6 +52,42 @@ def _rw_open_worker(path_str, graph_uri, result_queue): result_queue.put(("error", graph_uri, str(e))) +def _writer_hold_worker(path_str, hold_secs, graph_uri, result_queue): + try: + import time + from pathlib import Path + from ontoenv import OntoEnv + from rdflib import URIRef + from rdflib.namespace import RDF, OWL + + env = OntoEnv(path=Path(path_str)) + # Touch a known graph to ensure the store is usable + g = env.get_graph(graph_uri) + ok_graph = (URIRef(graph_uri), RDF.type, OWL.Ontology) in g and len(g) > 0 + + # Hold the exclusive writer lock for a bit + time.sleep(hold_secs) + env.close() + result_queue.put(("released", ok_graph)) + except Exception as e: + result_queue.put(("error", str(e))) + + +def _ro_open_get_graph_worker(path_str, graph_uri, result_queue): + try: + from pathlib import Path + from ontoenv import OntoEnv + from rdflib import URIRef + from rdflib.namespace import RDF, OWL + + env = OntoEnv(path=Path(path_str), read_only=True) + g = env.get_graph(graph_uri) + ok = (URIRef(graph_uri), RDF.type, OWL.Ontology) in g and len(g) > 0 + env.close() + result_queue.put(("ok", ok)) + except Exception as e: + result_queue.put(("error", str(e))) + class TestOntoEnvReadOnlyConcurrency(unittest.TestCase): def setUp(self): self.test_dir = Path("test_env_ro") @@ -89,7 +125,7 @@ def test_concurrent_read_only_open_same_store(self): ) # Create the store and add ontologies (single writer) - env = OntoEnv(path=self.test_dir) + env = OntoEnv(path=self.test_dir, recreate=True) name_a = env.add(str(a_path), fetch_imports=False) name_b = env.add(str(b_path), fetch_imports=False) self.assertEqual(name_a, a_uri) @@ -160,7 +196,7 @@ def test_concurrent_open_same_store(self): encoding="utf-8", ) - env = OntoEnv(path=self.test_dir) + env = OntoEnv(path=self.test_dir, recreate=True) name_a = env.add(str(a_path), fetch_imports=False) name_b = env.add(str(b_path), fetch_imports=False) self.assertEqual(name_a, a_uri) @@ -207,6 +243,53 @@ def test_concurrent_open_same_store(self): msg=f"Unexpected error message: {err_msg}", ) + def test_reader_waits_for_writer_then_reads(self): + """A read-only open should wait while a writer holds the exclusive lock, then succeed.""" + a_path = self.test_dir / "A.ttl" + a_uri = "http://example.org/ont/A" + a_path.write_text( + "@prefix rdf: .\n" + "@prefix owl: .\n" + f"<{a_uri}> a owl:Ontology .\n", + encoding="utf-8", + ) + env = OntoEnv(path=self.test_dir, recreate=True) + env.add(str(a_path), fetch_imports=False) + env.flush() + env.close() + + ctx = multiprocessing.get_context("spawn") + q = ctx.Queue() + hold_secs = 1.0 + writer = ctx.Process(target=_writer_hold_worker, args=(str(self.test_dir), hold_secs, a_uri, q)) + reader = ctx.Process(target=_ro_open_get_graph_worker, args=(str(self.test_dir), a_uri, q)) + + writer.start() + import time + time.sleep(0.15) # ensure writer started and holds the lock + t0 = time.time() + reader.start() + + # First result should be from reader after writer releases + r1 = q.get(timeout=30) + r2 = q.get(timeout=30) + + writer.join(timeout=30) + reader.join(timeout=30) + + self.assertFalse(writer.is_alive()) + 
self.assertFalse(reader.is_alive()) + self.assertEqual(writer.exitcode, 0) + self.assertEqual(reader.exitcode, 0) + + elapsed = time.time() - t0 + # Reader should have waited roughly the hold duration (minus start skew) + self.assertGreaterEqual(elapsed, 0.7) + + results = {r1[0], r2[0]} + self.assertIn("released", results) + self.assertIn("ok", results) + if __name__ == "__main__": unittest.main() diff --git a/python/tests/test_ontoenv.py b/python/tests/test_ontoenv.py deleted file mode 100644 index e7c2545..0000000 --- a/python/tests/test_ontoenv.py +++ /dev/null @@ -1,154 +0,0 @@ -import pytest -import ontoenv -from ontoenv import OntoEnv, Config -import pathlib -import shutil -import os - - -# Fixture to create a temporary directory for each test -@pytest.fixture -def temp_dir(tmp_path): - """Provides a temporary directory path for tests.""" - yield tmp_path - # Cleanup happens automatically via pytest's tmp_path fixture - - -# Fixture to create a temporary directory with a pre-initialized OntoEnv -@pytest.fixture -def existing_env_dir(tmp_path): - """Provides a temporary directory path with an initialized OntoEnv.""" - env_path = tmp_path / "existing_env" - env_path.mkdir() - # Use temporary=False explicitly if needed, ensure root is set - cfg = Config(root=str(env_path), temporary=False) - env = OntoEnv(config=cfg, path=env_path, recreate=True) - # Add a dummy file to ensure the env is not empty if needed later - # For now, just initializing is enough to create the .ontoenv structure - env.flush() # Ensure data is written if not temporary - del env - yield env_path - # Cleanup happens automatically via pytest's tmp_path fixture - - -def test_init_with_config_new_dir(temp_dir): - """Test initializing OntoEnv with a Config in a new directory.""" - env_path = temp_dir / "new_env" - # Ensure the directory does not exist initially - assert not env_path.exists() - cfg = Config(root=str(env_path), temporary=False) - env = OntoEnv(config=cfg, path=env_path, recreate=True) - assert (env_path / ".ontoenv").is_dir() - assert ( - env.store_path() is not None - ) # Assuming store_path handles non-temporary envs - - -def test_init_with_config_existing_empty_dir(temp_dir): - """Test initializing OntoEnv with a Config in an existing empty directory.""" - env_path = temp_dir / "empty_env" - env_path.mkdir() - assert env_path.is_dir() - cfg = Config(root=str(env_path), temporary=False) - env = OntoEnv(config=cfg, path=env_path, recreate=True) - assert (env_path / ".ontoenv").is_dir() - assert env.store_path() is not None - - -def test_init_load_from_existing_dir(existing_env_dir): - """Test initializing OntoEnv by loading from an existing directory.""" - assert (existing_env_dir / ".ontoenv").is_dir() - # Initialize by path only, should load existing - env = OntoEnv(path=existing_env_dir, read_only=False) - # Simple check: does it have a store path? 
- assert env.store_path() == str(existing_env_dir / ".ontoenv" / "store.db") - # Add more checks if the fixture pre-populates data - - -def test_init_recreate_existing_dir(existing_env_dir): - """Test initializing OntoEnv with recreate=True on an existing directory.""" - assert (existing_env_dir / ".ontoenv").is_dir() - # Optionally: Add a dummy file inside .ontoenv to check if it gets wiped - (existing_env_dir / ".ontoenv" / "dummy.txt").touch() - assert (existing_env_dir / ".ontoenv" / "dummy.txt").exists() - - # Recreate the environment - cfg = Config(root=str(existing_env_dir), temporary=False) - env = OntoEnv(config=cfg, path=existing_env_dir, recreate=True) - - assert (existing_env_dir / ".ontoenv").is_dir() - # Check if the dummy file is gone (or check if ontology list is empty) - assert not (existing_env_dir / ".ontoenv" / "dummy.txt").exists() - assert len(env.get_ontology_names()) == 0 - - -# Note: This test assumes add() raises an error for read-only mode. -# The Rust ReadOnlyPersistentGraphIO::add returns Err, which should map to PyErr. -def test_init_read_only(existing_env_dir): - """Test initializing OntoEnv with read_only=True.""" - env = OntoEnv(path=existing_env_dir, read_only=True) - assert (existing_env_dir / ".ontoenv").is_dir() - - # Attempting to modify should fail - with pytest.raises(ValueError, match="Cannot add to read-only store"): - # Use a dummy file path or URL - env.add("file:///dummy.ttl") - - -def test_init_no_config_no_path_error(): - """Test initializing OntoEnv without config or valid path fails.""" - # Assuming current dir '.' does not contain a valid .ontoenv - # Clean up potential leftover .ontoenv in cwd just in case - if os.path.exists(".ontoenv"): - if os.path.isfile(".ontoenv"): - os.remove(".ontoenv") - else: - shutil.rmtree(".ontoenv") - - # Expecting failure because '.' likely doesn't contain a valid .ontoenv - with pytest.raises(ValueError, match="OntoEnv directory not found at: \"./.ontoenv\""): - OntoEnv() # No args - - -def test_init_path_no_env_error(temp_dir): - """Test initializing OntoEnv with a path to a dir without .ontoenv fails.""" - env_path = temp_dir / "no_env_here" - env_path.mkdir() - assert not (env_path / ".ontoenv").exists() - # Expecting failure because the specified path doesn't contain a .ontoenv dir - absolute_path = (env_path / ".ontoenv").resolve() - with pytest.raises(ValueError, match=f"OntoEnv directory not found at: \"{absolute_path}\""): - # This fails because load_from_directory expects .ontoenv unless recreate=True - OntoEnv(path=env_path) - - -def test_init_temporary(temp_dir): - """Test initializing OntoEnv with temporary=True.""" - env_path = temp_dir / "temp_env_root" - # temporary envs don't persist to disk relative to root - cfg = Config(root=str(env_path), temporary=True, strict=False) - env = OntoEnv(config=cfg) # Path shouldn't matter for temporary - - # .ontoenv directory should NOT be created at the root - assert not (env_path / ".ontoenv").exists() - - # store_path() should indicate it's not persistent. - # store_path() should return None for temporary envs - assert env.store_path() is None - - # Check if adding works in memory (should not raise read-only error) - # Note: Adding a URL might fail if offline=True by default or network issues - # Adding a non-existent file path will fail regardless. - # We'll just check that the *attempt* doesn't raise a read-only error. 
- try: - # Use a dummy URL that won't resolve but tests the add path - env.add("http://example.com/nonexistent.ttl") - except ValueError as e: - # We expect errors related to fetching/reading, *not* read-only errors - assert "Cannot add to read-only store" not in str(e) - except Exception: - # Catch other potential errors (like network) during add - pass - - -# TODO: Add tests for offline mode, different resolution policies, includes/excludes etc. diff --git a/python/test.py b/python/tests/test_ontoenv_api.py similarity index 86% rename from python/test.py rename to python/tests/test_ontoenv_api.py index 7b08e43..5b2710e 100644 --- a/python/test.py +++ b/python/tests/test_ontoenv_api.py @@ -1,4 +1,5 @@ import unittest +import os import shutil from pathlib import Path from ontoenv import OntoEnv @@ -36,19 +37,20 @@ def tearDown(self): shutil.rmtree(".ontoenv") def test_constructor_default(self): - """Test default OntoEnv() constructor.""" - self.env = OntoEnv() - self.assertTrue(Path(".ontoenv").is_dir()) - self.assertIn("ontologies", repr(self.env)) - + """Test default OntoEnv() constructor respects git-style discovery.""" + with self.assertRaises(ValueError): + OntoEnv() + self.env = OntoEnv(temporary=True) + self.assertIn("OntoEnv", repr(self.env)) + def test_constructor_path(self): """Test OntoEnv(path=...) constructor.""" - self.env = OntoEnv(path=self.test_dir) + self.env = OntoEnv(path=self.test_dir, recreate=True) self.assertTrue((self.test_dir / ".ontoenv").is_dir()) def test_constructor_with_config(self): """Test OntoEnv(...flags...) constructor.""" - self.env = OntoEnv(path=self.test_dir, search_directories=["../brick"]) + self.env = OntoEnv(path=self.test_dir, recreate=True, search_directories=["../brick"]) self.env.update() # discover ontologies ontologies = self.env.get_ontology_names() self.assertIn(self.brick_name, ontologies) @@ -56,7 +58,7 @@ def test_constructor_with_config(self): def test_add_local_file(self): """Test env.add() with a local file and fetching imports.""" # requires offline=False to fetch QUDT from web - self.env = OntoEnv(path=self.test_dir, offline=False) + self.env = OntoEnv(path=self.test_dir, recreate=True, offline=False) name = self.env.add(str(self.brick_file_path)) self.assertEqual(name, self.brick_name) ontologies = self.env.get_ontology_names() @@ -66,7 +68,7 @@ def test_add_local_file(self): def test_add_url(self): """Test env.add() with a URL.""" - self.env = OntoEnv(path=self.test_dir, offline=False) + self.env = OntoEnv(path=self.test_dir, recreate=True, offline=False) name = self.env.add(self.brick_144_url) self.assertEqual(name, self.brick_144_name) ontologies = self.env.get_ontology_names() @@ -76,7 +78,7 @@ def test_add_url(self): def test_add_no_fetch_imports(self): """Test env.add() with fetch_imports=False.""" - self.env = OntoEnv(path=self.test_dir) + self.env = OntoEnv(path=self.test_dir, recreate=True) # With fetch_imports=False, Brick should be added but its dependencies # should not be processed. 
name = self.env.add(str(self.brick_file_path), fetch_imports=False) @@ -88,7 +90,7 @@ def test_add_no_fetch_imports(self): def test_get_graph(self): """Test env.get_graph().""" - self.env = OntoEnv(path=self.test_dir) + self.env = OntoEnv(path=self.test_dir, recreate=True) name = self.env.add(str(self.brick_file_path)) g = self.env.get_graph(name) self.assertIsInstance(g, Graph) @@ -97,7 +99,7 @@ def test_get_graph(self): def test_get_closure(self): """Test env.get_closure().""" - self.env = OntoEnv(path=self.test_dir, search_directories=["brick"]) + self.env = OntoEnv(path=self.test_dir, recreate=True, search_directories=["brick"]) name = self.env.add(str(self.brick_file_path)) g = self.env.get_graph(name) closure_g, imported_graphs = self.env.get_closure(name, recursion_depth=0) @@ -111,7 +113,7 @@ def test_get_closure(self): def test_import_dependencies(self): """Test env.import_dependencies().""" - self.env = OntoEnv(path=self.test_dir, search_directories=["brick"]) + self.env = OntoEnv(path=self.test_dir, recreate=True, search_directories=["brick"]) self.env.add(str(self.brick_file_path)) g = Graph() @@ -131,8 +133,8 @@ def test_import_dependencies_fetch_missing(self): """Test env.import_dependencies() with fetch_missing=True.""" # offline=False is required to fetch from URL # empty env - self.env = OntoEnv(path=self.test_dir, offline=False) - + self.env = OntoEnv(path=self.test_dir, recreate=True, offline=False) + g = Graph() # Add an import to a known ontology URL that is not in the environment g.add( @@ -158,7 +160,7 @@ def test_import_dependencies_fetch_missing(self): def test_list_closure(self): """Test env.list_closure().""" - self.env = OntoEnv(path=self.test_dir, search_directories=["brick"]) + self.env = OntoEnv(path=self.test_dir, recreate=True, search_directories=["brick"]) name = self.env.add(str(self.brick_file_path)) closure_list = self.env.list_closure(name) self.assertIn(name, closure_list) @@ -168,7 +170,7 @@ def test_list_closure(self): def test_get_importers(self): """Test env.get_importers().""" - self.env = OntoEnv(path=self.test_dir, search_directories=["brick"]) + self.env = OntoEnv(path=self.test_dir, recreate=True, search_directories=["brick"]) self.env.add(str(self.brick_file_path)) dependents = self.env.get_importers("http://qudt.org/2.1/vocab/quantitykind") @@ -176,7 +178,7 @@ def test_get_importers(self): def test_to_rdflib_dataset(self): """Test env.to_rdflib_dataset().""" - self.env = OntoEnv(path=self.test_dir, search_directories=["brick"]) + self.env = OntoEnv(path=self.test_dir, recreate=True, search_directories=["brick"]) self.env.add(str(self.brick_file_path)) self.env.update() # need to run update to find all dependencies self.env.flush() @@ -189,7 +191,7 @@ def test_to_rdflib_dataset(self): def test_import_graph(self): """Test env.import_graph().""" - self.env = OntoEnv(path=self.test_dir, offline=False) + self.env = OntoEnv(path=self.test_dir, recreate=True, offline=False) name = self.env.add(self.brick_144_url) self.assertEqual(name, self.brick_144_name) @@ -200,7 +202,7 @@ def test_import_graph(self): def test_store_path(self): """Test env.store_path().""" - self.env = OntoEnv(path=self.test_dir) + self.env = OntoEnv(path=self.test_dir, recreate=True) path = self.env.store_path() self.assertIsNotNone(path) self.assertTrue(Path(path).is_dir()) @@ -213,7 +215,7 @@ def test_store_path(self): def test_persistence(self): """Test that the environment is persisted to disk.""" - env = OntoEnv(path=self.test_dir) + env = OntoEnv(path=self.test_dir, 
recreate=True) name = env.add(str(self.brick_file_path)) self.assertIn(name, env.get_ontology_names()) env.flush() # ensure everything is written to disk @@ -227,7 +229,7 @@ def test_persistence(self): def test_close(self): """Test that the environment can be closed and methods fail.""" - self.env = OntoEnv(path=self.test_dir) + self.env = OntoEnv(path=self.test_dir, recreate=True) name = self.env.add(str(self.brick_file_path)) self.assertIn(name, self.env.get_ontology_names()) self.env.close() @@ -257,7 +259,7 @@ def test_close(self): def test_get_dependencies_graph(self): """Test env.get_dependencies_graph().""" - self.env = OntoEnv(path=self.test_dir, offline=False) + self.env = OntoEnv(path=self.test_dir, recreate=True, offline=False) self.env.add(str(self.brick_file_path)) g = Graph() @@ -292,7 +294,7 @@ def test_get_dependencies_graph(self): def test_update_all_flag(self): """Test env.update(all=True) forces reloading of all ontologies.""" - self.env = OntoEnv(path=self.test_dir, search_directories=["../brick"]) + self.env = OntoEnv(path=self.test_dir, recreate=True, search_directories=["../brick"]) # Initial discovery of ontologies self.env.update() self.assertIn(self.brick_name, self.env.get_ontology_names()) diff --git a/python/tests/test_ontoenv_init.py b/python/tests/test_ontoenv_init.py new file mode 100644 index 0000000..6a99622 --- /dev/null +++ b/python/tests/test_ontoenv_init.py @@ -0,0 +1,107 @@ +import unittest +import pathlib +import shutil +import os +import tempfile +import re +from pathlib import Path +from ontoenv import OntoEnv + + +class TestOntoEnvInit(unittest.TestCase): + def tearDown(self): + # Clean up any accidental .ontoenv in cwd + if Path(".ontoenv").exists(): + shutil.rmtree(".ontoenv") + + def test_init_recreate_new_dir(self): + with tempfile.TemporaryDirectory() as td: + root = Path(td) + env_path = root / "new_env" + self.assertFalse(env_path.exists()) + env = OntoEnv(path=env_path, recreate=True) + self.assertTrue((env_path / ".ontoenv").is_dir()) + sp = env.store_path() + self.assertIsNotNone(sp) + self.assertTrue(Path(sp).is_dir()) + + def test_init_recreate_existing_empty_dir(self): + with tempfile.TemporaryDirectory() as td: + env_path = Path(td) / "empty_env" + env_path.mkdir() + self.assertTrue(env_path.is_dir()) + env = OntoEnv(path=env_path, recreate=True) + self.assertTrue((env_path / ".ontoenv").is_dir()) + self.assertIsNotNone(env.store_path()) + + def test_init_load_from_existing_dir(self): + with tempfile.TemporaryDirectory() as td: + env_path = Path(td) / "existing_env" + env_path.mkdir() + env = OntoEnv(path=env_path, recreate=True) + env.flush() + del env + # load existing + env2 = OntoEnv(path=env_path, read_only=False) + self.assertEqual(env2.store_path(), str(env_path / ".ontoenv")) + + def test_init_recreate_existing_dir(self): + with tempfile.TemporaryDirectory() as td: + env_path = Path(td) / "existing_env" + env_path.mkdir() + env = OntoEnv(path=env_path, recreate=True) + (env_path / ".ontoenv" / "dummy.txt").touch() + self.assertTrue((env_path / ".ontoenv" / "dummy.txt").exists()) + # Recreate + env = OntoEnv(path=env_path, recreate=True) + self.assertTrue((env_path / ".ontoenv").is_dir()) + self.assertFalse((env_path / ".ontoenv" / "dummy.txt").exists()) + self.assertEqual(len(env.get_ontology_names()), 0) + + def test_init_read_only(self): + with tempfile.TemporaryDirectory() as td: + env_path = Path(td) / "existing_env" + env_path.mkdir() + env1 = OntoEnv(path=env_path, recreate=True) + env1.close() + env = 
OntoEnv(path=env_path, read_only=True) + self.assertTrue((env_path / ".ontoenv").is_dir()) + with self.assertRaisesRegex(ValueError, "Cannot add to read-only store"): + env.add("file:///dummy.ttl") + + def test_init_no_config_no_path_error(self): + # Clean up potential leftover .ontoenv in cwd just in case + if os.path.exists(".ontoenv"): + if os.path.isfile(".ontoenv"): + os.remove(".ontoenv") + else: + shutil.rmtree(".ontoenv") + with self.assertRaisesRegex(ValueError, "OntoEnv directory not found at: \"./.ontoenv\""): + OntoEnv() # No args + + def test_init_path_no_env_error(self): + with tempfile.TemporaryDirectory() as td: + env_path = Path(td) / "no_env_here" + env_path.mkdir() + self.assertFalse((env_path / ".ontoenv").exists()) + # Be tolerant of macOS /private prefix differences by matching only the tail. + tail_pattern = rf'OntoEnv directory not found at: "(.*/)?{re.escape(env_path.name)}/\.ontoenv"' + with self.assertRaisesRegex(ValueError, tail_pattern): + OntoEnv(path=env_path) + + def test_init_temporary(self): + with tempfile.TemporaryDirectory() as td: + env_path = Path(td) / "temp_env_root" + env = OntoEnv(temporary=True, root=str(env_path), strict=False) + self.assertFalse((env_path / ".ontoenv").exists()) + self.assertIsNone(env.store_path()) + try: + env.add("http://example.com/nonexistent.ttl") + except ValueError as e: + self.assertNotIn("Cannot add to read-only store", str(e)) + except Exception: + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/rdf5d/.github/workflows/ci.yml b/rdf5d/.github/workflows/ci.yml new file mode 100644 index 0000000..be4eec2 --- /dev/null +++ b/rdf5d/.github/workflows/ci.yml @@ -0,0 +1,154 @@ +name: CI + +on: + push: + branches: [ main, master ] + tags: [ 'v*.*.*' ] + pull_request: + +jobs: + test: + name: "Test (${{ matrix.os }} | features: ${{ matrix.features }})" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + features: ["", "oxigraph", "oxigraph,mmap", "zstd", "oxigraph,zstd", "oxigraph,mmap,zstd"] + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo- + + - name: Build + shell: bash + run: | + if [ -z "${{ matrix.features }}" ]; then + cargo build --all-targets --locked + else + cargo build --all-targets --locked --features "${{ matrix.features }}" + fi + + - name: Test + shell: bash + run: | + if [ -z "${{ matrix.features }}" ]; then + cargo test --locked + else + cargo test --locked --features "${{ matrix.features }}" + fi + + lint: + name: Lint (clippy + fmt) + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Rust (components) + uses: dtolnay/rust-toolchain@stable + with: + components: clippy, rustfmt + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-lint-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo- + + - name: cargo fmt --check + run: cargo fmt --all -- --check + + - name: cargo clippy + run: cargo clippy --all-targets -- -D warnings + + publish: + name: Publish (crates.io) + runs-on: ubuntu-latest + needs: [test, lint] + if: startsWith(github.ref, 'refs/tags/v') + steps: + - name: Checkout 
+ uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Publish crate + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} + run: | + cargo publish --locked + + build-cli: + name: Build CLI Artifacts (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + needs: [test] + if: startsWith(github.ref, 'refs/tags/v') + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Build CLI (release) + run: cargo build --release --bin r5tu --features "oxigraph,mmap" --locked + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: r5tu-${{ matrix.os }} + path: | + target/release/r5tu* + + release: + name: GitHub Release + runs-on: ubuntu-latest + needs: [test, lint, build-cli] + if: startsWith(github.ref, 'refs/tags/v') + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Extract tag + id: vars + run: echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ steps.vars.outputs.tag }} + name: ${{ steps.vars.outputs.tag }} + draft: false + prerelease: ${{ contains(steps.vars.outputs.tag, '-') }} + files: | + artifacts/** + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/rdf5d/.gitignore b/rdf5d/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/rdf5d/.gitignore @@ -0,0 +1 @@ +/target diff --git a/rdf5d/AGENTS.md b/rdf5d/AGENTS.md new file mode 100644 index 0000000..d866cf8 --- /dev/null +++ b/rdf5d/AGENTS.md @@ -0,0 +1,34 @@ +# Repository Guidelines + +## Project Structure & Module Organization +- Root crate: `Cargo.toml` (Rust edition 2024). +- Library code lives in `src/` (entry: `src/lib.rs`). Keep modules small and cohesive (e.g., `src/codec/`, `src/dict/`). +- Unit tests sit next to code via `#[cfg(test)]` blocks; add integration tests under `tests/` when behavior spans modules. +- Architecture reference: see `ARCH.md` for on‑disk format details and terminology. + +## Build, Test, and Development Commands +- Build: `cargo build` — compile debug build; `cargo build --release` for optimized. +- Test: `cargo test` — run unit/integration tests. +- Format: `cargo fmt --all` — apply `rustfmt` to the whole workspace. +- Lint: `cargo clippy --all-targets -- -D warnings` — static checks; treat warnings as errors. + +## Coding Style & Naming Conventions +- Indentation: 4 spaces; no tabs. Always run `cargo fmt` before committing. +- Naming: `snake_case` for modules/files/functions; `CamelCase` for types/traits; `SCREAMING_SNAKE_CASE` for constants. +- Error handling: prefer `Result` with descriptive error enums; avoid panics in library code. +- Docs: add `///` rustdoc on public items; include small examples where practical. + +## Testing Guidelines +- Unit tests colocated with modules for tight feedback; name tests after behavior (e.g., `encodes_empty_dict`). +- Integration tests in `tests/` mirror public API surface; one file per feature area (e.g., `tests/dict_roundtrip.rs`). +- Edge cases: cover empty inputs, max sizes, and invalid headers described in `ARCH.md`. +- Run: `cargo test`; aim to keep tests deterministic and mmap‑safe (no flaky fs assumptions). 
+
+## Commit & Pull Request Guidelines
+- Messages: use Conventional Commits (e.g., `feat(dict): add key16 index`, `fix(io): handle LE offset overflow`).
+- PRs: include a clear description, link related issues, reference `ARCH.md` sections when relevant, add/adjust tests, and attach benchmarks if performance‑affecting.
+- Pre‑PR checklist: `cargo fmt`, `cargo clippy -- -D warnings`, `cargo test` all passing.
+
+## Security & Configuration Tips
+- Use stable Rust with an up‑to‑date toolchain via `rustup`; enable `mmap`‑related tests only on supported platforms.
+- When touching file I/O or offsets, validate sizes and bounds; prefer checked math and fuzz tests for parsers.
diff --git a/rdf5d/ARCH.md b/rdf5d/ARCH.md
new file mode 100644
index 0000000..fe3f0f0
--- /dev/null
+++ b/rdf5d/ARCH.md
@@ -0,0 +1,321 @@
+# R5TU v0 — An HDT-inspired, mmap-friendly on-disk format for RDF 5-tuples
+
+**Purpose:** an efficient, immutable serialization for datasets of RDF **5-tuples**
+`(id, subject, predicate, object, graphname)` optimized for:
+- fast **enumeration** of graphs by `id`, `graphname`, or their pair,
+- fast **loading** of an entire `(id, graphname)` graph into memory,
+- **many-readers / one-writer** with atomic finalize + `mmap` reading,
+- **HDT-like** global term dictionary + compressed per-graph SPO blocks.
+
+---
+
+## 0) Quick Glossary
+
+- **TermID** — integer assigned to a unique RDF term (IRI | BNODE | LITERAL).
+- **id_id** — integer ID for the `id` string (e.g., source/file path).
+- **gn_id** — integer ID for the `graphname` string.
+- **GID** — graph instance ordinal (row index in Graph Directory).
+- **uvarint** — unsigned LEB128 variable-length integer.
+
+---
+
+## 1) File Overview
+
+All multi-byte fixed-size integers are **little-endian**.
+All variable-length integers are **LEB128 unsigned (uvarint)**.
+All offsets are **absolute** (from file start).
+
+```
++----------------------+  0x00
+| Header (fixed 32 B)  |
++----------------------+
+| TOC (array of 32 B)  |
++----------------------+
+| Sections...          |
++----------------------+
+| Footer (16 B)        |
++----------------------+
+```
+
+### 1.1 Header (32 bytes)
+
+| Field            | Type   | Notes                               |
+|------------------|--------|-------------------------------------|
+| `magic`          | [u8;4] | `"R5TU"`                            |
+| `version_u16`    | u16    | `0x0001`                            |
+| `flags_u16`      | u16    | bit0=utf8, bit1=zstd, bit2=pos_perm |
+| `created_unix64` | u64    | seconds since epoch                 |
+| `toc_off_u64`    | u64    | byte offset to TOC                  |
+| `toc_len_u32`    | u32    | number of TOC entries               |
+| `reserved_u32`   | u32    | 0                                   |
+
+### 1.2 TOC entry (32 bytes each)
+
+| Field          | Type | Notes            |
+|----------------|------|------------------|
+| `kind_u16`     | u16  | `SectionKind`    |
+| `reserved_u16` | u16  | 0                |
+| `off_u64`      | u64  | section start    |
+| `len_u64`      | u64  | section length   |
+| `crc32_u32`    | u32  | (optional in v0) |
+| `reserved_u32` | u32  | 0                |
+
+**SectionKind:**
+```
+1 TERM_DICT    | 2 ID_DICT    | 3 GNAME_DICT
+4 GDIR         | 5 IDX_ID2GID | 6 IDX_GNAME2GID
+7 IDX_PAIR2GID | 8 TRIPLE_BLOCKS
+```
+
+### 1.3 Footer (16 bytes)
+
+| Field           | Type | Notes                      |
+|-----------------|------|----------------------------|
+| `global_crc32`  | u32  | CRC over [0 .. footer_off) |
+| `eof_magic[12]` | u8   | `"R5TU_ENDMARK"`           |
+
+> **Writer rule:** write to temp file, then atomic rename.
+
+---
+
+## 2) Sections (v0 encodings)
+
+### 2.1 String Dictionaries — `ID_DICT`, `GNAME_DICT`
+
+Simple O(1) ID→string plus an optional coarse index for string→ID.
+Reader may implement string→ID using the coarse index; it can be upgraded to an FST later.
+
+**Layout:**
+```
+DICT:
+u32 n_entries
+u64 str_bytes_off          --> [UTF-8 bytes...]
+u64 str_bytes_len
+u64 offs_off               --> [u32 * (n_entries+1)]
+u64 offs_len
+u64 idx_off (0 if absent)  --> [IndexEntry * n_entries]
+u64 idx_len
+```
+
+- **Blob:** concatenation of all strings.
+- **Offsets:** `offs[i]` is the start of string i in the blob; `offs[n]=blob_len`.
+- **IndexEntry (24 bytes):**
+  - `key16[16]` — lowercased first up-to-16 bytes of the string, zero-padded.
+  - `id_u32` — the entry’s ordinal.
+
+**Operations:**
+- ID→string: slice `blob[offs[i]..offs[i+1]]`.
+- string→ID: binary search `key16`, then string-compare inside the blob.
+
+> Future: replace or augment with mmap-able FSTs for perfect lookups.
+
+---
+
+### 2.2 Global Term Dictionary — `TERM_DICT`
+
+Maps unique RDF terms to `TermID`. `width_u8` is reserved (4 or 8); v0 decodes payloads as UTF-8 + LEB128.
+
+**Layout:**
+```
+u8  width          // 4 or 8 (reserved)
+u64 n_terms
+u64 kinds_off  --> [u8 * n]      // 0=IRI, 1=BNODE, 2=LITERAL
+u64 data_off   --> [bytes ...]   payload blob
+u64 offs_off   --> [u64 * (n+1)]
+```
+
+**Payload per term kind:**
+- **IRI/BNODE:** raw UTF-8 bytes.
+- **LITERAL:** concatenation of
+  - `lex_len:uvarint` + `lex_bytes`
+  - `has_dt:u8` + if 1: `dt_len:uvarint` + `dt_bytes`
+  - `has_lang:u8` + if 1: `lang_len:uvarint` + `lang_bytes`
+
+---
+
+### 2.3 Graph Directory — `GDIR`
+
+One fixed row per graph (GID = row index). Sorted by `(id_id, gn_id)` at build time.
+
+**Header (16 bytes):**
+```
+u64 n_rows
+u32 row_size = 56
+u32 reserved = 0
+```
+
+**Row (56 bytes):**
+```
+u32 id_id
+u32 gn_id
+u64 triples_off
+u64 triples_len
+u64 n_triples
+u32 n_s
+u32 n_p
+u32 n_o
+```
+
+Counts are hints only.
+
+---
+
+### 2.4 Postings Indexes — `IDX_ID2GID`, `IDX_GNAME2GID`
+
+Delta-varint postings (v0); these can be swapped for Elias–Fano or Roaring later. A small encoder sketch appears after §2.6.
+
+**Layout:**
+```
+u64 n_keys                                   // number of id_ids or gn_ids
+u64 key2post_offs_off --> [u64*(n_keys+1)]   // per-key slice into blob
+u64 gids_blob_off     --> [bytes...]         // concatenated postings
+```
+
+**Per posting list encoding:**
+```
+uvarint n
+uvarint first_gid
+uvarint delta_1
+...
+uvarint delta_(n-1)   // strictly ascending
+```
+
+---
+
+### 2.5 Pair Index — `IDX_PAIR2GID`
+
+Sorted fixed-width mapping for `(id_id, gn_id) → gid`.
+
+**Layout:**
+```
+u64 n_pairs
+u64 pairs_off --> [PairEntry * n_pairs], sorted by (id_id, gn_id)
+```
+
+**PairEntry (16 bytes):** `u32 id_id | u32 gn_id | u64 gid`
+
+---
+
+### 2.6 Per-Graph Triples — `TRIPLE_BLOCKS`
+
+One block per GID. Each block is either raw or zstd-framed.
+
+**Block header:**
+```
+u8  enc          // 0=RAW, 1=ZSTD
+u32 raw_len      // length of RAW payload; for ZSTD, compressed len (sanity)
+[ payload ... ]  // RAW or ZSTD frame containing RAW bytes
+```
+
+**RAW payload (CSR-like SPO):**
+```
+uvarint nS
+uvarint nP   // total distinct (S,P)
+uvarint nT   // triples
+
+S_vals[nS]    : uvarint (TermID), delta-coded ascending
+S_heads[nS+1] : uvarint prefix sums into P_vals (0..nP)
+P_vals[nP]    : uvarint (TermID), delta-coded per S-run
+P_heads[nP+1] : uvarint prefix sums into O_vals (0..nT)
+O_vals[nT]    : uvarint (TermID), delta-coded per (S,P)-run
+```
+
+> Future: optional POS permutation appended if `flags.bit2==1`.
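+
+To make §2.6 concrete, here is a minimal, illustrative decoder for a RAW payload: a sketch, not the crate's implementation. It assumes the `*_heads` prefix-sum arrays are stored as plain uvarints (one reading of the layout above), and `read_uvarint`/`decode_raw_spo` are hypothetical helper names.
+
+```rust
+/// Read one LEB128 uvarint from `buf`, advancing `*pos` (hypothetical helper).
+fn read_uvarint(buf: &[u8], pos: &mut usize) -> u64 {
+    let (mut value, mut shift) = (0u64, 0u32);
+    loop {
+        let byte = buf[*pos];
+        *pos += 1;
+        value |= u64::from(byte & 0x7f) << shift;
+        if byte & 0x80 == 0 {
+            return value;
+        }
+        shift += 7;
+    }
+}
+
+/// Expand a RAW CSR-like SPO payload into (s, p, o) TermID triples.
+fn decode_raw_spo(buf: &[u8]) -> Vec<(u64, u64, u64)> {
+    let mut pos = 0usize;
+    let n_s = read_uvarint(buf, &mut pos) as usize;
+    let n_p = read_uvarint(buf, &mut pos) as usize;
+    let n_t = read_uvarint(buf, &mut pos) as usize;
+
+    // S_vals: delta-coded ascending TermIDs (first value absolute).
+    let mut s_vals = Vec::with_capacity(n_s);
+    let mut acc = 0u64;
+    for i in 0..n_s {
+        let d = read_uvarint(buf, &mut pos);
+        acc = if i == 0 { d } else { acc + d };
+        s_vals.push(acc);
+    }
+    let s_heads: Vec<usize> = (0..=n_s).map(|_| read_uvarint(buf, &mut pos) as usize).collect();
+
+    // P_vals: delta-coded, with the delta base resetting at each S-run.
+    let mut p_vals = vec![0u64; n_p];
+    for s_idx in 0..n_s {
+        let mut acc = 0u64;
+        for (k, slot) in p_vals[s_heads[s_idx]..s_heads[s_idx + 1]].iter_mut().enumerate() {
+            let d = read_uvarint(buf, &mut pos);
+            acc = if k == 0 { d } else { acc + d };
+            *slot = acc;
+        }
+    }
+    let p_heads: Vec<usize> = (0..=n_p).map(|_| read_uvarint(buf, &mut pos) as usize).collect();
+
+    // O_vals: delta-coded per (S,P)-run; expand the runs directly into triples.
+    let mut triples = Vec::with_capacity(n_t);
+    for s_idx in 0..n_s {
+        for p_idx in s_heads[s_idx]..s_heads[s_idx + 1] {
+            let mut acc = 0u64;
+            for k in p_heads[p_idx]..p_heads[p_idx + 1] {
+                let d = read_uvarint(buf, &mut pos);
+                acc = if k == p_heads[p_idx] { d } else { acc + d };
+                triples.push((s_vals[s_idx], p_vals[p_idx], acc));
+            }
+        }
+    }
+    triples
+}
+```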
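+
+Similarly, the per-key posting lists of §2.4 (also produced by the writer in §3, step 7) reduce to a few lines. This is an illustrative sketch with hypothetical names, not the crate's API:
+
+```rust
+/// Append one LEB128 uvarint to `out`.
+fn write_uvarint(mut v: u64, out: &mut Vec<u8>) {
+    loop {
+        let byte = (v & 0x7f) as u8;
+        v >>= 7;
+        if v == 0 {
+            out.push(byte);
+            return;
+        }
+        out.push(byte | 0x80);
+    }
+}
+
+/// Encode one strictly ascending posting list of GIDs as
+/// `n, first_gid, delta_1, ..., delta_(n-1)`.
+fn encode_postings(gids: &[u64], out: &mut Vec<u8>) {
+    write_uvarint(gids.len() as u64, out);
+    let mut prev: Option<u64> = None;
+    for &gid in gids {
+        match prev {
+            None => write_uvarint(gid, out),
+            Some(p) => {
+                debug_assert!(gid > p, "posting lists must be strictly increasing");
+                write_uvarint(gid - p, out);
+            }
+        }
+        prev = Some(gid);
+    }
+}
+```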
+
+---
+
+## 3) Writer Spec (Build Pipeline)
+
+Input: list of quintuples `(id_str, s_term, p_term, o_term, gname_str)`.
+
+1. **Assign IDs**
+   - Deduplicate `id_str` → `id_id` (u32)
+   - Deduplicate `gname_str` → `gn_id` (u32)
+   - Deduplicate RDF terms → `TermID` (u64 is always safe; u32 if the term count is guaranteed below 4 billion)
+
+2. **Group & Sort**
+   - Group by pair `(id_id, gn_id)`.
+   - Inside each group, **sort by (S, P, O)** with their TermIDs.
+
+3. **Emit `TRIPLE_BLOCKS`**
+   - For each group (which becomes a GID), build the arrays `S_vals`, `S_heads`, `P_vals`, `P_heads`, `O_vals` from the sorted triples; encode per §2.6.
+   - Optionally wrap RAW bytes in ZSTD (an independent frame per graph) if `flags.bit1`.
+
+4. **Emit `GDIR`** rows in the same order graphs are written. Row fields:
+   - `id_id`, `gn_id`, `triples_off`, `triples_len`, `n_triples`, `n_s`, `n_p`, `n_o`.
+
+5. **Emit `TERM_DICT`**
+   - Sort terms by assigned TermID; store kinds and payload slices with offsets.
+
+6. **Emit `ID_DICT` & `GNAME_DICT`**
+   - Build blobs + offsets; optionally include the coarse index (key16 + id).
+
+7. **Emit postings `IDX_ID2GID` & `IDX_GNAME2GID`**
+   - For each `id_id`: collect the sorted list of GIDs where it appears; delta-uvarint encode (see the encoder sketch after §2.6).
+   - For each `gn_id`: same.
+
+8. **Emit `IDX_PAIR2GID`**
+   - For each `(id_id, gn_id)` group: write a `PairEntry`, sorted by `(id_id, gn_id)`.
+
+9. **TOC & Footer**
+   - Write the TOC with offsets/lengths; optional per-section CRCs.
+   - Compute `global_crc32` and write the Footer.
+   - **Atomic rename** temp → final.
+
+**Invariants & Validation:**
+- Posting lists must be strictly increasing.
+- `GDIR.n_rows == number of groups`.
+- `TRIPLE_BLOCKS` offsets/lengths must not overlap.
+- If `flags.bit1==1`, a block's `enc` may be 0 or 1; if 0, the block is raw.
+
+---
+
+## 4) Reader Spec & Minimal Rust Reference
+
+### 4.1 Public Reader API (suggested)
+
+```rust
+// `Result<T>` abbreviates a crate-local alias over the library's error type.
+pub struct R5tuFile { /* mmaps + parsed sections */ }
+
+pub struct GraphRef {
+    pub gid: u64,
+    pub id: String,
+    pub graphname: String,
+    pub n_triples: u64,
+}
+
+impl R5tuFile {
+    pub fn open(path: &Path) -> Result<R5tuFile>;
+    pub fn enumerate_by_id(&self, id: &str) -> Result<Vec<GraphRef>>;
+    pub fn enumerate_by_graphname(&self, gname: &str) -> Result<Vec<GraphRef>>;
+    pub fn resolve_gid(&self, id: &str, gname: &str) -> Result<Option<GraphRef>>;
+    pub fn triples_ids(&self, gid: u64) -> Result<Vec<(u64, u64, u64)>>;
+    pub fn term_to_string(&self, term_id: u64) -> Result<String>;
+}
+```
+
+---
+
+This design borrows the core ideas of **HDT** (global dictionary + compressed triples), adapted to per-graph blocks and 5-tuple needs. See RDF/HDT for background; the implementation is original to R5TU. Created with ChatGPT-5.
diff --git a/rdf5d/Cargo.lock b/rdf5d/Cargo.lock
new file mode 100644
index 0000000..1520a6a
--- /dev/null
+++ b/rdf5d/Cargo.lock
@@ -0,0 +1,1119 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anstream" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "itertools", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn", +] + +[[package]] +name = "bitflags" +version = "2.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "cc" +version = "1.2.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42bc4aea80032b7bf409b0bc7ccad88853858911b7713a8062fdc0623867bedc" +dependencies = [ + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + +[[package]] +name = "clap" +version = "4.5.46" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c5e4fcf9c21d2e544ca1ee9d8552de13019a42aa7dbf32747fa7aaf1df76e57" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fecb53a0e6fcfb055f686001bc2e2592fa527efaf38dbe81a6a9563562e57d41" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14cb31bb0a7d536caef2639baa7fad459e15c3144efefa6dbd1c84562c4739f6" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.1+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.3+wasi-0.2.4", +] + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.3", + "libc", +] + +[[package]] +name = "json-event-parser" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73267b6bffa5356bd46cfa89386673e9a7f62f4eb3adcb45b1bd031892357853" + +[[package]] +name = "libc" +version = "0.2.175" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" + +[[package]] +name = "libloading" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" +dependencies = [ + "cfg-if", + "windows-targets 0.53.3", +] + +[[package]] +name = "lock_api" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "memmap2" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" +dependencies = [ + "libc", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + 
"minimal-lexical", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + +[[package]] +name = "oxigraph" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b57a5334aab94d88e1d24b238c093c5efb0d309614b16ac920f23ad77ee77d" +dependencies = [ + "dashmap", + "getrandom 0.2.16", + "libc", + "oxiri", + "oxrdf", + "oxrdfio", + "oxrocksdb-sys", + "oxsdatatypes", + "rand", + "rustc-hash", + "siphasher", + "sparesults", + "spareval", + "spargebra", + "thiserror", +] + +[[package]] +name = "oxilangtag" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23f3f87617a86af77fa3691e6350483e7154c2ead9f1261b75130e21ca0f8acb" +dependencies = [ + "serde", +] + +[[package]] +name = "oxiri" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54b4ed3a7192fa19f5f48f99871f2755047fabefd7f222f12a1df1773796a102" + +[[package]] +name = "oxjsonld" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13a1a66dc569350f3f4e5eff8a8e1a72b0c9e6ad395bb5805493cb7a2fda185f" +dependencies = [ + "json-event-parser", + "oxiri", + "oxrdf", + "thiserror", +] + +[[package]] +name = "oxrdf" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a04761319ef84de1f59782f189d072cbfc3a9a40c4e8bded8667202fbd35b02a" +dependencies = [ + "oxilangtag", + "oxiri", + "oxsdatatypes", + "rand", + "thiserror", +] + +[[package]] +name = "oxrdfio" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14d33dd87769786a0bb7de342865e33bf0c6e9872fa76f1ede23e944fdc77898" +dependencies = [ + "oxjsonld", + "oxrdf", + "oxrdfxml", + "oxttl", + "thiserror", +] + +[[package]] +name = "oxrdfxml" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8d4bf9c5331127f01efbd1245d90fd75b7c546a97cb3e95461121ce1ad5b1c8" +dependencies = [ + "oxilangtag", + "oxiri", + "oxrdf", + "quick-xml", + "thiserror", +] + +[[package]] +name = "oxrocksdb-sys" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16430f45934d678cb6f9823e7c1bfdbdce9025f670ad85b642e46ffe5609e6ff" +dependencies = [ + "bindgen", + "cc", + "libc", +] + +[[package]] +name = "oxsdatatypes" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06fa874d87eae638daae9b4e3198864fe2cce68589f227c0b2cf5b62b1530516" +dependencies = [ + "thiserror", +] + +[[package]] +name = "oxttl" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d385f1776d7cace455ef6b7c54407838eff902ca897303d06eb12a26f4cf8a0" +dependencies = [ + "memchr", + "oxilangtag", + "oxiri", + "oxrdf", + "thiserror", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.52.6", +] + +[[package]] +name = "peg" 
+version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9928cfca101b36ec5163e70049ee5368a8a1c3c6efc9ca9c5f9cc2f816152477" +dependencies = [ + "peg-macros", + "peg-runtime", +] + +[[package]] +name = "peg-macros" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6298ab04c202fa5b5d52ba03269fb7b74550b150323038878fe6c372d8280f71" +dependencies = [ + "peg-runtime", + "proc-macro2", + "quote", +] + +[[package]] +name = "peg-runtime" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "132dca9b868d927b35b5dd728167b2dee150eb1ad686008fc71ccb298b776fca" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", +] + +[[package]] +name = "rdf5d" +version = "0.1.2" +dependencies = [ + "clap", + "memmap2", + "oxigraph", + "zstd", +] + +[[package]] +name = "redox_syscall" +version = "0.5.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"23d7fd106d8c02486a8d64e778353d1cffe08ce79ac2e82f540c86d0facf6912" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b9458fa0bfeeac22b5ca447c63aaf45f28439a709ccd244698632f9aa6394d6" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "sparesults" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f478f5ead16b6136bccee7a52ea43a615f8512086708f515e26ce33e0b184036" +dependencies = [ + "json-event-parser", + "memchr", + "oxrdf", + "quick-xml", + "thiserror", +] + +[[package]] +name = "spareval" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d8ff5f1159e7416ed99160b962fa780851dddee133ef56e6b08a94023ea2c7" +dependencies = [ + "hex", + "json-event-parser", + "md-5", + "oxiri", + "oxrdf", + "oxsdatatypes", + "rand", + "regex", + "rustc-hash", + "sha1", + "sha2", + "sparesults", + "spargebra", + "sparopt", + "thiserror", +] + +[[package]] +name = "spargebra" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8907e262be4b4b363218f4688f5654d423a958aa4b8d7c7a7f898be591fa474e" 
+dependencies = [ + "oxilangtag", + "oxiri", + "oxrdf", + "peg", + "rand", + "thiserror", +] + +[[package]] +name = "sparopt" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1790bbdf13560c2afc245ab0f82a489003b3918e668ebd45c65fe46bfd7a1763" +dependencies = [ + "oxrdf", + "rand", + "spargebra", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "2.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3467d614147380f2e4e374161426ff399c91084acd2363eaf549172b3d5e60c0" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5e1be1c48b9172ee610da68fd9cd2770e7a4056cb3fc98710ee6906f0c7960" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasi" +version = "0.14.3+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a51ae83037bdd272a9e28ce236db8c07016dd0d50c27038b3f407533c030c95" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "windows-link" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.3", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = 
"windows-targets" +version = "0.53.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + +[[package]] +name = "wit-bindgen" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052283831dbae3d879dc7f51f3d92703a316ca49f91540417d38591826127814" + +[[package]] +name = "zerocopy" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.15+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/rdf5d/Cargo.toml b/rdf5d/Cargo.toml new file mode 100644 index 0000000..4dbb7a7 --- /dev/null +++ b/rdf5d/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "rdf5d" +version.workspace = true +edition = "2024" +authors = ["Gabe Fierro "] +license = "BSD-3-Clause" +repository = "https://github.com/gtfierro/r5tu" +homepage = "https://github.com/gtfierro/r5tu" +description = "Compact, mmap‑friendly on‑disk format for RDF 5‑tuples" +readme = "README.md" + +[dependencies] +clap.workspace = true +oxigraph = { workspace = true, optional = true } +zstd = { workspace = true, optional = true } +memmap2 = { workspace = true, optional = true } + +[features] +default = [] +zstd = ["dep:zstd"] +oxigraph = ["dep:oxigraph"] +mmap = ["dep:memmap2"] diff --git a/rdf5d/LICENSE b/rdf5d/LICENSE new file mode 100644 index 0000000..862e6f9 --- /dev/null +++ b/rdf5d/LICENSE @@ -0,0 +1,28 @@ +BSD 3-Clause License + +Copyright (c) 2025, Gabe Fierro + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/rdf5d/README.md b/rdf5d/README.md new file mode 100644 index 0000000..83d6f25 --- /dev/null +++ b/rdf5d/README.md @@ -0,0 +1,126 @@ +# rdf5d (R5TU) — Rust Library and CLI + +R5TU is a compact, mmap‑friendly on‑disk format for RDF 5‑tuples. This crate provides: +- A zero‑copy reader API to enumerate graphs and stream triples. +- A writer and streaming writer to build files from your data or Oxigraph graphs. +- An optional CLI (feature `oxigraph`) for quick imports and sanity checks. + +See `ARCH.md` for format details. + +## Add to your project + +In your `Cargo.toml`: + +``` +[dependencies] +rdf5d = { path = "." } +# optional +# rdf5d = { version = "0.1", features = ["oxigraph", "zstd"] } +``` + +Features: +- `zstd`: enable zstd‑compressed triple blocks. +- `oxigraph`: integrate with `oxigraph` model + parsers and build the CLI. + +## Reading an .r5tu file + +```rust +use rdf5d::R5tuFile; +use std::path::Path; + +let f = R5tuFile::open(Path::new("data.r5tu"))?; + +// Enumerate by source id +for gr in f.enumerate_by_id("src/A")? { + println!("gid={} id={} gname={} n={}", gr.gid, gr.id, gr.graphname, gr.n_triples); +} + +// Resolve a specific (id, graphname) +if let Some(gr) = f.resolve_gid("src/A", "g")? { + // Stream triples as TermIDs and render to strings on demand + for (s, p, o) in f.triples_ids(gr.gid)? { + println!("{} {} {}", + f.term_to_string(s)?, + f.term_to_string(p)?, + f.term_to_string(o)?, + ); + } +} +``` + +With feature `oxigraph`, convert to Oxigraph types: + +```rust +#[cfg(feature = "oxigraph")] +{ + let gr = f.resolve_gid("src/A","g")?.unwrap(); + let g = f.to_oxigraph_graph(gr.gid)?; // materialized Graph + for t in f.oxigraph_triples(gr.gid)? 
{ // streaming iterator + let t = t?; /* use t: oxigraph::model::Triple */ + } +} +``` + +## Writing files + +Simple batch write: + +```rust +use rdf5d::{writer::write_file_with_options, writer::{WriterOptions}, Quint, Term}; + +let quints = vec![ + Quint{ id:"src/A".into(), gname:"g".into(), + s: Term::Iri("http://ex/s".into()), + p: Term::Iri("http://ex/p".into()), + o: Term::Literal{ lex:"v".into(), dt: None, lang: None }}, +]; + +write_file_with_options("out.r5tu", &quints, WriterOptions{ zstd: false, with_crc: true })?; +``` + +Streaming writer (append quads incrementally): + +```rust +use rdf5d::{StreamingWriter, Term, Quint}; +let mut w = StreamingWriter::new("out.r5tu", rdf5d::writer::WriterOptions{ zstd:false, with_crc:true }); +w.add(Quint{ id:"src/A".into(), gname:"g".into(), + s: Term::Iri("http://ex/s1".into()), + p: Term::Iri("http://ex/p1".into()), + o: Term::Iri("http://ex/o1".into()) })?; +w.finalize()?; +``` + +With feature `oxigraph`, write from an Oxigraph Graph: + +```rust +#[cfg(feature = "oxigraph")] +{ + use rdf5d::writer::{write_graph_from_oxigraph, WriterOptions}; + let graph = oxigraph::model::Graph::new(); + write_graph_from_oxigraph("out.r5tu", &graph, "src/A", "g", + WriterOptions{ zstd:true, with_crc:true })?; +} +``` + +## CLI (feature `oxigraph`) + +Build: `cargo build --features oxigraph --bin r5tu` + +Examples: +- Import graphs (multiple inputs) to one file: + - `r5tu build-graph --input a.ttl --input b.nt --output out.r5tu --graphname g` +- Import dataset (TriG/NQuads): + - `r5tu build-dataset --input data.trig --output out.r5tu --default-graphname default` +- Basic stats: + - `r5tu stat --file out.r5tu` + +```text +Flags: + --zstd compress triple blocks + --no-crc skip writing per-section/global CRCs +``` + +## Notes +- Reader returns empty lists for unknown ids/graphnames. +- CRCs and footer are verified during open when present. +- Sections are validated for bounds and overlap. 
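+
+For example, the first note means a lookup for an unknown source id simply yields no graphs. A small sketch reusing the reader API above (`no/such/source` is a placeholder):
+
+```rust
+let f = R5tuFile::open(std::path::Path::new("out.r5tu"))?;
+assert!(f.enumerate_by_id("no/such/source")?.is_empty());
+```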
diff --git a/rdf5d/benchmark_results.csv b/rdf5d/benchmark_results.csv new file mode 100644 index 0000000..bae08ea --- /dev/null +++ b/rdf5d/benchmark_results.csv @@ -0,0 +1,22 @@ +file,ttl_size,r5tu_size,rdflib_out_size,rdflib_load_s,rdflib_save_s,r5tu_build_s,r5tu_load_s,size_ratio_r5tu_over_ttl,size_ratio_r5tu_over_rdflib_out,save_speedup_r5tu_over_rdflib,load_speedup_r5tu_over_rdflib +/Users/gabe/src/open223/models.open223.info/models/compiled/pnnl-bdg2-1.ttl,3061209,730937,3061209,0.867950,1.032014,0.140497,0.017604,0.238774,0.238774,7.345446,49.303553 +/Users/gabe/src/open223/models.open223.info/models/compiled/lbnl-bdg4-1.ttl,820208,448202,820208,0.286288,0.190911,0.051350,0.014487,0.546449,0.546449,3.717843,19.761212 +/Users/gabe/src/open223/models.open223.info/models/compiled/HPL.ttl,51656,31690,51656,0.018676,0.018062,0.013926,0.006413,0.613481,0.613481,1.296980,2.912285 +/Users/gabe/src/open223/models.open223.info/models/compiled/nrel-example.ttl,392375,218107,392375,0.175360,0.122276,0.031120,0.010028,0.555864,0.555864,3.929158,17.487294 +/Users/gabe/src/open223/models.open223.info/models/compiled/pnnl-bdg3-2.ttl,2704420,698429,2704420,0.774090,0.950687,0.157390,0.017886,0.258255,0.258255,6.040308,43.278265 +/Users/gabe/src/open223/models.open223.info/models/compiled/guideline36-2021-A-2.ttl,15952,13406,15952,0.004873,0.003758,0.008479,0.008317,0.840396,0.840396,0.443179,0.585949 +/Users/gabe/src/open223/models.open223.info/models/compiled/NIST-IBAL.ttl,380894,197178,380894,0.137577,0.119746,0.032203,0.009386,0.517672,0.517672,3.718452,14.657618 +/Users/gabe/src/open223/models.open223.info/models/compiled/guideline36-2021-A-3.ttl,23384,17098,23384,0.006497,0.004939,0.008838,0.005787,0.731184,0.731184,0.558806,1.122712 +/Users/gabe/src/open223/models.open223.info/models/compiled/lbnl-bdg3-1.ttl,856015,389101,856015,0.396043,0.296157,0.077588,0.013207,0.454549,0.454549,3.817035,29.986819 +/Users/gabe/src/open223/models.open223.info/models/compiled/guideline36-2021-A-1.ttl,7606,8594,7606,0.003737,0.002254,0.007654,0.005791,1.129897,1.129897,0.294474,0.645274 +/Users/gabe/src/open223/models.open223.info/models/compiled/pnnl-bdg1-2.ttl,58335,25282,58335,0.018799,0.020442,0.012303,0.006971,0.433393,0.433393,1.661649,2.696740 +/Users/gabe/src/open223/models.open223.info/models/compiled/guideline36-2021-A-4.ttl,23983,17594,23983,0.008526,0.005827,0.009359,0.006392,0.733603,0.733603,0.622662,1.333839 +/Users/gabe/src/open223/models.open223.info/models/compiled/lbnl-example-radiant.ttl,90953,74252,90953,0.038713,0.032338,0.015868,0.006979,0.816378,0.816378,2.037938,5.546865 +/Users/gabe/src/open223/models.open223.info/models/compiled/guideline36-2021-A-7.ttl,14449,11947,14449,0.005946,0.003092,0.007935,0.005454,0.826839,0.826839,0.389610,1.090162 +/Users/gabe/src/open223/models.open223.info/models/compiled/design-patterns.ttl,26294,21036,26294,0.010760,0.007516,0.009707,0.005339,0.800030,0.800030,0.774209,2.015421 +/Users/gabe/src/open223/models.open223.info/models/compiled/guideline36-2021-A-8.ttl,14112,11634,14112,0.005360,0.003020,0.008133,0.006872,0.824405,0.824405,0.371251,0.779964 +/Users/gabe/src/open223/models.open223.info/models/compiled/guideline36-2021-A-9.ttl,90623,54118,90623,0.031241,0.025210,0.012836,0.008701,0.597177,0.597177,1.964008,3.590590 +/Users/gabe/src/open223/models.open223.info/models/compiled/NIST-HPL.ttl,51656,31695,51656,0.020620,0.018813,0.011034,0.007371,0.613578,0.613578,1.704963,2.797538 
+/Users/gabe/src/open223/models.open223.info/models/compiled/IBAL.ttl,380894,197173,380894,0.162082,0.117873,0.030142,0.009069,0.517658,0.517658,3.910563,17.872029 +/Users/gabe/src/open223/models.open223.info/models/compiled/nist-bdg1-1.ttl,120048,79133,120048,0.044391,0.039727,0.013707,0.007696,0.659178,0.659178,2.898311,5.767860 +/Users/gabe/src/open223/models.open223.info/models/compiled/scb-vrf.ttl,401387,160857,401387,0.153370,0.131198,0.035648,0.011592,0.400753,0.400753,3.680396,13.230703 diff --git a/rdf5d/scripts/benchmark_ttl_dir.py b/rdf5d/scripts/benchmark_ttl_dir.py new file mode 100755 index 0000000..55f5d38 --- /dev/null +++ b/rdf5d/scripts/benchmark_ttl_dir.py @@ -0,0 +1,486 @@ +#!/usr/bin/env python3 +""" +Benchmark .ttl files vs rdf5d. + +For each .ttl file in a directory, measures: + - Size: original TTL vs generated .r5tu + - Time: rdflib load/serialize vs r5tu build/stat (load) + +Requires: + - Python: rdflib (pip install rdflib) + - r5tu CLI built with --features oxigraph (built automatically if not found) + +Usage: + python scripts/benchmark_ttl_dir.py DIR [-o results.csv] [--recursive] + [--keep-artifacts] + [--r5tu PATH] [--enable-mmap] + +Notes: + - The script tries to use target/release/r5tu if present; otherwise it will + attempt to build it via `cargo build --release --features oxigraph[,mmap]`. + - Load time for rdf5d is measured by timing `r5tu stat --file `. + - rdflib save time is for Graph.serialize() to Turtle. +""" + +from __future__ import annotations + +import argparse +import csv +import gc +import os +import shlex +import subprocess +import sys +import tempfile +import time +from pathlib import Path +import statistics as stats +from typing import Iterable, List, Optional, Tuple + + +def _r5tu_has_oxigraph_cli(r5tu_path: Path) -> bool: + """Detect if the r5tu binary is built with the oxigraph CLI. + + The non-oxigraph build prints a message like: + "r5tu CLI requires the 'oxigraph' feature. Try: ..." + and does not expose clap subcommands. + """ + try: + proc = subprocess.run( + [r5tu_path.as_posix(), "--help"], capture_output=True, text=True + ) + except Exception: + return False + out = (proc.stdout or "") + (proc.stderr or "") + if "requires the 'oxigraph' feature" in out: + return False + # Heuristic: help should mention subcommands like build-graph/stat + return ("build-graph" in out) and ("stat" in out) + + +def find_or_build_r5tu(explicit_path: Optional[Path], enable_mmap: bool) -> Path: + """Find existing r5tu binary or build it. + + Returns path to the binary, raising RuntimeError on failure. + """ + if explicit_path: + p = explicit_path.resolve() + if not p.exists(): + raise RuntimeError(f"r5tu not found at {p}") + if not _r5tu_has_oxigraph_cli(p): + raise RuntimeError( + f"r5tu at {p} does not expose the oxigraph-based CLI (build with --features oxigraph)" + ) + if not _r5tu_supports_zstd(p): + raise RuntimeError( + f"r5tu at {p} does not support --zstd (build with --features zstd)." 
+            )
+        return p
+
+    # Prefer release build if present
+    candidate = Path("target/release/r5tu")
+    if candidate.exists() and _r5tu_has_oxigraph_cli(candidate) and _r5tu_supports_zstd(candidate):
+        return candidate.resolve()
+
+    # Try debug build
+    candidate = Path("target/debug/r5tu")
+    if candidate.exists() and _r5tu_has_oxigraph_cli(candidate) and _r5tu_supports_zstd(candidate):
+        return candidate.resolve()
+
+    # Build release with required features
+    features = ["oxigraph", "zstd"]
+    if enable_mmap:
+        features.append("mmap")
+    cmd = [
+        "cargo",
+        "build",
+        "--release",
+        "--features",
+        ",".join(features),
+    ]
+    print("Building r5tu:", " ".join(shlex.quote(x) for x in cmd))
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError(
+            "Failed to build r5tu. Ensure a Rust toolchain is installed and network access is available for crates."
+        ) from e
+
+    candidate = Path("target/release/r5tu")
+    if not candidate.exists() or not _r5tu_has_oxigraph_cli(candidate) or not _r5tu_supports_zstd(candidate):
+        raise RuntimeError("r5tu binary not found after build")
+    return candidate.resolve()
+
+
+def _r5tu_supports_zstd(r5tu_path: Path) -> bool:
+    """Try a tiny build-graph with --zstd to verify the feature is enabled."""
+    try:
+        with tempfile.TemporaryDirectory(prefix="r5tu_zstd_check_") as tdir:
+            tdirp = Path(tdir)
+            ttl = tdirp / "t.ttl"
+            out = tdirp / "t.r5tu"
+            ttl.write_text("""
+@prefix ex: <http://ex/> .
+ex:s ex:p ex:o .
+""".strip())
+            cmd = [
+                r5tu_path.as_posix(),
+                "build-graph",
+                "--input",
+                ttl.as_posix(),
+                "--output",
+                out.as_posix(),
+                "--format",
+                "turtle",
+                "--zstd",
+            ]
+            proc = subprocess.run(cmd, capture_output=True, text=True)
+            if proc.returncode != 0:
+                combined = (proc.stdout or "") + (proc.stderr or "")
+                return "zstd feature not enabled" not in combined
+            return out.exists() and out.stat().st_size > 0
+    except Exception:
+        return False
+
+
+def iter_ttl_files(root: Path, recursive: bool) -> Iterable[Path]:
+    exts = {".ttl", ".turtle"}
+    if recursive:
+        for p in root.rglob("*"):
+            if p.is_file() and p.suffix.lower() in exts:
+                yield p
+    else:
+        for p in root.iterdir():
+            if p.is_file() and p.suffix.lower() in exts:
+                yield p
+
+
+def measure_rdflib(ttl_path: Path, out_dir: Path, trials: int = 5) -> Tuple[List[float], List[float], int]:
+    try:
+        from rdflib import Graph
+    except Exception as e:  # pragma: no cover
+        raise RuntimeError(
+            "rdflib is required. 
Install with: pip install rdflib" + ) from e + + load_times: List[float] = [] + save_times: List[float] = [] + out_ttl_size = 0 + + for i in range(trials): + # Load + g = Graph() + t0 = time.perf_counter() + g.parse(ttl_path.as_posix(), format="turtle") + load_times.append(time.perf_counter() - t0) + + # Serialize + gc.collect() + out_ttl = out_dir / f"{ttl_path.stem}.rdflib.t{i}.out.ttl" + t1 = time.perf_counter() + g.serialize(destination=out_ttl.as_posix(), format="turtle") + save_times.append(time.perf_counter() - t1) + if out_ttl.exists(): + out_ttl_size = out_ttl.stat().st_size + + # Free graph for next iteration + del g + gc.collect() + + return load_times, save_times, out_ttl_size + + +def run_timed(cmd: List[str]) -> Tuple[float, int, str]: + """Run command, return (seconds, exit_code, stderr+stdout).""" + t0 = time.perf_counter() + proc = subprocess.run(cmd, capture_output=True, text=True) + dt = time.perf_counter() - t0 + out = (proc.stdout or "") + (proc.stderr or "") + return dt, proc.returncode, out + + +def measure_r5tu( + r5tu: Path, ttl_path: Path, out_dir: Path, trials: int = 5 +) -> Tuple[List[float], List[float], int, Optional[str]]: + # Build .r5tu + build_times: List[float] = [] + stat_times: List[float] = [] + r5_size: int = 0 + last_stat_err: Optional[str] = None + + for i in range(trials): + out_r5 = out_dir / f"{ttl_path.stem}.trial{i}.r5tu" + if out_r5.exists(): + try: + out_r5.unlink() + except Exception: + pass + build_cmd = [ + r5tu.as_posix(), + "build-graph", + "--input", + ttl_path.as_posix(), + "--output", + out_r5.as_posix(), + "--format", + "turtle", + "--zstd", + ] + build_s, code, build_out = run_timed(build_cmd) + if code != 0: + raise RuntimeError( + f"r5tu build-graph failed (code={code}) for {ttl_path}:\n{build_out}" + ) + if not out_r5.exists() or out_r5.stat().st_size == 0: + raise RuntimeError( + f"r5tu did not produce a valid file: {out_r5} (check that r5tu supports oxigraph CLI)" + ) + build_times.append(build_s) + r5_size = out_r5.stat().st_size + + # Stat (load) time + stat_cmd = [r5tu.as_posix(), "stat", "--file", out_r5.as_posix()] + stat_s, code, stat_out = run_timed(stat_cmd) + if code != 0: + last_stat_err = stat_out + stat_times.append(stat_s) + + return build_times, stat_times, r5_size, last_stat_err + + +def main() -> int: + ap = argparse.ArgumentParser(description="Benchmark Turtle vs rdf5d") + ap.add_argument("dir", type=Path, help="Directory containing .ttl/.turtle files") + ap.add_argument( + "-o", + "--output", + type=Path, + default=Path("benchmark_results.csv"), + help="CSV output file", + ) + ap.add_argument( + "--recursive", + action="store_true", + help="Recurse into subdirectories", + ) + ap.add_argument( + "--keep-artifacts", + action="store_true", + help="Keep generated .r5tu and rdflib .ttl outputs", + ) + ap.add_argument( + "--r5tu", + type=Path, + default=None, + help="Path to r5tu binary (if not provided, attempts to find/build)", + ) + ap.add_argument( + "--enable-mmap", + action="store_true", + help="Build r5tu with mmap feature for mmap-based loading", + ) + args = ap.parse_args() + + root = args.dir + if not root.exists() or not root.is_dir(): + print(f"Error: '{root}' is not a directory", file=sys.stderr) + return 2 + + try: + r5tu_bin = find_or_build_r5tu(args.r5tu, args.enable_mmap) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + return 2 + + ttl_files = list(iter_ttl_files(root, args.recursive)) + if not ttl_files: + print("No .ttl/.turtle files found", file=sys.stderr) + return 1 + + # 
Work dir for outputs + if args.keep_artifacts: + out_root = Path("bench_artifacts") + out_root.mkdir(parents=True, exist_ok=True) + temp_mgr = None + else: + temp_mgr = tempfile.TemporaryDirectory(prefix="rdf5d_bench_") + out_root = Path(temp_mgr.name) + + print(f"Using r5tu: {r5tu_bin}") + print(f"Artifacts dir: {out_root}") + results: List[dict] = [] + + for i, ttl in enumerate(ttl_files, 1): + try: + print(f"[{i}/{len(ttl_files)}] {ttl}") + ttl_size = ttl.stat().st_size + file_out_dir = out_root / ttl.stem + file_out_dir.mkdir(parents=True, exist_ok=True) + + # rdflib (repeat trials) + rdflib_load_list, rdflib_save_list, rdflib_out_size = measure_rdflib(ttl, file_out_dir, trials=5) + + # rdf5d via r5tu (repeat trials) + r5_build_list, r5_stat_list, r5_size, stat_err = measure_r5tu(r5tu_bin, ttl, file_out_dir, trials=5) + + # Compute means and stddevs + rdflib_load_mean = stats.mean(rdflib_load_list) + rdflib_load_std = stats.stdev(rdflib_load_list) if len(rdflib_load_list) > 1 else 0.0 + rdflib_save_mean = stats.mean(rdflib_save_list) + rdflib_save_std = stats.stdev(rdflib_save_list) if len(rdflib_save_list) > 1 else 0.0 + r5_build_mean = stats.mean(r5_build_list) + r5_build_std = stats.stdev(r5_build_list) if len(r5_build_list) > 1 else 0.0 + r5_stat_mean = stats.mean(r5_stat_list) + r5_stat_std = stats.stdev(r5_stat_list) if len(r5_stat_list) > 1 else 0.0 + + rdflib_total_list = [a + b for a, b in zip(rdflib_load_list, rdflib_save_list)] + r5_total_list = [a + b for a, b in zip(r5_stat_list, r5_build_list)] + rdflib_total_mean = stats.mean(rdflib_total_list) + rdflib_total_std = stats.stdev(rdflib_total_list) if len(rdflib_total_list) > 1 else 0.0 + r5_total_mean = stats.mean(r5_total_list) + r5_total_std = stats.stdev(r5_total_list) if len(r5_total_list) > 1 else 0.0 + + save_speedup = (rdflib_save_mean / r5_build_mean) if r5_build_mean > 0 else 0.0 + load_speedup = (rdflib_load_mean / r5_stat_mean) if r5_stat_mean > 0 else 0.0 + total_speedup = (rdflib_total_mean / r5_total_mean) if r5_total_mean > 0 else 0.0 + row = { + "file": ttl.as_posix(), + "ttl_size": ttl_size, + "rdflib_load_mean": rdflib_load_mean, + "rdflib_load_std": rdflib_load_std, + "rdflib_save_mean": rdflib_save_mean, + "rdflib_save_std": rdflib_save_std, + "rdflib_out_size": rdflib_out_size, + "r5tu_build_mean": r5_build_mean, + "r5tu_build_std": r5_build_std, + "r5tu_load_mean": r5_stat_mean, + "r5tu_load_std": r5_stat_std, + "rdflib_total_mean": rdflib_total_mean, + "rdflib_total_std": rdflib_total_std, + "r5tu_total_mean": r5_total_mean, + "r5tu_total_std": r5_total_std, + "r5tu_size": r5_size, + "size_ratio_r5tu_over_ttl": (r5_size / ttl_size) if ttl_size else 0.0, + "size_ratio_r5tu_over_rdflib_out": (r5_size / rdflib_out_size) if rdflib_out_size else 0.0, + "save_speedup_r5tu_over_rdflib": save_speedup, + "load_speedup_r5tu_over_rdflib": load_speedup, + "total_speedup_r5tu_over_rdflib": total_speedup, + } + if stat_err: + row["r5tu_stat_error"] = stat_err.strip().splitlines()[-1][:200] + results.append(row) + except Exception as e: + results.append({ + "file": ttl.as_posix(), + "error": str(e), + }) + + # Write CSV + fieldnames = [ + "file", + "ttl_size", + "r5tu_size", + "rdflib_out_size", + "rdflib_load_mean", + "rdflib_load_std", + "rdflib_save_mean", + "rdflib_save_std", + "r5tu_build_mean", + "r5tu_build_std", + "r5tu_load_mean", + "r5tu_load_std", + "rdflib_total_mean", + "rdflib_total_std", + "r5tu_total_mean", + "r5tu_total_std", + "size_ratio_r5tu_over_ttl", + "size_ratio_r5tu_over_rdflib_out", + 
"save_speedup_r5tu_over_rdflib", + "load_speedup_r5tu_over_rdflib", + "total_speedup_r5tu_over_rdflib", + "r5tu_stat_error", + "error", + ] + # Only include columns present + present = [f for f in fieldnames if any(f in r for r in results)] + with args.output.open("w", newline="") as f: + w = csv.DictWriter(f, fieldnames=present) + w.writeheader() + for r in results: + # Format floats to 6 decimals for readability + row_out = {} + for k in present: + v = r.get(k, "") + if isinstance(v, float): + row_out[k] = f"{v:.6f}" + else: + row_out[k] = v + w.writerow(row_out) + + print(f"Wrote results to {args.output}") + + # Summary + ok_rows = [r for r in results if "error" not in r] + n_ok = len(ok_rows) + n_err = len(results) - n_ok + if n_ok: + total_ttl = sum(r.get("ttl_size", 0) for r in ok_rows) + total_r5 = sum(r.get("r5tu_size", 0) for r in ok_rows) + total_rdflib_out = sum(r.get("rdflib_out_size", 0) for r in ok_rows) + + mean_ratio_vs_ttl = sum(r.get("size_ratio_r5tu_over_ttl", 0.0) for r in ok_rows) / n_ok + mean_ratio_vs_rdflib = ( + sum(r.get("size_ratio_r5tu_over_rdflib_out", 0.0) for r in ok_rows) / n_ok + ) if total_rdflib_out else 0.0 + + mean_rdflib_total = sum(r.get("rdflib_total_mean", 0.0) for r in ok_rows) / n_ok + mean_r5_total = sum(r.get("r5tu_total_mean", 0.0) for r in ok_rows) / n_ok + mean_total_speedup = ( + sum(r.get("total_speedup_r5tu_over_rdflib", 0.0) for r in ok_rows) / n_ok + ) + + mean_rdflib_load = sum(r.get("rdflib_load_mean", 0.0) for r in ok_rows) / n_ok + mean_r5_load = sum(r.get("r5tu_load_mean", 0.0) for r in ok_rows) / n_ok + mean_load_speedup = ( + sum(r.get("load_speedup_r5tu_over_rdflib", 0.0) for r in ok_rows) / n_ok + ) + + mean_rdflib_save = sum(r.get("rdflib_save_mean", 0.0) for r in ok_rows) / n_ok + mean_r5_save = sum(r.get("r5tu_build_mean", 0.0) for r in ok_rows) / n_ok + mean_save_speedup = ( + sum(r.get("save_speedup_r5tu_over_rdflib", 0.0) for r in ok_rows) / n_ok + ) + + pct_saved_vs_ttl = (1 - (total_r5 / total_ttl)) * 100 if total_ttl else 0.0 + pct_saved_vs_rdflib = ( + (1 - (total_r5 / total_rdflib_out)) * 100 if total_rdflib_out else 0.0 + ) + + print("\nSummary (rdf5d vs rdflib):") + print(f" Files processed: {n_ok} ok, {n_err} errors") + print(f" Size total (TTL → R5TU): {total_ttl} → {total_r5} bytes ({pct_saved_vs_ttl:.2f}% saved)") + if total_rdflib_out: + print( + f" Size total (rdflib TTL → R5TU): {total_rdflib_out} → {total_r5} bytes ({pct_saved_vs_rdflib:.2f}% saved)" + ) + print(f" Mean size ratio R5TU/TTL: {mean_ratio_vs_ttl:.3f}") + if total_rdflib_out: + print(f" Mean size ratio R5TU/rdflib TTL: {mean_ratio_vs_rdflib:.3f}") + print( + f" Load+Save total mean (rdflib vs r5tu): {mean_rdflib_total:.3f}s vs {mean_r5_total:.3f}s (speedup {mean_total_speedup:.2f}×)" + ) + print( + f" Load mean (rdflib vs r5tu): {mean_rdflib_load:.3f}s vs {mean_r5_load:.3f}s (speedup {mean_load_speedup:.2f}×)" + ) + print( + f" Save mean (rdflib vs r5tu): {mean_rdflib_save:.3f}s vs {mean_r5_save:.3f}s (speedup {mean_save_speedup:.2f}×)" + ) + else: + print("\nSummary: no successful runs to summarize") + if not args.keep_artifacts and temp_mgr is not None: + temp_mgr.cleanup() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/rdf5d/src/bin/r5tu.rs b/rdf5d/src/bin/r5tu.rs new file mode 100644 index 0000000..93e0315 --- /dev/null +++ b/rdf5d/src/bin/r5tu.rs @@ -0,0 +1,371 @@ +use clap::{Args, Parser, Subcommand, ValueEnum}; +#[cfg(feature = "oxigraph")] +use oxigraph::io::RdfFormat; +#[cfg(feature = "oxigraph")] +use 
oxigraph::model::GraphNameRef;
+#[cfg(feature = "oxigraph")]
+use oxigraph::store::Store;
+#[cfg(feature = "oxigraph")]
+use std::fs::File;
+#[cfg(feature = "oxigraph")]
+use std::io::BufReader;
+use std::path::PathBuf;
+#[cfg(feature = "oxigraph")]
+use std::time::Instant;
+
+#[cfg(feature = "oxigraph")]
+use rdf5d::writer::WriterOptions;
+#[cfg(feature = "oxigraph")]
+use rdf5d::{Quint, R5tuFile, StreamingWriter, Term};
+
+#[derive(Clone, Copy, ValueEnum)]
+enum GraphFmt {
+    Turtle,
+    Ntriples,
+    Rdfxml,
+}
+#[derive(Clone, Copy, ValueEnum)]
+enum DatasetFmt {
+    Trig,
+    Nquads,
+}
+
+#[derive(Parser)]
+#[command(name = "r5tu", version, about = "R5TU builder/stat CLI")]
+struct Cli {
+    #[command(subcommand)]
+    command: Commands,
+}
+
+#[derive(Subcommand)]
+enum Commands {
+    BuildGraph(BuildGraphArgs),
+    BuildDataset(BuildDatasetArgs),
+    Stat(StatArgs),
+}
+
+#[derive(Args)]
+struct BuildGraphArgs {
+    #[arg(long = "input", required = true)]
+    input: Vec<PathBuf>,
+    #[arg(long = "output")]
+    output: PathBuf,
+    #[arg(long = "format", value_enum)]
+    format: Option<GraphFmt>,
+    #[arg(long = "id")]
+    id: Option<String>,
+    #[arg(long = "graphname")]
+    graphname: Option<String>,
+    #[arg(long = "zstd", default_value_t = false)]
+    zstd: bool,
+    #[arg(long = "no-crc", default_value_t = false)]
+    no_crc: bool,
+}
+
+#[derive(Args)]
+struct BuildDatasetArgs {
+    #[arg(long = "input", required = true)]
+    input: Vec<PathBuf>,
+    #[arg(long = "output")]
+    output: PathBuf,
+    #[arg(long = "format", value_enum)]
+    format: Option<DatasetFmt>,
+    #[arg(long = "id")]
+    id: Option<String>,
+    #[arg(long = "default-graphname")]
+    default_graphname: Option<String>,
+    #[arg(long = "zstd", default_value_t = false)]
+    zstd: bool,
+    #[arg(long = "no-crc", default_value_t = false)]
+    no_crc: bool,
+}
+
+#[derive(Args)]
+struct StatArgs {
+    #[arg(long = "file")]
+    file: PathBuf,
+    #[arg(long = "verbose", default_value_t = false)]
+    verbose: bool,
+    #[arg(long = "graphname")]
+    graphname: Option<String>,
+    #[arg(long = "list", default_value_t = false)]
+    list: bool,
+    #[cfg(feature = "mmap")]
+    #[arg(
+        long = "no-mmap",
+        default_value_t = false,
+        help = "Disable mmap and read into memory"
+    )]
+    no_mmap: bool,
+}
+
+#[cfg(feature = "oxigraph")]
+fn infer_graph_rdf_format(ext: &str) -> Option<RdfFormat> {
+    match ext.to_ascii_lowercase().as_str() {
+        "nt" | "ntriples" => Some(RdfFormat::NTriples),
+        "ttl" | "turtle" => Some(RdfFormat::Turtle),
+        "rdf" | "xml" | "rdfxml" => Some(RdfFormat::RdfXml),
+        _ => None,
+    }
+}
+#[cfg(feature = "oxigraph")]
+fn infer_dataset_rdf_format(ext: &str) -> Option<RdfFormat> {
+    match ext.to_ascii_lowercase().as_str() {
+        "nq" | "nquads" => Some(RdfFormat::NQuads),
+        "trig" => Some(RdfFormat::TriG),
+        _ => None,
+    }
+}
+
+#[cfg(feature = "oxigraph")]
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let cli = Cli::parse();
+    match cli.command {
+        Commands::BuildGraph(args) => {
+            let opts = WriterOptions {
+                zstd: args.zstd,
+                with_crc: !args.no_crc,
+            };
+            let mut w = StreamingWriter::new(&args.output, opts);
+            let start = Instant::now();
+            for input in args.input {
+                let f = File::open(&input)?;
+                let mut rdr = BufReader::new(f);
+                let rfmt: RdfFormat = match args.format {
+                    Some(GraphFmt::Turtle) => RdfFormat::Turtle,
+                    Some(GraphFmt::Ntriples) => RdfFormat::NTriples,
+                    Some(GraphFmt::Rdfxml) => RdfFormat::RdfXml,
+                    None => infer_graph_rdf_format(
+                        input.extension().and_then(|e| e.to_str()).unwrap_or(""),
+                    )
+                    .unwrap_or(RdfFormat::Turtle),
+                };
+                // Load into store via BulkLoader (explicit fast path)
+                let store = Store::new()?;
+                let mut loader = store.bulk_loader();
+                loader.load_from_reader(rfmt,
&mut rdr)?; + loader.commit()?; + let gname_auto = + rdf5d::writer::detect_graphname_from_store(&store).unwrap_or_else(|| { + args.graphname + .clone() + .unwrap_or_else(|| "default".to_string()) + }); + let id = args + .id + .clone() + .unwrap_or_else(|| input.to_string_lossy().to_string()); + // Stream loaded triples from default graph into our writer + let mut n = 0usize; + for q in store.quads_for_pattern(None, None, None, Some(GraphNameRef::DefaultGraph)) + { + let q = q?; + n += 1; + let s = match q.subject { + oxigraph::model::NamedOrBlankNode::NamedNode(nm) => { + Term::Iri(nm.as_str().to_string()) + } + oxigraph::model::NamedOrBlankNode::BlankNode(b) => { + Term::BNode(format!("_:{}", b.as_str())) + } + }; + let p = Term::Iri(q.predicate.as_str().to_string()); + let o = match q.object { + oxigraph::model::Term::NamedNode(nm) => Term::Iri(nm.as_str().to_string()), + oxigraph::model::Term::BlankNode(b) => { + Term::BNode(format!("_:{}", b.as_str())) + } + oxigraph::model::Term::Literal(l) => { + let lex = l.value().to_string(); + if let Some(lang) = l.language() { + Term::Literal { + lex, + dt: None, + lang: Some(lang.to_string()), + } + } else { + Term::Literal { + lex, + dt: Some(l.datatype().as_str().to_string()), + lang: None, + } + } + } + }; + w.add(Quint { + id: id.clone(), + s, + p, + o, + gname: gname_auto.clone(), + })?; + } + println!( + "Added graph id='{}' graphname='{}' ({} triples) from '{}'", + id, + gname_auto, + n, + input.display() + ); + } + w.finalize()?; + eprintln!("built in {:?}", start.elapsed()); + } + Commands::BuildDataset(args) => { + let default_g = args + .default_graphname + .clone() + .unwrap_or_else(|| "default".to_string()); + let opts = WriterOptions { + zstd: args.zstd, + with_crc: !args.no_crc, + }; + let mut w = StreamingWriter::new(&args.output, opts); + let start = Instant::now(); + for input in args.input { + let f = File::open(&input)?; + let mut rdr = BufReader::new(f); + let rfmt: RdfFormat = match args.format { + Some(DatasetFmt::Trig) => RdfFormat::TriG, + Some(DatasetFmt::Nquads) => RdfFormat::NQuads, + None => infer_dataset_rdf_format( + input.extension().and_then(|e| e.to_str()).unwrap_or(""), + ) + .unwrap_or(RdfFormat::NQuads), + }; + let store = Store::new()?; + store.load_from_reader(rfmt, &mut rdr)?; + let id = args + .id + .clone() + .unwrap_or_else(|| input.to_string_lossy().to_string()); + let mut n = 0usize; + for q in store.quads_for_pattern(None, None, None, None) { + let q = q?; + n += 1; + let s = match q.subject { + oxigraph::model::NamedOrBlankNode::NamedNode(nm) => { + Term::Iri(nm.as_str().to_string()) + } + oxigraph::model::NamedOrBlankNode::BlankNode(b) => { + Term::BNode(format!("_:{}", b.as_str())) + } + }; + let p = Term::Iri(q.predicate.as_str().to_string()); + let o = match q.object { + oxigraph::model::Term::NamedNode(nm) => Term::Iri(nm.as_str().to_string()), + oxigraph::model::Term::BlankNode(b) => { + Term::BNode(format!("_:{}", b.as_str())) + } + oxigraph::model::Term::Literal(l) => { + let lex = l.value().to_string(); + if let Some(lang) = l.language() { + Term::Literal { + lex, + dt: None, + lang: Some(lang.to_string()), + } + } else { + Term::Literal { + lex, + dt: Some(l.datatype().as_str().to_string()), + lang: None, + } + } + } + }; + let gname = match q.graph_name { + oxigraph::model::GraphName::DefaultGraph => default_g.clone(), + oxigraph::model::GraphName::NamedNode(nm) => nm.as_str().to_string(), + oxigraph::model::GraphName::BlankNode(b) => format!("_:{}", b.as_str()), + }; + w.add(Quint { + 
id: id.clone(), + s, + p, + o, + gname, + })?; + } + println!( + "Added dataset id='{}' quads={} from '{}'", + id, + n, + input.display() + ); + } + w.finalize()?; + eprintln!("built in {:?}", start.elapsed()); + } + Commands::Stat(args) => { + let file = args.file; + let f = match { R5tuFile::open(&file) } { + Ok(f) => f, + Err(e) => { + eprintln!( + "stat: failed to open '{}': {}\nHint: Use 'build-graph' or 'build-dataset' to produce an .r5tu file first.", + file.display(), + e + ); + std::process::exit(2); + } + }; + let verbose = args.verbose; + let list = args.list; + let filter_g = args.graphname; + let toc = f.toc(); + eprintln!("sections: {}", toc.len()); + if verbose { + let h = f.header(); + eprintln!( + "header.magic='{}' version={} flags=0x{:04x} created_unix={} toc_off={} toc_len={}", + std::str::from_utf8(&h.magic).unwrap_or("????"), + h.version_u16, + h.flags_u16, + h.created_unix64, + h.toc_off_u64, + h.toc_len_u32 + ); + for (i, e) in toc.iter().enumerate() { + eprintln!( + " [{}] kind={:?} off={} len={} crc={}", + i, e.kind, e.section.off, e.section.len, e.crc32_u32 + ); + } + } + let start = Instant::now(); + let mut n_triples = 0u64; + let graphs = if let Some(ref gname) = filter_g { + f.enumerate_by_graphname(gname)? + } else { + f.enumerate_all()? + }; + let n_graphs = graphs.len() as u64; + for gr in &graphs { + n_triples += gr.n_triples; + } + eprintln!( + "graphs: {} triples: {} in {:?}", + n_graphs, + n_triples, + start.elapsed() + ); + if list { + for gr in graphs { + println!( + "gid={} id='{}' graphname='{}' n_triples={}", + gr.gid, gr.id, gr.graphname, gr.n_triples + ); + } + } + } + } + Ok(()) +} + +#[cfg(not(feature = "oxigraph"))] +fn main() { + eprintln!( + "r5tu CLI requires the 'oxigraph' feature. Try: cargo run --features oxigraph --bin r5tu -- help" + ); +} diff --git a/rdf5d/src/header.rs b/rdf5d/src/header.rs new file mode 100644 index 0000000..e7e497e --- /dev/null +++ b/rdf5d/src/header.rs @@ -0,0 +1,181 @@ +//! Header, TOC, and section kinds for R5TU files (ARCH.md §1). + +/// Enumerates the kinds of sections in an R5TU file. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(u16)] +pub enum SectionKind { + TermDict = 1, + IdDict = 2, + GNameDict = 3, + GDir = 4, + IdxId2Gid = 5, + IdxGName2Gid = 6, + IdxPair2Gid = 7, + TripleBlocks = 8, +} + +impl SectionKind { + /// Convert a little‑endian `u16` value into a kind, if recognized. + pub fn from_u16(v: u16) -> Option { + use SectionKind::*; + Some(match v { + 1 => TermDict, + 2 => IdDict, + 3 => GNameDict, + 4 => GDir, + 5 => IdxId2Gid, + 6 => IdxGName2Gid, + 7 => IdxPair2Gid, + 8 => TripleBlocks, + _ => return None, + }) + } +} + +/// Byte span for a section. +#[derive(Debug, Clone, Copy)] +pub struct Section { + pub off: u64, + pub len: u64, +} + +/// Entry in the table of contents mapping a kind to its section. +#[derive(Debug, Clone, Copy)] +pub struct TocEntry { + pub kind: SectionKind, + pub section: Section, + pub crc32_u32: u32, // 0 if absent +} + +/// Parsed fixed‑size file header. +#[derive(Debug, Clone, Copy)] +pub struct Header { + pub magic: [u8; 4], + pub version_u16: u16, + pub flags_u16: u16, + pub created_unix64: u64, + pub toc_off_u64: u64, + pub toc_len_u32: u32, + pub reserved_u32: u32, +} + +impl Header { + /// Parse a header from the first 32 bytes of `buf`. 
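+    ///
+    /// A minimal sketch of a successful parse (the version value is just an
+    /// illustrative choice; the remaining bytes are left zeroed):
+    ///
+    /// ```
+    /// use rdf5d::header::Header;
+    ///
+    /// let mut buf = [0u8; 32];
+    /// buf[0..4].copy_from_slice(b"R5TU");
+    /// buf[4..6].copy_from_slice(&1u16.to_le_bytes()); // version
+    /// let h = Header::parse(&buf).expect("32 bytes is enough");
+    /// assert_eq!(&h.magic, b"R5TU");
+    /// assert_eq!(h.version_u16, 1);
+    /// ```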
+    pub fn parse(buf: &[u8]) -> Option<Self> {
+        if buf.len() < 32 {
+            return None;
+        }
+        let mut magic = [0u8; 4];
+        magic.copy_from_slice(&buf[0..4]);
+        let version_u16 = u16::from_le_bytes([buf[4], buf[5]]);
+        let flags_u16 = u16::from_le_bytes([buf[6], buf[7]]);
+        let created_unix64 = u64::from_le_bytes([
+            buf[8], buf[9], buf[10], buf[11], buf[12], buf[13], buf[14], buf[15],
+        ]);
+        let toc_off_u64 = u64::from_le_bytes([
+            buf[16], buf[17], buf[18], buf[19], buf[20], buf[21], buf[22], buf[23],
+        ]);
+        let toc_len_u32 = u32::from_le_bytes([buf[24], buf[25], buf[26], buf[27]]);
+        let reserved_u32 = u32::from_le_bytes([buf[28], buf[29], buf[30], buf[31]]);
+        Some(Header {
+            magic,
+            version_u16,
+            flags_u16,
+            created_unix64,
+            toc_off_u64,
+            toc_len_u32,
+            reserved_u32,
+        })
+    }
+}
+
+/// Parse the TOC entries referenced by `hdr`.
+pub fn parse_toc(buf: &[u8], hdr: &Header) -> Option<Vec<TocEntry>> {
+    // Each entry is 32 bytes; TOC starts at hdr.toc_off_u64
+    let toc_off = hdr.toc_off_u64 as usize;
+    let n = hdr.toc_len_u32 as usize;
+    let need = toc_off.checked_add(n.checked_mul(32)?)?;
+    if need > buf.len() {
+        return None;
+    }
+
+    let mut out = Vec::with_capacity(n);
+    for i in 0..n {
+        let off = toc_off + i * 32;
+        let kind_u16 = u16::from_le_bytes([buf[off], buf[off + 1]]);
+        let kind = SectionKind::from_u16(kind_u16)?;
+        // skip reserved_u16
+        let off_u64 = u64::from_le_bytes([
+            buf[off + 4],
+            buf[off + 5],
+            buf[off + 6],
+            buf[off + 7],
+            buf[off + 8],
+            buf[off + 9],
+            buf[off + 10],
+            buf[off + 11],
+        ]);
+        let len_u64 = u64::from_le_bytes([
+            buf[off + 12],
+            buf[off + 13],
+            buf[off + 14],
+            buf[off + 15],
+            buf[off + 16],
+            buf[off + 17],
+            buf[off + 18],
+            buf[off + 19],
+        ]);
+
+        // crc32_u32 at off+20..24 (optional), reserved_u32 off+24..28 (ignored here)
+        let crc32_u32 =
+            u32::from_le_bytes([buf[off + 20], buf[off + 21], buf[off + 22], buf[off + 23]]);
+        out.push(TocEntry {
+            kind,
+            section: Section {
+                off: off_u64,
+                len: len_u64,
+            },
+            crc32_u32,
+        });
+    }
+    Some(out)
+}
+
+/// True if `section` lies entirely within a buffer of `buf_len` bytes.
+pub fn section_in_bounds(buf_len: usize, section: Section) -> bool {
+    let start = section.off as usize;
+    let len = section.len as usize;
+    start <= buf_len && start.saturating_add(len) <= buf_len
+}
+
+/// Compute IEEE CRC‑32.
+pub fn crc32_ieee(data: &[u8]) -> u32 {
+    let mut crc: u32 = 0xFFFF_FFFF;
+    for &b in data {
+        let mut x = (crc ^ (b as u32)) & 0xFF;
+        for _ in 0..8 {
+            let lsb = x & 1;
+            x >>= 1;
+            if lsb != 0 {
+                x ^= 0xEDB88320;
+            }
+        }
+        crc = (crc >> 8) ^ x;
+    }
+    crc ^ 0xFFFF_FFFF
+}
+
+/// Parse the optional 16‑byte footer containing the global CRC and magic.
+pub fn parse_footer(buf: &[u8]) -> Option<(u32, [u8; 12])> {
+    if buf.len() < 16 {
+        return None;
+    }
+    let base = buf.len() - 16;
+    let mut magic = [0u8; 12];
+    magic.copy_from_slice(&buf[base + 4..base + 16]);
+    if &magic != b"R5TU_ENDMARK" {
+        return None;
+    }
+    let crc = u32::from_le_bytes([buf[base], buf[base + 1], buf[base + 2], buf[base + 3]]);
+    Some((crc, magic))
+}
diff --git a/rdf5d/src/lib.rs b/rdf5d/src/lib.rs
new file mode 100644
index 0000000..9972162
--- /dev/null
+++ b/rdf5d/src/lib.rs
@@ -0,0 +1,55 @@
+//! rdf5d — Compact, mmap‑friendly storage for RDF 5‑tuples (R5TU).
+//!
+//! This crate provides a tiny reader and writer for the on‑disk format
+//! described in ARCH.md. It focuses on fast, bounded reads suitable for
+//! memory‑mapped access and simple, deterministic file production.
+//!
+//! Quick start: write a file
+//!
+//! ```no_run
+//! use rdf5d::{write_file, Quint, Term};
+//! use std::path::PathBuf;
+//!
+//! let path = PathBuf::from("example.r5tu");
+//! let quads = vec![
+//!     Quint {
+//!         id: "dataset:1".into(),
+//!         s: Term::Iri("http://example.org/Alice".into()),
+//!         p: Term::Iri("http://xmlns.com/foaf/0.1/name".into()),
+//!         o: Term::Literal { lex: "Alice".into(), dt: None, lang: None },
+//!         gname: "http://example.org/graph".into(),
+//!     },
+//! ];
+//!
+//! write_file(&path, &quads).expect("write ok");
+//! ```
+//!
+//! Read it back and enumerate graph groups
+//!
+//! ```no_run
+//! use rdf5d::R5tuFile;
+//! use std::path::Path;
+//!
+//! let f = R5tuFile::open(Path::new("example.r5tu")).expect("open");
+//! // List groups by a dataset id
+//! let hits = f.enumerate_by_id("dataset:1").expect("lookup");
+//! for g in hits {
+//!     println!("gid={} graph={} triples={} id={}", g.gid, g.graphname, g.n_triples, g.id);
+//! }
+//! ```
+//!
+//! See `ARCH.md` for details on the layout and terminology.
+
+pub mod header;
+pub mod reader;
+pub mod update;
+pub mod writer;
+
+pub use reader::{GraphRef, R5tuFile};
+pub use update::{replace_graph, replace_graph_with_options};
+pub use writer::{
+    Quint, StreamingWriter, Term, WriterOptions, write_file, write_file_with_options,
+};
+
+/// Crate‑level result type using the reader error.
+pub type Result<T> = std::result::Result<T, reader::R5Error>;
diff --git a/rdf5d/src/reader.rs b/rdf5d/src/reader.rs
new file mode 100644
index 0000000..e98be16
--- /dev/null
+++ b/rdf5d/src/reader.rs
@@ -0,0 +1,1654 @@
+//! Reader for R5TU files: open, inspect sections, and iterate triples.
+//!
+//! The primary entry point is [`R5tuFile`]. Use it to open a `.r5tu`
+//! file and query logical graph groups by dataset id and graph name.
+//! See `ARCH.md` §4 for query semantics and identifiers.
+//!
+//! Basic example
+//!
+//! ```no_run
+//! use rdf5d::R5tuFile;
+//! use std::path::Path;
+//!
+//! let f = R5tuFile::open(Path::new("example.r5tu")).expect("open");
+//! if let Some(gr) = f.resolve_gid("dataset:1", "http://example.org/graph").unwrap() {
+//!     let mut n = 0u64;
+//!     for (s, p, o) in f.triples_ids(gr.gid).unwrap() { n += 1; }
+//!     assert_eq!(n, gr.n_triples);
+//! }
+//! ```
+
+use std::{fmt, fs, path::Path};
+
+use crate::header::{
+    Header, Section, SectionKind, TocEntry, crc32_ieee, parse_footer, parse_toc, section_in_bounds,
+};
+
+/// Errors that can arise when parsing or validating an R5TU file.
+#[derive(Debug)]
+pub enum R5Error {
+    /// Underlying I/O error.
+    Io(std::io::Error),
+    /// Structural problem with inputs or unsupported feature.
+    Invalid(&'static str),
+    /// The file failed an integrity or bounds check.
+    Corrupt(String),
+}
+
+impl fmt::Display for R5Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            R5Error::Io(e) => write!(f, "{}", e),
+            R5Error::Invalid(m) => write!(f, "{}", m),
+            R5Error::Corrupt(m) => write!(f, "{}", m),
+        }
+    }
+}
+impl std::error::Error for R5Error {}
+impl From<std::io::Error> for R5Error {
+    fn from(e: std::io::Error) -> Self {
+        R5Error::Io(e)
+    }
+}
+
+pub type Result<T> = std::result::Result<T, R5Error>;
+
+/// Lightweight description of a logical graph group inside an R5TU file.
+#[derive(Debug, Clone)]
+pub struct GraphRef {
+    /// Stable group id within the file.
+    pub gid: u64,
+    /// Dataset identifier (id) for the group (as stored in the id dictionary).
+    pub id: String,
+    /// Graph name (as stored in the graph name dictionary).
+    pub graphname: String,
+    /// Number of triples in this group.
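+    /// (Matches the count yielded by iterating [`R5tuFile::triples_ids`]
+    /// for this group's `gid`, as in the module-level example above.)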
+    pub n_triples: u64,
+}
+
+#[derive(Debug)]
+enum Backing {
+    Owned(Vec<u8>),
+    #[cfg(feature = "mmap")]
+    Mmap(memmap2::Mmap),
+}
+
+impl Backing {
+    fn as_bytes(&self) -> &[u8] {
+        match self {
+            Backing::Owned(v) => v.as_slice(),
+            #[cfg(feature = "mmap")]
+            Backing::Mmap(m) => m,
+        }
+    }
+}
+
+/// Opened R5TU file. Provides lookups and triple iteration.
+#[derive(Debug)]
+pub struct R5tuFile {
+    backing: Backing,
+    header: Header,
+    toc: Vec<TocEntry>,
+    // sections
+    id_dict: Dict,
+    gname_dict: Dict,
+    term_dict: TermDict,
+    gdir: Section,
+    idx_id2gid: Section,
+    idx_gname2gid: Section,
+    idx_pair2gid: Section,
+    #[allow(dead_code)]
+    triple_blocks: Section,
+}
+
+impl R5tuFile {
+    #[inline]
+    fn bytes(&self) -> &[u8] {
+        self.backing.as_bytes()
+    }
+    /// Open and validate an R5TU file from disk.
+    ///
+    /// Performs bounds checks, TOC validation, and optional section/global CRCs.
+    /// Returns a handle capable of dictionary lookups and triple iteration.
+    pub fn open(path: &Path) -> Result<Self> {
+        let data = fs::read(path)?;
+        let header = Header::parse(&data).ok_or(R5Error::Invalid("short or invalid header"))?;
+        if &header.magic != b"R5TU" {
+            return Err(R5Error::Invalid("bad magic"));
+        }
+        // basic header sanity
+        if header.toc_off_u64 as usize > data.len() {
+            return Err(R5Error::Corrupt("TOC offset out of bounds".into()));
+        }
+        let toc =
+            parse_toc(&data, &header).ok_or_else(|| R5Error::Corrupt("TOC parse failed".into()))?;
+        // verify sections lie within file and optional CRCs
+        for e in &toc {
+            if !section_in_bounds(data.len(), e.section) {
+                return Err(R5Error::Corrupt(format!(
+                    "section {:?} out of bounds",
+                    e.kind
+                )));
+            }
+            if e.crc32_u32 != 0 {
+                let start = e.section.off as usize;
+                let end = start + e.section.len as usize;
+                if end > data.len() {
+                    return Err(R5Error::Corrupt("section crc OOB".into()));
+                }
+                let got = crc32_ieee(&data[start..end]);
+                if got != e.crc32_u32 {
+                    return Err(R5Error::Corrupt("section CRC mismatch".into()));
+                }
+            }
+        }
+        // Validate TOC ordering by offset and detect overlaps
+        let mut spans: Vec<(u64, u64)> =
+            toc.iter().map(|e| (e.section.off, e.section.len)).collect();
+        spans.sort_by_key(|(off, _)| *off);
+        for w in spans.windows(2) {
+            let (a_off, a_len) = w[0];
+            let (b_off, _b_len) = w[1];
+            if a_off + a_len > b_off {
+                return Err(R5Error::Corrupt("TOC sections overlap or unsorted".into()));
+            }
+        }
+        // Resolve required sections
+        let need = |k: SectionKind| -> Result<Section> {
+            parse_toc(&data, &header)
+                .and_then(|t| t.into_iter().find(|e| e.kind == k).map(|e| e.section))
+                .ok_or(R5Error::Invalid("missing required section"))
+        };
+        let id_sec = need(SectionKind::IdDict)?;
+        let gn_sec = need(SectionKind::GNameDict)?;
+        let term_sec = need(SectionKind::TermDict)?;
+        let gdir = need(SectionKind::GDir)?;
+        let idx_id2gid = need(SectionKind::IdxId2Gid)?;
+        let idx_gname2gid = need(SectionKind::IdxGName2Gid)?;
+        let idx_pair2gid = need(SectionKind::IdxPair2Gid)?;
+        let triple_blocks = need(SectionKind::TripleBlocks)?;
+
+        let id_dict = Dict::parse(&data, id_sec)?;
+        let gname_dict = Dict::parse(&data, gn_sec)?;
+        let term_dict = TermDict::parse(&data, term_sec)?;
+
+        // Footer/global CRC
+        if let Some((footer_crc, magic)) = parse_footer(&data) {
+            if &magic != b"R5TU_ENDMARK" {
+                return Err(R5Error::Corrupt("bad footer magic".into()));
+            }
+            let got = crc32_ieee(&data[..data.len() - 16]);
+            if got != footer_crc {
+                return Err(R5Error::Corrupt("global CRC mismatch".into()));
+            }
+        }
+        Ok(Self {
+            backing: Backing::Owned(data),
+            header,
+            toc,
+            id_dict,
+            gname_dict,
+            term_dict,
+            gdir,
+            idx_id2gid,
+            idx_gname2gid,
+            idx_pair2gid,
+            triple_blocks,
+        })
+    }
+
+    #[cfg(feature = "mmap")]
+    /// Open and validate an R5TU file using `memmap2` for zero‑copy access.
+    ///
+    /// Enabled with the `mmap` feature.
+    pub fn open_mmap(path: &Path) -> Result<Self> {
+        use std::fs::File;
+        let f = File::open(path)?;
+        let mmap = unsafe { memmap2::MmapOptions::new().map(&f) }.map_err(R5Error::Io)?;
+        let data: &[u8] = &mmap;
+        let header = Header::parse(data).ok_or(R5Error::Invalid("short or invalid header"))?;
+        if &header.magic != b"R5TU" {
+            return Err(R5Error::Invalid("bad magic"));
+        }
+        if header.toc_off_u64 as usize > data.len() {
+            return Err(R5Error::Corrupt("TOC offset out of bounds".into()));
+        }
+        let toc =
+            parse_toc(data, &header).ok_or_else(|| R5Error::Corrupt("TOC parse failed".into()))?;
+        for e in &toc {
+            if !section_in_bounds(data.len(), e.section) {
+                return Err(R5Error::Corrupt(format!(
+                    "section {:?} out of bounds",
+                    e.kind
+                )));
+            }
+            if e.crc32_u32 != 0 {
+                let start = e.section.off as usize;
+                let end = start + e.section.len as usize;
+                if end > data.len() {
+                    return Err(R5Error::Corrupt("section crc OOB".into()));
+                }
+                let got = crc32_ieee(&data[start..end]);
+                if got != e.crc32_u32 {
+                    return Err(R5Error::Corrupt("section CRC mismatch".into()));
+                }
+            }
+        }
+        // Resolve sections
+        let need = |k: SectionKind| -> Result<Section> {
+            parse_toc(data, &header)
+                .and_then(|t| t.into_iter().find(|e| e.kind == k).map(|e| e.section))
+                .ok_or(R5Error::Invalid("missing required section"))
+        };
+        let id_sec = need(SectionKind::IdDict)?;
+        let gn_sec = need(SectionKind::GNameDict)?;
+        let term_sec = need(SectionKind::TermDict)?;
+        let gdir = need(SectionKind::GDir)?;
+        let idx_id2gid = need(SectionKind::IdxId2Gid)?;
+        let idx_gname2gid = need(SectionKind::IdxGName2Gid)?;
+        let idx_pair2gid = need(SectionKind::IdxPair2Gid)?;
+        let triple_blocks = need(SectionKind::TripleBlocks)?;
+        // Footer/global CRC if present
+        if let Some((footer_crc, magic)) = parse_footer(data) {
+            if &magic != b"R5TU_ENDMARK" {
+                return Err(R5Error::Corrupt("bad footer magic".into()));
+            }
+            let got = crc32_ieee(&data[..data.len() - 16]);
+            if got != footer_crc {
+                return Err(R5Error::Corrupt("global CRC mismatch".into()));
+            }
+        }
+        let id_dict = Dict::parse(data, id_sec)?;
+        let gname_dict = Dict::parse(data, gn_sec)?;
+        let term_dict = TermDict::parse(data, term_sec)?;
+        Ok(Self {
+            backing: Backing::Mmap(mmap),
+            header,
+            toc,
+            id_dict,
+            gname_dict,
+            term_dict,
+            gdir,
+            idx_id2gid,
+            idx_gname2gid,
+            idx_pair2gid,
+            triple_blocks,
+        })
+    }
+
+    /// Returns the parsed file header.
+    pub fn header(&self) -> &Header {
+        &self.header
+    }
+    /// Returns the parsed table of contents (TOC).
+    pub fn toc(&self) -> &[TocEntry] {
+        &self.toc
+    }
+
+    /// Finds a section by kind and returns its byte span, if present.
+    pub fn section(&self, kind: SectionKind) -> Option<Section> {
+        self.toc.iter().find(|e| e.kind == kind).map(|e| e.section)
+    }
+
+    // API placeholders per ARCH.md §4.1
+    /// Enumerate graph groups with a matching dataset id string.
+    pub fn enumerate_by_id(&self, id: &str) -> Result<Vec<GraphRef>> {
+        let Some(id_id) = self.id_dict.find_id(self.bytes(), id) else {
+            return Ok(Vec::new());
+        };
+        self.postings_to_graphrefs(self.idx_id2gid, id_id as usize)
+    }
+    /// Enumerate graph groups with a matching graph name string.
+    pub fn enumerate_by_graphname(&self, gname: &str) -> Result<Vec<GraphRef>> {
+        let Some(gn_id) = self.gname_dict.find_id(self.bytes(), gname) else {
+            return Ok(Vec::new());
+        };
+        self.postings_to_graphrefs(self.idx_gname2gid, gn_id as usize)
+    }
+    /// Resolve a (id, graphname) pair to a single group, if it exists.
+    pub fn resolve_gid(&self, id: &str, gname: &str) -> Result<Option<GraphRef>> {
+        let id_id = match self.id_dict.find_id(self.bytes(), id) {
+            Some(v) => v,
+            None => return Ok(None),
+        };
+        let gn_id = match self.gname_dict.find_id(self.bytes(), gname) {
+            Some(v) => v,
+            None => return Ok(None),
+        };
+        if let Some(gid) = self.pair_lookup(self.idx_pair2gid, id_id, gn_id)? {
+            let gr = self.graphref_for_gid(gid)?;
+            return Ok(Some(gr));
+        }
+        Ok(None)
+    }
+    /// Iterate over triples (S, P, O) as term ids for the given `gid`.
+    ///
+    /// Convert term ids to strings with [`Self::term_to_string`].
+    pub fn triples_ids(&self, gid: u64) -> Result<TripleIter> {
+        self.decode_triple_block(gid)
+    }
+    /// Resolve a term id to a displayable string (IRI, bnode, or literal).
+    pub fn term_to_string(&self, term_id: u64) -> Result<String> {
+        self.term_dict.term_to_string(self.bytes(), term_id)
+    }
+
+    /// Internal helper: convert a term id into the writer's [`crate::writer::Term`].
+    ///
+    /// Exposed as `pub(crate)` for modules that need to reconstruct quads
+    /// faithfully (e.g., update routines).
+    pub(crate) fn term_as_writer_term(&self, term_id: u64) -> Result<crate::writer::Term> {
+        let parts = self.term_dict.term_parts(self.bytes(), term_id)?;
+        let t = match parts {
+            TermParts::Iri(s) => crate::writer::Term::Iri(s),
+            TermParts::BNode(b) => crate::writer::Term::BNode(b),
+            TermParts::Literal { lex, dt, lang } => crate::writer::Term::Literal { lex, dt, lang },
+        };
+        Ok(t)
+    }
+
+    #[cfg(feature = "oxigraph")]
+    pub fn to_oxigraph_graph(&self, gid: u64) -> Result<oxigraph::model::Graph> {
+        use oxigraph::model::{BlankNode, Graph, Literal, NamedNode, NamedOrBlankNode, Triple};
+        let mut g = Graph::new();
+        for (s_id, p_id, o_id) in self.triples_ids(gid)? {
+            let s_parts = self.term_dict.term_parts(self.bytes(), s_id)?;
+            let p_parts = self.term_dict.term_parts(self.bytes(), p_id)?;
+            let o_parts = self.term_dict.term_parts(self.bytes(), o_id)?;
+            let s_nb: NamedOrBlankNode = match s_parts {
+                TermParts::Iri(s) => NamedNode::new(s)
+                    .map_err(|_| R5Error::Invalid("invalid subject IRI"))?
+                    .into(),
+                TermParts::BNode(label) => {
+                    let lbl = label.strip_prefix("_:").unwrap_or(&label).to_string();
+                    BlankNode::new(lbl)
+                        .map_err(|_| R5Error::Invalid("invalid blank node"))?
+                        .into()
+                }
+                TermParts::Literal { .. } => return Err(R5Error::Invalid("literal subject")),
+            };
+            let p_nn = match p_parts {
+                TermParts::Iri(p) => {
+                    NamedNode::new(p).map_err(|_| R5Error::Invalid("invalid predicate IRI"))?
+                }
+                _ => return Err(R5Error::Invalid("non-IRI predicate")),
+            };
+            let o_term: oxigraph::model::Term = match o_parts {
+                TermParts::Iri(o) => NamedNode::new(o)
+                    .map_err(|_| R5Error::Invalid("invalid object IRI"))?
+                    .into(),
+                TermParts::BNode(label) => {
+                    let lbl = label.strip_prefix("_:").unwrap_or(&label).to_string();
+                    BlankNode::new(lbl)
+                        .map_err(|_| R5Error::Invalid("invalid blank node"))?
+                        .into()
+                }
+                TermParts::Literal { lex, dt, lang } => {
+                    if let Some(dt) = dt {
+                        let nn = NamedNode::new(dt)
+                            .map_err(|_| R5Error::Invalid("invalid datatype IRI"))?;
+                        Literal::new_typed_literal(lex, nn).into()
+                    } else if let Some(lang) = lang {
+                        Literal::new_language_tagged_literal(lex, lang)
+                            .map_err(|_| R5Error::Invalid("invalid lang tag"))?
+                            .into()
+                    } else {
+                        Literal::new_simple_literal(lex).into()
+                    }
+                }
+            };
+            g.insert(&Triple::new(s_nb, p_nn, o_term));
+        }
+        Ok(g)
+    }
+
+    #[cfg(feature = "oxigraph")]
+    pub fn oxigraph_triples<'a>(&'a self, gid: u64) -> Result<OxTripleIter<'a>> {
+        let inner = self.triples_ids(gid)?;
+        Ok(OxTripleIter { file: self, inner })
+    }
+
+    // Enumerate all graphs across all graphnames.
+    pub fn enumerate_all(&self) -> Result<Vec<GraphRef>> {
+        let (n_rows, _) = self.gdir_header()?;
+        let mut out = Vec::with_capacity(n_rows as usize);
+        for gid in 0..n_rows {
+            out.push(self.graphref_for_gid(gid)?);
+        }
+        Ok(out)
+    }
+}
+
+// ---------------- Dicts (ID/GNAME) ----------------
+#[derive(Debug, Clone, Copy)]
+struct Dict {
+    #[allow(dead_code)]
+    sec: Section,
+    n: u32,
+    blob: Section,
+    offs: Section,
+    idx: Option<Section>
, +} + +impl Dict { + fn parse(data: &[u8], sec: Section) -> Result { + if !section_in_bounds(data.len(), sec) { + return Err(R5Error::Corrupt("dict section OOB".into())); + } + let base = sec.off as usize; + if base + 52 > data.len() { + return Err(R5Error::Corrupt("short dict header".into())); + } + let n = u32::from_le_bytes([data[base], data[base + 1], data[base + 2], data[base + 3]]); + let read_u64 = |o: usize| -> u64 { + u64::from_le_bytes([ + data[o], + data[o + 1], + data[o + 2], + data[o + 3], + data[o + 4], + data[o + 5], + data[o + 6], + data[o + 7], + ]) + }; + let blob_off = read_u64(base + 4); + let blob_len = read_u64(base + 12); + let offs_off = read_u64(base + 20); + let offs_len = read_u64(base + 28); + let idx_off = read_u64(base + 36); + let idx_len = read_u64(base + 44); + + let blob = Section { + off: blob_off, + len: blob_len, + }; + let offs = Section { + off: offs_off, + len: offs_len, + }; + let idx = if idx_off != 0 { + Some(Section { + off: idx_off, + len: idx_len, + }) + } else { + None + }; + if !section_in_bounds(data.len(), blob) || !section_in_bounds(data.len(), offs) { + return Err(R5Error::Corrupt("dict blob/offs OOB".into())); + } + if let Some(s) = idx + && !section_in_bounds(data.len(), s) + { + return Err(R5Error::Corrupt("dict index OOB".into())); + } + Ok(Dict { + sec, + n, + blob, + offs, + idx, + }) + } + + fn get<'a>(&self, data: &'a [u8], id: u32) -> Option<&'a str> { + if id >= self.n { + return None; + } + let o_base = self.offs.off as usize; + let s = u32::from_le_bytes( + data[o_base + id as usize * 4..o_base + id as usize * 4 + 4] + .try_into() + .ok()?, + ) as usize; + let e = u32::from_le_bytes( + data[o_base + (id as usize + 1) * 4..o_base + (id as usize + 1) * 4 + 4] + .try_into() + .ok()?, + ) as usize; + let b_base = self.blob.off as usize; + std::str::from_utf8(&data[b_base + s..b_base + e]).ok() + } + + fn find_id(&self, data: &[u8], s: &str) -> Option { + if let Some(idx) = self.idx { + let ib = idx.off as usize; + let n = self.n as usize; + let mut key16 = [0u8; 16]; + for (i, b) in s + .to_ascii_lowercase() + .as_bytes() + .iter() + .take(16) + .enumerate() + { + key16[i] = *b; + } + let mut lo = 0usize; + let mut hi = n; + while lo < hi { + let mid = (lo + hi) / 2; + let off = ib + mid * 24; + let k = &data[off..off + 16]; + use std::cmp::Ordering::*; + match k.cmp(&key16) { + Less => lo = mid + 1, + Greater => hi = mid, + Equal => { + // scan neighbors with identical key16 + let mut m = mid; + while m > 0 && &data[ib + (m - 1) * 24..ib + (m - 1) * 24 + 16] == k { + m -= 1; + } + while m < n && &data[ib + m * 24..ib + m * 24 + 16] == k { + let id = u32::from_le_bytes( + data[ib + m * 24 + 16..ib + m * 24 + 20].try_into().ok()?, + ); + if let Some(ss) = self.get(data, id) + && ss == s + { + return Some(id); + } + m += 1; + } + return None; + } + } + } + None + } else { + // fallback linear search + for i in 0..self.n { + if let Some(ss) = self.get(data, i) + && ss == s + { + return Some(i); + } + } + None + } + } +} + +// ---------------- Term Dict ---------------- +#[derive(Debug, Clone, Copy)] +struct TermDict { + n_terms: u64, + kinds_off: u64, + data_off: u64, + offs_off: u64, +} + +impl TermDict { + fn parse(data: &[u8], sec: Section) -> Result { + if !section_in_bounds(data.len(), sec) { + return Err(R5Error::Corrupt("term dict OOB".into())); + } + let base = sec.off as usize; + if base + 1 + 8 * 4 > data.len() { + return Err(R5Error::Corrupt("short term dict header".into())); + } + let _width = data[base]; // reserved + 
let n_terms = u64::from_le_bytes(data[base + 1..base + 9].try_into().unwrap()); + let kinds_off = u64::from_le_bytes(data[base + 9..base + 17].try_into().unwrap()); + let data_off = u64::from_le_bytes(data[base + 17..base + 25].try_into().unwrap()); + let offs_off = u64::from_le_bytes(data[base + 25..base + 33].try_into().unwrap()); + Ok(TermDict { + n_terms, + kinds_off, + data_off, + offs_off, + }) + } + + fn term_to_string(&self, data: &[u8], term_id: u64) -> Result { + if term_id >= self.n_terms { + return Err(R5Error::Invalid("term id out of range")); + } + let kinds_off = self.kinds_off as usize; + let data_off = self.data_off as usize; + let offs_off = self.offs_off as usize; + // offs is u64 * (n+1) + let s = u64::from_le_bytes( + data[offs_off + term_id as usize * 8..offs_off + term_id as usize * 8 + 8] + .try_into() + .unwrap(), + ) as usize; + let e = u64::from_le_bytes( + data[offs_off + (term_id as usize + 1) * 8..offs_off + (term_id as usize + 1) * 8 + 8] + .try_into() + .unwrap(), + ) as usize; + let payload = &data[data_off + s..data_off + e]; + match data[kinds_off + term_id as usize] { + 0 | 1 => std::str::from_utf8(payload) + .map(String::from) + .map_err(|_| R5Error::Corrupt("utf8".into())), + 2 => { + let (lex_len, mut off) = + read_uvarint(payload, 0).ok_or_else(|| R5Error::Corrupt("lit lex".into()))?; + let lex = std::str::from_utf8(&payload[off..off + lex_len as usize]) + .map_err(|_| R5Error::Corrupt("utf8".into()))?; + off += lex_len as usize; + if off >= payload.len() { + return Err(R5Error::Corrupt("lit bounds".into())); + } + let has_dt = payload[off]; + off += 1; + let dt = if has_dt == 1 { + let (l, o2) = read_uvarint(payload, off) + .ok_or_else(|| R5Error::Corrupt("dt len".into()))?; + let s = std::str::from_utf8(&payload[o2..o2 + l as usize]) + .map_err(|_| R5Error::Corrupt("utf8".into()))?; + off = o2 + l as usize; + Some(s.to_string()) + } else { + None + }; + if off >= payload.len() { + return Err(R5Error::Corrupt("lit bounds2".into())); + } + let has_lang = payload[off]; + off += 1; + let lang = if has_lang == 1 { + let (l, o2) = read_uvarint(payload, off) + .ok_or_else(|| R5Error::Corrupt("lang len".into()))?; + let s = std::str::from_utf8(&payload[o2..o2 + l as usize]) + .map_err(|_| R5Error::Corrupt("utf8".into()))?; + Some(s.to_string()) + } else { + None + }; + Ok(match (dt, lang) { + (Some(dt), _) => format!("\"{}\"^^<{}>", lex, dt), + (None, Some(lang)) => format!("\"{}\"@{}", lex, lang), + _ => format!("\"{}\"", lex), + }) + } + _ => Err(R5Error::Corrupt("unknown term kind".into())), + } + } + + // Exposed for internal crate use (e.g., update module) to faithfully + // reconstruct writer terms from an existing file without going through + // a third-party representation. 
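+    //
+    // Literal payload layout (as parsed both here and in `term_to_string`
+    // above): uvarint lex_len, lex bytes, a has_dt flag byte
+    // (1 => uvarint dt_len + dt bytes), then a has_lang flag byte
+    // (1 => uvarint lang_len + lang bytes).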
+ pub(crate) fn term_parts(&self, data: &[u8], term_id: u64) -> Result { + if term_id >= self.n_terms { + return Err(R5Error::Invalid("term id out of range")); + } + let kinds_off = self.kinds_off as usize; + let data_off = self.data_off as usize; + let offs_off = self.offs_off as usize; + let s = u64::from_le_bytes( + data[offs_off + term_id as usize * 8..offs_off + term_id as usize * 8 + 8] + .try_into() + .unwrap(), + ) as usize; + let e = u64::from_le_bytes( + data[offs_off + (term_id as usize + 1) * 8..offs_off + (term_id as usize + 1) * 8 + 8] + .try_into() + .unwrap(), + ) as usize; + let payload = &data[data_off + s..data_off + e]; + Ok(match data[kinds_off + term_id as usize] { + 0 => TermParts::Iri( + std::str::from_utf8(payload) + .map_err(|_| R5Error::Corrupt("utf8".into()))? + .to_string(), + ), + 1 => TermParts::BNode( + std::str::from_utf8(payload) + .map_err(|_| R5Error::Corrupt("utf8".into()))? + .to_string(), + ), + 2 => { + let (lex_len, mut off) = + read_uvarint(payload, 0).ok_or_else(|| R5Error::Corrupt("lit lex".into()))?; + let lex = std::str::from_utf8(&payload[off..off + lex_len as usize]) + .map_err(|_| R5Error::Corrupt("utf8".into()))? + .to_string(); + off += lex_len as usize; + if off >= payload.len() { + return Err(R5Error::Corrupt("lit bounds".into())); + } + let has_dt = payload[off]; + off += 1; + let dt = if has_dt == 1 { + let (l, o2) = read_uvarint(payload, off) + .ok_or_else(|| R5Error::Corrupt("dt len".into()))?; + off = o2; + let s = std::str::from_utf8(&payload[off..off + l as usize]) + .map_err(|_| R5Error::Corrupt("utf8".into()))? + .to_string(); + off += l as usize; + Some(s) + } else { + None + }; + if off >= payload.len() { + return Err(R5Error::Corrupt("lit bounds2".into())); + } + let has_lang = payload[off]; + off += 1; + let lang = if has_lang == 1 { + let (l, o2) = read_uvarint(payload, off) + .ok_or_else(|| R5Error::Corrupt("lang len".into()))?; + off = o2; + let s = std::str::from_utf8(&payload[off..off + l as usize]) + .map_err(|_| R5Error::Corrupt("utf8".into()))? 
+ .to_string(); + Some(s) + } else { + None + }; + TermParts::Literal { lex, dt, lang } + } + _ => return Err(R5Error::Corrupt("unknown term kind".into())), + }) + } +} + +#[derive(Debug, Clone)] +pub(crate) enum TermParts { + Iri(String), + BNode(String), + Literal { + lex: String, + dt: Option, + lang: Option, + }, +} + +#[cfg(feature = "oxigraph")] +pub struct OxTripleIter<'a> { + file: &'a R5tuFile, + inner: TripleIter, +} + +#[cfg(feature = "oxigraph")] +impl<'a> Iterator for OxTripleIter<'a> { + type Item = Result; + fn next(&mut self) -> Option { + use oxigraph::model::{BlankNode, Literal, NamedNode, NamedOrBlankNode, Triple}; + let (s_id, p_id, o_id) = self.inner.next()?; + let bytes = self.file.bytes(); + let s_parts = match self.file.term_dict.term_parts(bytes, s_id) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }; + let p_parts = match self.file.term_dict.term_parts(bytes, p_id) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }; + let o_parts = match self.file.term_dict.term_parts(bytes, o_id) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + }; + let s_nb: NamedOrBlankNode = match s_parts { + TermParts::Iri(s) => match NamedNode::new(s) { + Ok(n) => n.into(), + Err(_) => return Some(Err(R5Error::Invalid("invalid subject IRI"))), + }, + TermParts::BNode(label) => { + let lbl = label.strip_prefix("_:").unwrap_or(&label).to_string(); + match BlankNode::new(lbl) { + Ok(b) => b.into(), + Err(_) => return Some(Err(R5Error::Invalid("invalid blank node"))), + } + } + TermParts::Literal { .. } => return Some(Err(R5Error::Invalid("literal subject"))), + }; + let p_nn = match p_parts { + TermParts::Iri(p) => match NamedNode::new(p) { + Ok(n) => n, + Err(_) => return Some(Err(R5Error::Invalid("invalid predicate IRI"))), + }, + _ => return Some(Err(R5Error::Invalid("non-IRI predicate"))), + }; + let o_term: oxigraph::model::Term = match o_parts { + TermParts::Iri(o) => match NamedNode::new(o) { + Ok(n) => n.into(), + Err(_) => return Some(Err(R5Error::Invalid("invalid object IRI"))), + }, + TermParts::BNode(label) => { + let lbl = label.strip_prefix("_:").unwrap_or(&label).to_string(); + match BlankNode::new(lbl) { + Ok(b) => b.into(), + Err(_) => return Some(Err(R5Error::Invalid("invalid blank node"))), + } + } + TermParts::Literal { lex, dt, lang } => { + if let Some(dt) = dt { + let nn = match NamedNode::new(dt) { + Ok(n) => n, + Err(_) => return Some(Err(R5Error::Invalid("invalid datatype IRI"))), + }; + Literal::new_typed_literal(lex, nn).into() + } else if let Some(lang) = lang { + match Literal::new_language_tagged_literal(lex, lang) { + Ok(l) => l.into(), + Err(_) => return Some(Err(R5Error::Invalid("invalid lang tag"))), + } + } else { + Literal::new_simple_literal(lex).into() + } + } + }; + Some(Ok(Triple::new(s_nb, p_nn, o_term))) + } +} + +// ---------------- GDIR and GraphRefs ---------------- +#[derive(Debug, Clone, Copy)] +struct GDirRow { + id_id: u32, + gn_id: u32, + triples_off: u64, + triples_len: u64, + n_triples: u64, + #[allow(dead_code)] + n_s: u32, + #[allow(dead_code)] + n_p: u32, + #[allow(dead_code)] + n_o: u32, +} + +impl R5tuFile { + fn gdir_header(&self) -> Result<(u64, usize)> { + let bytes = self.bytes(); + let base = self.gdir.off as usize; + if base + 16 > bytes.len() { + return Err(R5Error::Corrupt("gdir header OOB".into())); + } + let n_rows = u64::from_le_bytes(bytes[base..base + 8].try_into().unwrap()); + let row_size = u32::from_le_bytes(bytes[base + 8..base + 12].try_into().unwrap()) as usize; + Ok((n_rows, row_size)) + } + + fn 
gdir_row(&self, gid: u64) -> Result { + let (n_rows, row_size) = self.gdir_header()?; + if gid >= n_rows { + return Err(R5Error::Invalid("gid out of range")); + } + let bytes = self.bytes(); + let off = self.gdir.off as usize + 16 + gid as usize * row_size; + if off + row_size > bytes.len() { + return Err(R5Error::Corrupt("gdir row OOB".into())); + } + let b = &bytes[off..off + row_size]; + Ok(GDirRow { + id_id: u32::from_le_bytes(b[0..4].try_into().unwrap()), + gn_id: u32::from_le_bytes(b[4..8].try_into().unwrap()), + triples_off: u64::from_le_bytes(b[8..16].try_into().unwrap()), + triples_len: u64::from_le_bytes(b[16..24].try_into().unwrap()), + n_triples: u64::from_le_bytes(b[24..32].try_into().unwrap()), + n_s: u32::from_le_bytes(b[32..36].try_into().unwrap()), + n_p: u32::from_le_bytes(b[36..40].try_into().unwrap()), + n_o: u32::from_le_bytes(b[40..44].try_into().unwrap()), + }) + } + + fn graphref_for_gid(&self, gid: u64) -> Result { + let row = self.gdir_row(gid)?; + let bytes = self.bytes(); + let id = self + .id_dict + .get(bytes, row.id_id) + .ok_or(R5Error::Corrupt("id str OOB".into()))? + .to_string(); + let graphname = self + .gname_dict + .get(bytes, row.gn_id) + .ok_or(R5Error::Corrupt("gname str OOB".into()))? + .to_string(); + Ok(GraphRef { + gid, + id, + graphname, + n_triples: row.n_triples, + }) + } +} + +// ---------------- Postings & Pair index ---------------- +impl R5tuFile { + fn postings_to_graphrefs(&self, sec: Section, key_ordinal: usize) -> Result> { + let gids = self.decode_posting_list(sec, key_ordinal)?; + let mut out = Vec::with_capacity(gids.len()); + for gid in gids { + out.push(self.graphref_for_gid(gid)?); + } + Ok(out) + } + + fn decode_posting_list(&self, sec: Section, key_ordinal: usize) -> Result> { + let data_all = self.bytes(); + let b = &data_all[sec.off as usize..(sec.off + sec.len) as usize]; + if b.len() < 24 { + return Err(R5Error::Corrupt("postings header short".into())); + } + let n_keys = u64::from_le_bytes(b[0..8].try_into().unwrap()) as usize; + if key_ordinal >= n_keys { + return Ok(vec![]); + } + let offs_off = u64::from_le_bytes(b[8..16].try_into().unwrap()) as usize; + let blob_off = u64::from_le_bytes(b[16..24].try_into().unwrap()) as usize; + let data = self.bytes(); + if offs_off + (n_keys + 1) * 8 > data.len() { + return Err(R5Error::Corrupt("postings offs OOB".into())); + } + let s = u64::from_le_bytes( + data[offs_off + key_ordinal * 8..offs_off + key_ordinal * 8 + 8] + .try_into() + .unwrap(), + ) as usize; + let e = u64::from_le_bytes( + data[offs_off + (key_ordinal + 1) * 8..offs_off + (key_ordinal + 1) * 8 + 8] + .try_into() + .unwrap(), + ) as usize; + if blob_off + e > data.len() || blob_off + s > data.len() || s > e { + return Err(R5Error::Corrupt("postings blob OOB".into())); + } + let mut off = blob_off + s; + let end = blob_off + e; + let (n, o1) = + read_uvarint(data, off).ok_or_else(|| R5Error::Corrupt("postings n".into()))?; + off = o1; + if n == 0 { + return Ok(vec![]); + } + let (first, o_after_first) = + read_uvarint(data, off).ok_or_else(|| R5Error::Corrupt("postings first".into()))?; + off = o_after_first; + let mut out = Vec::with_capacity(n as usize); + out.push(first); + let mut cur = first; + for _ in 1..n { + if off >= end { + return Err(R5Error::Corrupt("postings truncated".into())); + } + let (d, o2) = + read_uvarint(data, off).ok_or_else(|| R5Error::Corrupt("postings delta".into()))?; + off = o2; + cur = cur + .checked_add(d) + .ok_or_else(|| R5Error::Corrupt("postings overflow".into()))?; + 
out.push(cur);
+        }
+        Ok(out)
+    }
+
+    fn pair_lookup(&self, sec: Section, id_id: u32, gn_id: u32) -> Result<Option<u64>> {
+        let data = self.bytes();
+        let b = &data[sec.off as usize..(sec.off + sec.len) as usize];
+        if b.len() < 16 {
+            return Err(R5Error::Corrupt("pair idx short".into()));
+        }
+        let n_pairs = u64::from_le_bytes(b[0..8].try_into().unwrap()) as usize;
+        let pairs_off = u64::from_le_bytes(b[8..16].try_into().unwrap()) as usize;
+        let entry_size = 16usize;
+        if pairs_off + n_pairs * entry_size > data.len() {
+            return Err(R5Error::Corrupt("pairs OOB".into()));
+        }
+        let mut lo = 0usize;
+        let mut hi = n_pairs;
+        while lo < hi {
+            let mid = (lo + hi) / 2;
+            let off = pairs_off + mid * entry_size;
+            let mid_id = u32::from_le_bytes(data[off..off + 4].try_into().unwrap());
+            let mid_gn = u32::from_le_bytes(data[off + 4..off + 8].try_into().unwrap());
+            use std::cmp::Ordering::*;
+            match (mid_id, mid_gn).cmp(&(id_id, gn_id)) {
+                Less => lo = mid + 1,
+                Greater => hi = mid,
+                Equal => {
+                    let gid = u64::from_le_bytes(data[off + 8..off + 16].try_into().unwrap());
+                    return Ok(Some(gid));
+                }
+            }
+        }
+        Ok(None)
+    }
+}
+
+// ---------------- Utilities ----------------
+fn read_uvarint(buf: &[u8], mut off: usize) -> Option<(u64, usize)> {
+    let (mut x, mut s) = (0u64, 0u32);
+    for _ in 0..10 {
+        let b = *buf.get(off)? as u64;
+        off += 1;
+        x |= (b & 0x7f) << s;
+        if b & 0x80 == 0 {
+            return Some((x, off));
+        }
+        s += 7;
+    }
+    None
+}
+
+// ---------------- Triple blocks ----------------
+#[derive(Debug)]
+pub struct TripleIter {
+    s_vals: Vec<u64>,
+    s_heads: Vec<u64>,
+    p_vals: Vec<u64>,
+    p_heads: Vec<u64>,
+    o_vals: Vec<u64>,
+    si: usize,
+    pi: usize,
+    oi: usize,
+}
+
+impl Iterator for TripleIter {
+    type Item = (u64, u64, u64);
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.oi >= self.o_vals.len() {
+            return None;
+        }
+        while self.pi + 1 < self.p_heads.len() && self.p_heads[self.pi + 1] <= self.oi as u64 {
+            self.pi += 1;
+        }
+        while self.si + 1 < self.s_heads.len() && self.s_heads[self.si + 1] <= self.pi as u64 {
+            self.si += 1;
+        }
+        let s = self.s_vals[self.si];
+        let p = self.p_vals[self.pi];
+        let o = self.o_vals[self.oi];
+        self.oi += 1;
+        Some((s, p, o))
+    }
+}
+
+impl R5tuFile {
+    fn decode_triple_block(&self, gid: u64) -> Result<TripleIter> {
+        let row = self.gdir_row(gid)?;
+        let data = self.bytes();
+        let base = row.triples_off as usize;
+        let end = base
+            .checked_add(row.triples_len as usize)
+            .ok_or_else(|| R5Error::Corrupt("block bounds".into()))?;
+        if end > data.len() {
+            return Err(R5Error::Corrupt("block OOB".into()));
+        }
+        if base + 1 + 4 > end {
+            return Err(R5Error::Corrupt("block header short".into()));
+        }
+        let enc = data[base];
+        let raw_len = u32::from_le_bytes(data[base + 1..base + 5].try_into().unwrap()) as usize;
+        let payload_start = base + 5;
+        match enc {
+            0 => {
+                if payload_start + raw_len > end {
+                    return Err(R5Error::Corrupt("raw len OOB".into()));
+                }
+                let raw = &data[payload_start..payload_start + raw_len];
+                self.decode_raw_payload(raw)
+            }
+            1 => {
+                #[cfg(feature = "zstd")]
+                {
+                    if payload_start + raw_len > end {
+                        return Err(R5Error::Corrupt("zstd len OOB".into()));
+                    }
+                    let frame = &data[payload_start..payload_start + raw_len];
+                    let raw = zstd::decode_all(std::io::Cursor::new(frame))
+                        .map_err(|_| R5Error::Corrupt("zstd decode".into()))?;
+                    self.decode_raw_payload(&raw)
+                }
+                #[cfg(not(feature = "zstd"))]
+                {
+                    Err(R5Error::Invalid("zstd feature not enabled"))
+                }
+            }
+            _ => Err(R5Error::Corrupt("unknown block encoding".into())),
+        }
+    }
+
+    fn decode_raw_payload(&self, raw: &[u8])
-> Result { + let mut off = 0usize; + let (n_s, o1) = read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("nS".into()))?; + off = o1; + let (n_p, o2) = read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("nP".into()))?; + off = o2; + let (n_t, o3) = read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("nT".into()))?; + off = o3; + let n_s = n_s as usize; + let n_p = n_p as usize; + let n_t = n_t as usize; + // S_vals (delta-coded ascending) + let mut s_vals = Vec::with_capacity(n_s); + if n_s > 0 { + let (first, o) = + read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("S first".into()))?; + off = o; + s_vals.push(first); + for _ in 1..n_s { + let (d, o2) = + read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("S delta".into()))?; + off = o2; + let prev = *s_vals.last().unwrap(); + s_vals.push( + prev.checked_add(d) + .ok_or_else(|| R5Error::Corrupt("S overflow".into()))?, + ); + } + } + // S_heads (prefix sums into P) + let mut s_heads = Vec::with_capacity(n_s + 1); + for _ in 0..(n_s + 1) { + let (v, o) = + read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("S_heads".into()))?; + off = o; + s_heads.push(v); + } + if *s_heads.last().unwrap_or(&0) as usize != n_p { + return Err(R5Error::Corrupt("S_heads last != nP".into())); + } + + // P_vals (delta-coded per S-run) + let mut p_vals = vec![0u64; n_p]; + for s in 0..n_s { + let start = s_heads[s] as usize; + let end = s_heads[s + 1] as usize; + if start > end || end > n_p { + return Err(R5Error::Corrupt("P run OOB".into())); + } + if start == end { + continue; + } + // first absolute in run + let (first, o) = + read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("P first".into()))?; + off = o; + p_vals[start] = first; + let mut cur = first; + for v in p_vals[start + 1..end].iter_mut() { + let (d, o2) = + read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("P delta".into()))?; + off = o2; + cur = cur + .checked_add(d) + .ok_or_else(|| R5Error::Corrupt("P overflow".into()))?; + *v = cur; + } + } + // P_heads (prefix sums into O) + let mut p_heads = Vec::with_capacity(n_p + 1); + for _ in 0..(n_p + 1) { + let (v, o) = + read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("P_heads".into()))?; + off = o; + p_heads.push(v); + } + if *p_heads.last().unwrap_or(&0) as usize != n_t { + return Err(R5Error::Corrupt("P_heads last != nT".into())); + } + + // O_vals (delta-coded per (S,P)-run) + let mut o_vals = vec![0u64; n_t]; + for p in 0..n_p { + let start = p_heads[p] as usize; + let end = p_heads[p + 1] as usize; + if start > end || end > n_t { + return Err(R5Error::Corrupt("O run OOB".into())); + } + if start == end { + continue; + } + let (first, o) = + read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("O first".into()))?; + off = o; + o_vals[start] = first; + let mut cur = first; + for v in o_vals[start + 1..end].iter_mut() { + let (d, o2) = + read_uvarint(raw, off).ok_or_else(|| R5Error::Corrupt("O delta".into()))?; + off = o2; + cur = cur + .checked_add(d) + .ok_or_else(|| R5Error::Corrupt("O overflow".into()))?; + *v = cur; + } + } + + Ok(TripleIter { + s_vals, + s_heads, + p_vals, + p_heads, + o_vals, + si: 0, + pi: 0, + oi: 0, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + fn push_uvarint(v: u64, out: &mut Vec) { + let mut x = v; + loop { + let mut b = (x & 0x7f) as u8; + x >>= 7; + if x != 0 { + b |= 0x80; + } + out.push(b); + if x == 0 { + break; + } + } + } + + // minimal smoke: invalid header rejected + #[test] + fn rejects_short_or_bad_magic() { + let mut path = 
std::env::temp_dir(); + path.push("bad.r5tu"); + let mut f = fs::File::create(&path).unwrap(); + f.write_all(b"NOPE").unwrap(); + let err = R5tuFile::open(&path).unwrap_err(); + match err { + R5Error::Invalid(_) => {} + _ => panic!("expected Invalid"), + } + let _ = fs::remove_file(&path); + } + + #[test] + fn uvarint_roundtrip_and_bounds() { + let mut buf = Vec::new(); + for &n in &[ + 0, + 1, + 127, + 128, + 255, + 16384, + u32::MAX as u64, + u64::from(u32::MAX) + 12345, + ] { + buf.clear(); + push_uvarint(n, &mut buf); + let (v, off) = read_uvarint(&buf, 0).unwrap(); + assert_eq!(v, n); + assert_eq!(off, buf.len()); + } + assert!(read_uvarint(&[], 0).is_none()); + } + + #[test] + fn dict_parse_and_lookup() { + // Build a minimal dict section inline: entries ["A", "B"] + let mut file = vec![0u8; 0]; + let sec_off = file.len(); + file.resize(file.len() + 52, 0); // dict header + // payload: blob and offs + let blob_off = file.len(); + file.extend_from_slice(b"AB"); + let offs_off = file.len(); + // offs [0,1,2] + for n in [0u32, 1, 2] { + file.extend_from_slice(&n.to_le_bytes()); + } + // fill header + let n = 2u32; + file[sec_off..sec_off + 4].copy_from_slice(&n.to_le_bytes()); + let blob_off_u64 = (blob_off as u64).to_le_bytes(); + let blob_len_u64 = (2u64).to_le_bytes(); + let offs_off_u64 = (offs_off as u64).to_le_bytes(); + let offs_len_u64 = (12u64).to_le_bytes(); + file[sec_off + 4..sec_off + 12].copy_from_slice(&blob_off_u64); + file[sec_off + 12..sec_off + 20].copy_from_slice(&blob_len_u64); + file[sec_off + 20..sec_off + 28].copy_from_slice(&offs_off_u64); + file[sec_off + 28..sec_off + 36].copy_from_slice(&offs_len_u64); + // idx absent (zeros) + + let dict = Dict::parse( + &file, + Section { + off: sec_off as u64, + len: (file.len() - sec_off) as u64, + }, + ) + .unwrap(); + assert_eq!(dict.get(&file, 0).unwrap(), "A"); + assert_eq!(dict.get(&file, 1).unwrap(), "B"); + assert_eq!(dict.find_id(&file, "A"), Some(0)); + assert_eq!(dict.find_id(&file, "B"), Some(1)); + assert_eq!(dict.find_id(&file, "Z"), None); + } + + #[test] + fn term_dict_decode() { + // One IRI, one literal with dt, one literal with lang + let mut file = vec![0u8; 0]; + let sec_off = file.len(); + file.resize(file.len() + 33, 0); // term dict header + // payload regions + // kinds: [0=IRI, 2=LITERAL, 2=LITERAL] + let kinds_off = file.len(); + file.extend_from_slice(&[0u8, 2, 2]); + // data blob + let data_off = file.len(); + // IRI payload is raw UTF-8 + let iri_bytes = b"http://ex/s"; + file.extend_from_slice(iri_bytes); + // Literal with dt: "42"^^ + let mut lit1 = Vec::new(); + push_uvarint(2, &mut lit1); + lit1.extend_from_slice(b"42"); // lex + lit1.push(1); // has_dt + push_uvarint(11, &mut lit1); + lit1.extend_from_slice(b"http://ex/i"); + lit1.push(0); // has_lang + file.extend_from_slice(&lit1); + // Literal with lang: "en"@en + let mut lit2 = Vec::new(); + push_uvarint(2, &mut lit2); + lit2.extend_from_slice(b"en"); + lit2.push(0); // no dt + lit2.push(1); // has_lang + push_uvarint(2, &mut lit2); + lit2.extend_from_slice(b"en"); + file.extend_from_slice(&lit2); + // offs: u64*(n+1) = 4 entries + let offs_off = file.len(); + let mut cur = 0u64; + let sizes = [iri_bytes.len() as u64, lit1.len() as u64, lit2.len() as u64]; + file.extend_from_slice(&cur.to_le_bytes()); + cur += sizes[0]; + file.extend_from_slice(&cur.to_le_bytes()); + cur += sizes[1]; + file.extend_from_slice(&cur.to_le_bytes()); + cur += sizes[2]; + file.extend_from_slice(&cur.to_le_bytes()); + // fill header + file[sec_off] = 0; // 
width + let n_terms = (3u64).to_le_bytes(); + file[sec_off + 1..sec_off + 9].copy_from_slice(&n_terms); + file[sec_off + 9..sec_off + 17].copy_from_slice(&(kinds_off as u64).to_le_bytes()); + file[sec_off + 17..sec_off + 25].copy_from_slice(&(data_off as u64).to_le_bytes()); + file[sec_off + 25..sec_off + 33].copy_from_slice(&(offs_off as u64).to_le_bytes()); + + let td = TermDict::parse( + &file, + Section { + off: sec_off as u64, + len: (file.len() - sec_off) as u64, + }, + ) + .unwrap(); + assert_eq!(td.term_to_string(&file, 0).unwrap(), "http://ex/s"); + assert_eq!( + td.term_to_string(&file, 1).unwrap(), + "\"42\"^^" + ); + assert_eq!(td.term_to_string(&file, 2).unwrap(), "\"en\"@en"); + } + + #[test] + fn end_to_end_minimal_file() { + // Build a minimal complete file per ARCH to exercise enumerate*, resolve_gid, and triples iterator. + let mut f = vec![0u8; 32]; // header placeholder + let mut toc_entries: Vec<(SectionKind, u64, u64)> = Vec::new(); + + // Helper to register a section + let mut add_sec = |kind: SectionKind, off: usize, len: usize| { + toc_entries.push((kind, off as u64, len as u64)); + }; + + // ID_DICT with ["A"] + let id_sec_off = f.len(); + f.resize(f.len() + 52, 0); + let id_blob_off = f.len(); + f.extend_from_slice(b"A"); + let id_offs_off = f.len(); + for n in [0u32, 1] { + f.extend_from_slice(&n.to_le_bytes()); + } + // fill header + f[id_sec_off..id_sec_off + 4].copy_from_slice(&1u32.to_le_bytes()); + f[id_sec_off + 4..id_sec_off + 12].copy_from_slice(&(id_blob_off as u64).to_le_bytes()); + f[id_sec_off + 12..id_sec_off + 20].copy_from_slice(&(1u64).to_le_bytes()); + f[id_sec_off + 20..id_sec_off + 28].copy_from_slice(&(id_offs_off as u64).to_le_bytes()); + f[id_sec_off + 28..id_sec_off + 36].copy_from_slice(&(8u64).to_le_bytes()); + add_sec(SectionKind::IdDict, id_sec_off, f.len() - id_sec_off); + + // GNAME_DICT with ["g"] + let gn_sec_off = f.len(); + f.resize(f.len() + 52, 0); + let gn_blob_off = f.len(); + f.extend_from_slice(b"g"); + let gn_offs_off = f.len(); + for n in [0u32, 1] { + f.extend_from_slice(&n.to_le_bytes()); + } + f[gn_sec_off..gn_sec_off + 4].copy_from_slice(&1u32.to_le_bytes()); + f[gn_sec_off + 4..gn_sec_off + 12].copy_from_slice(&(gn_blob_off as u64).to_le_bytes()); + f[gn_sec_off + 12..gn_sec_off + 20].copy_from_slice(&(1u64).to_le_bytes()); + f[gn_sec_off + 20..gn_sec_off + 28].copy_from_slice(&(gn_offs_off as u64).to_le_bytes()); + f[gn_sec_off + 28..gn_sec_off + 36].copy_from_slice(&(8u64).to_le_bytes()); + add_sec(SectionKind::GNameDict, gn_sec_off, f.len() - gn_sec_off); + + // TERM_DICT empty but valid + let td_sec_off = f.len(); + f.resize(f.len() + 33, 0); + let kinds_off = f.len(); // empty kinds + let data_off = f.len(); // empty data + let offs_off = f.len(); + f.extend_from_slice(&0u64.to_le_bytes()); // single 0 + f[td_sec_off] = 0; + f[td_sec_off + 1..td_sec_off + 9].copy_from_slice(&0u64.to_le_bytes()); + f[td_sec_off + 9..td_sec_off + 17].copy_from_slice(&(kinds_off as u64).to_le_bytes()); + f[td_sec_off + 17..td_sec_off + 25].copy_from_slice(&(data_off as u64).to_le_bytes()); + f[td_sec_off + 25..td_sec_off + 33].copy_from_slice(&(offs_off as u64).to_le_bytes()); + add_sec(SectionKind::TermDict, td_sec_off, f.len() - td_sec_off); + + // TRIPLE_BLOCKS with one RAW block for gid=0, triples: (1,2,3) and (1,4,5) + let tb_sec_off = f.len(); + let block_off = f.len(); + let mut raw = Vec::new(); + // nS=1, nP=2, nT=2 + push_uvarint(1, &mut raw); + push_uvarint(2, &mut raw); + push_uvarint(2, &mut raw); + // S_vals: [1] + 
push_uvarint(1, &mut raw); + // S_heads: [0,2] + push_uvarint(0, &mut raw); + push_uvarint(2, &mut raw); + // P_vals for S run: [2,4] + push_uvarint(2, &mut raw); // first absolute + push_uvarint(2, &mut raw); // delta = 2 (4-2) + // P_heads: [0,1,2] + push_uvarint(0, &mut raw); + push_uvarint(1, &mut raw); + push_uvarint(2, &mut raw); + // O_vals per P run: [3] then [5] + push_uvarint(3, &mut raw); + push_uvarint(5, &mut raw); + // block header + f.push(0u8); // enc = RAW + let raw_len = raw.len() as u32; + f.extend_from_slice(&raw_len.to_le_bytes()); + f.extend_from_slice(&raw); + let block_len = f.len() - block_off; + add_sec(SectionKind::TripleBlocks, tb_sec_off, f.len() - tb_sec_off); + + // GDIR with 1 row + let gdir_sec_off = f.len(); + // header + f.extend_from_slice(&1u64.to_le_bytes()); // n_rows + f.extend_from_slice(&56u32.to_le_bytes()); // row_size + f.extend_from_slice(&0u32.to_le_bytes()); // reserved + // row 0 + f.extend_from_slice(&0u32.to_le_bytes()); // id_id + f.extend_from_slice(&0u32.to_le_bytes()); // gn_id + f.extend_from_slice(&(block_off as u64).to_le_bytes()); + f.extend_from_slice(&(block_len as u64).to_le_bytes()); + f.extend_from_slice(&2u64.to_le_bytes()); // n_triples + f.extend_from_slice(&1u32.to_le_bytes()); // n_s + f.extend_from_slice(&2u32.to_le_bytes()); // n_p + f.extend_from_slice(&2u32.to_le_bytes()); // n_o + add_sec(SectionKind::GDir, gdir_sec_off, f.len() - gdir_sec_off); + + // IDX_ID2GID with 1 key -> [0] + let ididx_sec_off = f.len(); + let ididx_hdr_off = ididx_sec_off; + f.resize(f.len() + 24, 0); + let ididx_offs_off = f.len(); + for n in [0u64, 2u64] { + f.extend_from_slice(&n.to_le_bytes()); + } // blob len 2 bytes + let ididx_blob_off = f.len(); + let mut tmp = Vec::new(); + push_uvarint(1, &mut tmp); + push_uvarint(0, &mut tmp); + f.extend_from_slice(&tmp); + // header + f[ididx_hdr_off..ididx_hdr_off + 8].copy_from_slice(&1u64.to_le_bytes()); + f[ididx_hdr_off + 8..ididx_hdr_off + 16] + .copy_from_slice(&(ididx_offs_off as u64).to_le_bytes()); + f[ididx_hdr_off + 16..ididx_hdr_off + 24] + .copy_from_slice(&(ididx_blob_off as u64).to_le_bytes()); + add_sec( + SectionKind::IdxId2Gid, + ididx_sec_off, + f.len() - ididx_sec_off, + ); + + // IDX_GNAME2GID same + let gnidx_sec_off = f.len(); + let gnidx_hdr_off = gnidx_sec_off; + f.resize(f.len() + 24, 0); + let gnidx_offs_off = f.len(); + for n in [0u64, 2u64] { + f.extend_from_slice(&n.to_le_bytes()); + } + let gnidx_blob_off = f.len(); + let mut tmp2 = Vec::new(); + push_uvarint(1, &mut tmp2); + push_uvarint(0, &mut tmp2); + f.extend_from_slice(&tmp2); + f[gnidx_hdr_off..gnidx_hdr_off + 8].copy_from_slice(&1u64.to_le_bytes()); + f[gnidx_hdr_off + 8..gnidx_hdr_off + 16] + .copy_from_slice(&(gnidx_offs_off as u64).to_le_bytes()); + f[gnidx_hdr_off + 16..gnidx_hdr_off + 24] + .copy_from_slice(&(gnidx_blob_off as u64).to_le_bytes()); + add_sec( + SectionKind::IdxGName2Gid, + gnidx_sec_off, + f.len() - gnidx_sec_off, + ); + + // IDX_PAIR2GID with one pair (0,0)->0 + let pairidx_sec_off = f.len(); + let pairs_off = f.len() + 16; // header is 16 bytes + // header + f.extend_from_slice(&1u64.to_le_bytes()); + f.extend_from_slice(&(pairs_off as u64).to_le_bytes()); + // entry + f.extend_from_slice(&0u32.to_le_bytes()); + f.extend_from_slice(&0u32.to_le_bytes()); + f.extend_from_slice(&0u64.to_le_bytes()); + add_sec( + SectionKind::IdxPair2Gid, + pairidx_sec_off, + f.len() - pairidx_sec_off, + ); + + // TOC + let toc_off = f.len(); + for (kind, off, len) in &toc_entries { + let mut ent = [0u8; 
32]; + let kind_u16 = *kind as u16; + ent[0..2].copy_from_slice(&kind_u16.to_le_bytes()); + ent[4..12].copy_from_slice(&off.to_le_bytes()); + ent[12..20].copy_from_slice(&len.to_le_bytes()); + // crc and reserved left zero + f.extend_from_slice(&ent); + } + + // Header + f[0..4].copy_from_slice(b"R5TU"); + f[4..6].copy_from_slice(&1u16.to_le_bytes()); + f[6..8].copy_from_slice(&0u16.to_le_bytes()); + f[8..16].copy_from_slice(&0u64.to_le_bytes()); // created + f[16..24].copy_from_slice(&(toc_off as u64).to_le_bytes()); + f[24..28].copy_from_slice(&(toc_entries.len() as u32).to_le_bytes()); + f[28..32].copy_from_slice(&0u32.to_le_bytes()); + + // Write and open + let mut path = std::env::temp_dir(); + path.push("mini.r5tu"); + let mut file = fs::File::create(&path).unwrap(); + file.write_all(&f).unwrap(); + let reader = R5tuFile::open(&path).unwrap(); + // enumerate by id + let v = reader.enumerate_by_id("A").unwrap(); + assert_eq!(v.len(), 1); + assert_eq!(v[0].id, "A"); + assert_eq!(v[0].graphname, "g"); + assert_eq!(v[0].n_triples, 2); + // enumerate by graphname + let w = reader.enumerate_by_graphname("g").unwrap(); + assert_eq!(w.len(), 1); + assert_eq!(w[0].id, "A"); + // resolve pair + let gr = reader.resolve_gid("A", "g").unwrap().unwrap(); + assert_eq!(gr.gid, 0); + // triples + let triples: Vec<_> = reader.triples_ids(gr.gid).unwrap().collect(); + assert_eq!(triples, vec![(1, 2, 3), (1, 4, 5)]); + let _ = fs::remove_file(&path); + } + + #[test] + fn rejects_overlapping_toc_sections() { + // Build two fake sections that overlap in TOC + let mut f = vec![0u8; 32]; + // add a blob region + let s1_off = f.len(); + f.extend_from_slice(&[0u8; 100]); + let s2_off = s1_off + 50; // intentional overlap + f.extend_from_slice(&[0u8; 20]); + // TOC + let toc_off = f.len(); + let mut ent1 = [0u8; 32]; + ent1[0..2].copy_from_slice(&(SectionKind::IdDict as u16).to_le_bytes()); + ent1[4..12].copy_from_slice(&(s1_off as u64).to_le_bytes()); + ent1[12..20].copy_from_slice(&(100u64).to_le_bytes()); + let mut ent2 = [0u8; 32]; + ent2[0..2].copy_from_slice(&(SectionKind::GNameDict as u16).to_le_bytes()); + ent2[4..12].copy_from_slice(&(s2_off as u64).to_le_bytes()); + ent2[12..20].copy_from_slice(&(20u64).to_le_bytes()); + f.extend_from_slice(&ent1); + f.extend_from_slice(&ent2); + // Header + f[0..4].copy_from_slice(b"R5TU"); + f[4..6].copy_from_slice(&1u16.to_le_bytes()); + f[6..8].copy_from_slice(&0u16.to_le_bytes()); + f[8..16].copy_from_slice(&0u64.to_le_bytes()); + f[16..24].copy_from_slice(&(toc_off as u64).to_le_bytes()); + f[24..28].copy_from_slice(&(2u32).to_le_bytes()); + f[28..32].copy_from_slice(&0u32.to_le_bytes()); + + let mut path = std::env::temp_dir(); + path.push("overlap.r5tu"); + let mut file = fs::File::create(&path).unwrap(); + file.write_all(&f).unwrap(); + let err = R5tuFile::open(&path).unwrap_err(); + match err { + R5Error::Corrupt(m) => assert!(m.contains("overlap")), + _ => panic!("expected Corrupt overlap"), + } + let _ = fs::remove_file(&path); + } +} diff --git a/rdf5d/src/update.rs b/rdf5d/src/update.rs new file mode 100644 index 0000000..29128d1 --- /dev/null +++ b/rdf5d/src/update.rs @@ -0,0 +1,123 @@ +//! Update helpers for replacing an entire logical graph in an existing file. +//! +//! The primary entry is [`replace_graph_with_options`], which reads an input +//! `.r5tu` file and writes a new file where the target (id, graphname) group is +//! fully replaced with provided triples, preserving all other groups. +//! +//! 
This performs a streaming rebuild using [`StreamingWriter`], avoiding +//! materializing the entire dataset in memory. It reconstructs writer terms from +//! the original file's term dictionary to faithfully copy unchanged graphs. +//! +//! Basic example +//! +//! ```no_run +//! use rdf5d::{replace_graph, Term}; +//! // New triples for the target graph +//! let new_triples = vec![ +//! ( +//! Term::Iri("http://ex/s1".into()), +//! Term::Iri("http://ex/p".into()), +//! Term::Literal { lex: "v2".into(), dt: None, lang: None }, +//! ) +//! ]; +//! replace_graph( +//! "in.r5tu", +//! "out.r5tu", +//! "src/A", +//! "http://example.org/graph", +//! &new_triples, +//! ).expect("update ok"); +//! ``` + +use std::path::Path; + +use crate::reader::{GraphRef, R5tuFile, Result}; +use crate::writer::{Quint, StreamingWriter, Term, WriterOptions}; + +fn copy_group_as_quints(file: &R5tuFile, gr: &GraphRef) -> Result> { + let mut out = Vec::with_capacity(gr.n_triples as usize); + let iter = file.triples_ids(gr.gid)?; + for (s_id, p_id, o_id) in iter { + let s = file.term_as_writer_term(s_id)?; + let p = file.term_as_writer_term(p_id)?; + let o = file.term_as_writer_term(o_id)?; + out.push(Quint { + id: gr.id.clone(), + s, + p, + o, + gname: gr.graphname.clone(), + }); + } + Ok(out) +} + +/// Replace one logical graph (matching `id` and `gname`) and write a new file. +/// +/// - Preserves all other graphs as-is. +/// - Rebuilds the file using [`StreamingWriter`] for determinism and integrity. +/// - Uses default writer options: no zstd, CRCs enabled. +pub fn replace_graph>( + src: P, + dst: P, + id: &str, + gname: &str, + new_triples: &[(Term, Term, Term)], +) -> Result<()> { + replace_graph_with_options( + src, + dst, + id, + gname, + new_triples, + WriterOptions { + zstd: false, + with_crc: true, + }, + ) +} + +/// Replace one logical graph (matching `id` and `gname`) and write a new file. +/// +/// - Preserves all other graphs as-is by reconstructing their triples from the +/// original term dictionary (no string lossy conversion). +/// - Rebuilds the file using [`StreamingWriter`] and given [`WriterOptions`]. +/// - The `new_triples` iterator yields `(s, p, o)` terms; the `id` and +/// `gname` are applied to every triple. +/// +/// Errors surface from input validation, decoding, or I/O per [`R5tuFile`]. +pub fn replace_graph_with_options>( + src: P, + dst: P, + id: &str, + gname: &str, + new_triples: &[(Term, Term, Term)], + opts: WriterOptions, +) -> Result<()> { + let f = R5tuFile::open(src.as_ref())?; + let mut w = StreamingWriter::new(dst.as_ref().to_path_buf(), opts); + + // 1) Copy all existing groups except the target + for gr in f.enumerate_all()? { + if gr.id == id && gr.graphname == gname { + continue; + } + for q in copy_group_as_quints(&f, &gr)? 
{
+            w.add(q)?;
+        }
+    }
+
+    // 2) Insert replacement graph
+    for (s, p, o) in new_triples.iter().cloned() {
+        w.add(Quint {
+            id: id.to_string(),
+            s,
+            p,
+            o,
+            gname: gname.to_string(),
+        })?;
+    }
+
+    // 3) Finalize (atomic write)
+    w.finalize()
+}
diff --git a/rdf5d/src/writer.rs b/rdf5d/src/writer.rs
new file mode 100644
index 0000000..281bc53
--- /dev/null
+++ b/rdf5d/src/writer.rs
@@ -0,0 +1,1015 @@
+use std::collections::{BTreeMap, HashMap};
+use std::fs;
+use std::path::{Path, PathBuf};
+
+use crate::header::{Section, SectionKind, TocEntry, crc32_ieee};
+use crate::reader::{R5Error, Result};
+
+// Simple type aliases to reduce type complexity noise
+type GroupKey = (u32, u32);
+type TripleIds = (u64, u64, u64);
+type GroupsMap = BTreeMap<GroupKey, Vec<TripleIds>>;
+type GidRow = (u32, u32, Section, u64, u32, u32, u32);
+type PairEntry = (u32, u32, u64);
+
+/// RDF term used by the writer when constructing quads.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum Term {
+    /// IRI/URI node.
+    Iri(String),
+    /// Blank node label (with or without `_:` prefix).
+    BNode(String),
+    /// Literal with optional datatype or language tag.
+    Literal {
+        lex: String,
+        dt: Option<String>,
+        lang: Option<String>,
+    },
+}
+
+/// 5‑tuple (id, s, p, o, gname) used to build an R5TU file.
+#[derive(Debug, Clone)]
+pub struct Quint {
+    /// Dataset identifier for grouping.
+    pub id: String,
+    /// Subject term.
+    pub s: Term,
+    /// Predicate term.
+    pub p: Term,
+    /// Object term.
+    pub o: Term,
+    /// Graph name for grouping.
+    pub gname: String,
+}
+
+fn push_uvarint(mut v: u64, out: &mut Vec<u8>) {
+    loop {
+        let mut b = (v & 0x7f) as u8;
+        v >>= 7;
+        if v != 0 {
+            b |= 0x80;
+        }
+        out.push(b);
+        if v == 0 {
+            break;
+        }
+    }
+}
+
+/// Options controlling file emission.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct WriterOptions {
+    /// Compress triple blocks using zstd (requires `zstd` feature).
+    pub zstd: bool,
+    /// Compute and embed per‑section CRCs (TOC) and a global footer CRC.
+    pub with_crc: bool,
+}
+
+/// Convenience helper to write a `.r5tu` file with defaults.
+///
+/// - `zstd = false`
+/// - `with_crc = true`
+///
+/// ```no_run
+/// use rdf5d::{write_file, Quint, Term};
+/// let q = Quint {
+///     id: "dataset:1".into(),
+///     s: Term::Iri("http://example.org/Alice".into()),
+///     p: Term::Iri("http://xmlns.com/foaf/0.1/name".into()),
+///     o: Term::Literal { lex: "Alice".into(), dt: None, lang: None },
+///     gname: "http://example.org/graph".into(),
+/// };
+/// write_file("example.r5tu", &[q]).unwrap();
+/// ```
+pub fn write_file<P: AsRef<Path>>(path: P, quads: &[Quint]) -> Result<()> {
+    write_file_with_options(
+        path,
+        quads,
+        WriterOptions {
+            zstd: false,
+            with_crc: true,
+        },
+    )
+}
+
+/// Write a `.r5tu` file with explicit [`WriterOptions`].
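+///
+/// A minimal usage sketch, added here for illustration (import paths assume
+/// the `writer` re-exports that the tests below rely on; not an authoritative
+/// doc test):
+///
+/// ```no_run
+/// use rdf5d::writer::{write_file_with_options, Quint, Term, WriterOptions};
+/// let q = Quint {
+///     id: "dataset:1".into(),
+///     s: Term::Iri("http://example.org/Alice".into()),
+///     p: Term::Iri("http://xmlns.com/foaf/0.1/name".into()),
+///     o: Term::Literal { lex: "Alice".into(), dt: None, lang: None },
+///     gname: "http://example.org/graph".into(),
+/// };
+/// // Explicit options: zstd off, CRCs on (the same defaults `write_file` uses).
+/// write_file_with_options(
+///     "example.r5tu",
+///     &[q],
+///     WriterOptions { zstd: false, with_crc: true },
+/// ).unwrap();
+/// ```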
+pub fn write_file_with_options>( + path: P, + quads: &[Quint], + opts: WriterOptions, +) -> Result<()> { + // 1) Deduplicate ids, gnames, terms + let mut id_map: BTreeMap = BTreeMap::new(); + let mut gn_map: BTreeMap = BTreeMap::new(); + let mut term_map: HashMap = HashMap::new(); + let mut id_vec: Vec = Vec::new(); + let mut gn_vec: Vec = Vec::new(); + let mut term_vec: Vec = Vec::new(); + + let mut triples: Vec<(u32, u32, u64, u64, u64)> = Vec::new(); + // (id_id, gn_id, s_id, p_id, o_id) + + let mut intern_id = |s: &str| -> u32 { + if let Some(&v) = id_map.get(s) { + return v; + } + let v = id_vec.len() as u32; + id_vec.push(s.to_string()); + id_map.insert(s.to_string(), v); + v + }; + let mut intern_gn = |s: &str| -> u32 { + if let Some(&v) = gn_map.get(s) { + return v; + } + let v = gn_vec.len() as u32; + gn_vec.push(s.to_string()); + gn_map.insert(s.to_string(), v); + v + }; + let mut intern_term = |t: &Term| -> u64 { + if let Some(&v) = term_map.get(t) { + return v; + } + let v = term_vec.len() as u64; + term_vec.push(t.clone()); + term_map.insert(t.clone(), v); + v + }; + + for q in quads { + let id_id = intern_id(&q.id); + let gn_id = intern_gn(&q.gname); + let s_id = intern_term(&q.s); + let p_id = intern_term(&q.p); + let o_id = intern_term(&q.o); + triples.push((id_id, gn_id, s_id, p_id, o_id)); + } + + // 2) Group by (id_id, gn_id) and sort SPO + let mut groups: GroupsMap = BTreeMap::new(); + for (id_id, gn_id, s, p, o) in triples { + groups.entry((id_id, gn_id)).or_default().push((s, p, o)); + } + for v in groups.values_mut() { + v.sort_unstable(); + } + + // Buffers for sections + let mut file = vec![0u8; 32]; // header placeholder + let mut toc: Vec = Vec::new(); + + // ID_DICT + let id_sec = write_str_dict(&mut file, &id_vec)?; + toc.push(TocEntry { + kind: SectionKind::IdDict, + section: id_sec, + crc32_u32: 0, + }); + // GNAME_DICT + let gn_sec = write_str_dict(&mut file, &gn_vec)?; + toc.push(TocEntry { + kind: SectionKind::GNameDict, + section: gn_sec, + crc32_u32: 0, + }); + // TERM_DICT + let term_sec = write_term_dict(&mut file, &term_vec)?; + toc.push(TocEntry { + kind: SectionKind::TermDict, + section: term_sec, + crc32_u32: 0, + }); + + // TRIPLE_BLOCKS + let tb_off = file.len(); + let mut gid_rows: Vec = Vec::new(); + // For stable GID ordering, iterate groups in key order (BTreeMap) + for ((id_id, gn_id), spo) in &groups { + let start = file.len(); + // build RAW payload for this group + let raw = build_raw_spo(spo)?; + if opts.zstd { + #[cfg(feature = "zstd")] + { + file.push(1u8); // enc=ZSTD + let compressed = zstd::encode_all(&raw[..], 0) + .map_err(|_| R5Error::Corrupt("zstd encode".into()))?; + let clen = compressed.len() as u32; + file.extend_from_slice(&clen.to_le_bytes()); + file.extend_from_slice(&compressed); + } + #[cfg(not(feature = "zstd"))] + { + return Err(R5Error::Invalid("zstd feature not enabled")); + } + } else { + // RAW + file.push(0u8); + let raw_len = raw.len() as u32; + file.extend_from_slice(&raw_len.to_le_bytes()); + file.extend_from_slice(&raw); + } + let sec = Section { + off: start as u64, + len: (file.len() - start) as u64, + }; + // counts + let (n_s, n_p, n_t) = raw_counts(&raw)?; + gid_rows.push(( + *id_id, *gn_id, sec, n_t as u64, n_s as u32, n_p as u32, n_t as u32, + )); + } + let tb_sec = Section { + off: tb_off as u64, + len: (file.len() - tb_off) as u64, + }; + toc.push(TocEntry { + kind: SectionKind::TripleBlocks, + section: tb_sec, + crc32_u32: 0, + }); + + // GDIR + let gdir_off = file.len(); + let n_rows = 
gid_rows.len() as u64; + file.extend_from_slice(&n_rows.to_le_bytes()); + file.extend_from_slice(&44u32.to_le_bytes()); // row_size actually written below + file.extend_from_slice(&0u32.to_le_bytes()); // reserved + for (id_id, gn_id, sec, n_triples, n_s, n_p, n_o) in &gid_rows { + file.extend_from_slice(&id_id.to_le_bytes()); + file.extend_from_slice(&gn_id.to_le_bytes()); + file.extend_from_slice(&sec.off.to_le_bytes()); + file.extend_from_slice(&sec.len.to_le_bytes()); + file.extend_from_slice(&n_triples.to_le_bytes()); + file.extend_from_slice(&n_s.to_le_bytes()); + file.extend_from_slice(&n_p.to_le_bytes()); + file.extend_from_slice(&n_o.to_le_bytes()); + } + let gdir_sec = Section { + off: gdir_off as u64, + len: (file.len() - gdir_off) as u64, + }; + toc.push(TocEntry { + kind: SectionKind::GDir, + section: gdir_sec, + crc32_u32: 0, + }); + + // Build GID mapping for postings & pair index + let mut pair_entries: Vec = Vec::new(); + let mut id2gids: Vec> = vec![Vec::new(); id_vec.len()]; + let mut gn2gids: Vec> = vec![Vec::new(); gn_vec.len()]; + for (gid, (id_id, gn_id, _, _, _, _, _)) in gid_rows.iter().enumerate() { + let gid_u = gid as u64; + id2gids[*id_id as usize].push(gid_u); + gn2gids[*gn_id as usize].push(gid_u); + pair_entries.push((*id_id, *gn_id, gid_u)); + } + pair_entries.sort_unstable(); + + // IDX_ID2GID + let ididx_sec = write_postings_index(&mut file, &id2gids)?; + toc.push(TocEntry { + kind: SectionKind::IdxId2Gid, + section: ididx_sec, + crc32_u32: 0, + }); + // IDX_GNAME2GID + let gnidx_sec = write_postings_index(&mut file, &gn2gids)?; + toc.push(TocEntry { + kind: SectionKind::IdxGName2Gid, + section: gnidx_sec, + crc32_u32: 0, + }); + // IDX_PAIR2GID + let pairidx_sec = write_pair_index(&mut file, &pair_entries)?; + toc.push(TocEntry { + kind: SectionKind::IdxPair2Gid, + section: pairidx_sec, + crc32_u32: 0, + }); + + // TOC + let toc_off = file.len(); + for e in &toc { + let mut ent = [0u8; 32]; + let kind = e.kind as u16; + ent[0..2].copy_from_slice(&kind.to_le_bytes()); + // reserved_u16 zero + ent[4..12].copy_from_slice(&e.section.off.to_le_bytes()); + ent[12..20].copy_from_slice(&e.section.len.to_le_bytes()); + if opts.with_crc { + let start = e.section.off as usize; + let end = start + e.section.len as usize; + let crc = crc32_ieee(&file[start..end]); + ent[20..24].copy_from_slice(&crc.to_le_bytes()); + } + file.extend_from_slice(&ent); + } + + // Header + file[0..4].copy_from_slice(b"R5TU"); + file[4..6].copy_from_slice(&1u16.to_le_bytes()); // version + let mut flags: u16 = 0; + if opts.zstd { + flags |= 1 << 1; + } + file[6..8].copy_from_slice(&flags.to_le_bytes()); + file[8..16].copy_from_slice(&0u64.to_le_bytes()); // created + file[16..24].copy_from_slice(&(toc_off as u64).to_le_bytes()); + file[24..28].copy_from_slice(&(toc.len() as u32).to_le_bytes()); + file[28..32].copy_from_slice(&0u32.to_le_bytes()); + + // Footer with global CRC + let crc = crc32_ieee(&file[..]); + file.extend_from_slice(&crc.to_le_bytes()); + file.extend_from_slice(b"R5TU_ENDMARK"); + + // Atomic write (best-effort) + let tmp_path = path.as_ref().with_extension(".tmp.r5tu"); + fs::write(&tmp_path, &file).map_err(R5Error::Io)?; + fs::rename(&tmp_path, path).map_err(R5Error::Io)?; + Ok(()) +} + +// ---------------- Streaming writer ---------------- +/// Incremental builder for large datasets. +/// +/// Use [`StreamingWriter::add`] to append quads, then [`StreamingWriter::finalize`] +/// to write the file atomically. 
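+///
+/// A short sketch of the intended call sequence (the file name is a
+/// placeholder; imports mirror the integration tests below):
+///
+/// ```no_run
+/// use rdf5d::{Quint, StreamingWriter, Term, writer::WriterOptions};
+/// let mut w = StreamingWriter::new("big.r5tu", WriterOptions { zstd: false, with_crc: true });
+/// w.add(Quint {
+///     id: "src/A".into(),
+///     s: Term::Iri("http://ex/s".into()),
+///     p: Term::Iri("http://ex/p".into()),
+///     o: Term::Iri("http://ex/o".into()),
+///     gname: "g".into(),
+/// }).unwrap();
+/// // finalize() sorts each (id, gname) group by (s, p, o) and writes atomically.
+/// w.finalize().unwrap();
+/// ```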
+#[derive(Debug)] +pub struct StreamingWriter { + opts: WriterOptions, + path: PathBuf, + id_map: BTreeMap, + gn_map: BTreeMap, + term_map: HashMap, + id_vec: Vec, + gn_vec: Vec, + term_vec: Vec, + groups: GroupsMap, +} + +impl StreamingWriter { + /// Create a streaming writer targeting `path` with `opts`. + pub fn new>(path: P, opts: WriterOptions) -> Self { + Self { + opts, + path: path.into(), + id_map: BTreeMap::new(), + gn_map: BTreeMap::new(), + term_map: HashMap::new(), + id_vec: Vec::new(), + gn_vec: Vec::new(), + term_vec: Vec::new(), + groups: BTreeMap::new(), + } + } + + fn intern_id(&mut self, s: &str) -> u32 { + if let Some(&v) = self.id_map.get(s) { + return v; + } + let v = self.id_vec.len() as u32; + self.id_vec.push(s.to_string()); + self.id_map.insert(s.to_string(), v); + v + } + fn intern_gn(&mut self, s: &str) -> u32 { + if let Some(&v) = self.gn_map.get(s) { + return v; + } + let v = self.gn_vec.len() as u32; + self.gn_vec.push(s.to_string()); + self.gn_map.insert(s.to_string(), v); + v + } + fn intern_term(&mut self, t: &Term) -> u64 { + if let Some(&v) = self.term_map.get(t) { + return v; + } + let v = self.term_vec.len() as u64; + self.term_vec.push(t.clone()); + self.term_map.insert(t.clone(), v); + v + } + + /// Add one 5‑tuple to the in‑memory builder. + pub fn add(&mut self, q: Quint) -> Result<()> { + let id_id = self.intern_id(&q.id); + let gn_id = self.intern_gn(&q.gname); + let s = self.intern_term(&q.s); + let p = self.intern_term(&q.p); + let o = self.intern_term(&q.o); + self.groups + .entry((id_id, gn_id)) + .or_default() + .push((s, p, o)); + Ok(()) + } + + /// Finish building and write the file to disk. + pub fn finalize(mut self) -> Result<()> { + // Ensure per-group SPO sort + for v in self.groups.values_mut() { + v.sort_unstable(); + } + + // Build buffers using the same logic as write_file_with_options + let mut file = vec![0u8; 32]; + let mut toc: Vec = Vec::new(); + + let id_sec = write_str_dict(&mut file, &self.id_vec)?; + toc.push(TocEntry { + kind: SectionKind::IdDict, + section: id_sec, + crc32_u32: 0, + }); + let gn_sec = write_str_dict(&mut file, &self.gn_vec)?; + toc.push(TocEntry { + kind: SectionKind::GNameDict, + section: gn_sec, + crc32_u32: 0, + }); + let term_sec = write_term_dict(&mut file, &self.term_vec)?; + toc.push(TocEntry { + kind: SectionKind::TermDict, + section: term_sec, + crc32_u32: 0, + }); + + let tb_off = file.len(); + let mut gid_rows: Vec = Vec::new(); + for ((id_id, gn_id), spo) in &self.groups { + let start = file.len(); + let raw = build_raw_spo(spo)?; + if self.opts.zstd { + #[cfg(feature = "zstd")] + { + file.push(1u8); + let compressed = zstd::encode_all(&raw[..], 0) + .map_err(|_| R5Error::Corrupt("zstd encode".into()))?; + file.extend_from_slice(&(compressed.len() as u32).to_le_bytes()); + file.extend_from_slice(&compressed); + } + #[cfg(not(feature = "zstd"))] + { + return Err(R5Error::Invalid("zstd feature not enabled")); + } + } else { + file.push(0u8); + file.extend_from_slice(&(raw.len() as u32).to_le_bytes()); + file.extend_from_slice(&raw); + } + let sec = Section { + off: start as u64, + len: (file.len() - start) as u64, + }; + let (n_s, n_p, n_t) = raw_counts(&raw)?; + gid_rows.push(( + *id_id, *gn_id, sec, n_t as u64, n_s as u32, n_p as u32, n_t as u32, + )); + } + let tb_sec = Section { + off: tb_off as u64, + len: (file.len() - tb_off) as u64, + }; + toc.push(TocEntry { + kind: SectionKind::TripleBlocks, + section: tb_sec, + crc32_u32: 0, + }); + + // GDIR + let gdir_off = file.len(); + let n_rows 
= gid_rows.len() as u64; + file.extend_from_slice(&n_rows.to_le_bytes()); + file.extend_from_slice(&44u32.to_le_bytes()); + file.extend_from_slice(&0u32.to_le_bytes()); + for (id_id, gn_id, sec, n_triples, n_s, n_p, n_o) in &gid_rows { + file.extend_from_slice(&id_id.to_le_bytes()); + file.extend_from_slice(&gn_id.to_le_bytes()); + file.extend_from_slice(&sec.off.to_le_bytes()); + file.extend_from_slice(&sec.len.to_le_bytes()); + file.extend_from_slice(&n_triples.to_le_bytes()); + file.extend_from_slice(&n_s.to_le_bytes()); + file.extend_from_slice(&n_p.to_le_bytes()); + file.extend_from_slice(&n_o.to_le_bytes()); + } + let gdir_sec = Section { + off: gdir_off as u64, + len: (file.len() - gdir_off) as u64, + }; + toc.push(TocEntry { + kind: SectionKind::GDir, + section: gdir_sec, + crc32_u32: 0, + }); + + // Postings and pair index + let mut pair_entries: Vec = Vec::new(); + let mut id2gids: Vec> = vec![Vec::new(); self.id_vec.len()]; + let mut gn2gids: Vec> = vec![Vec::new(); self.gn_vec.len()]; + for (gid, (id_id, gn_id, _, _, _, _, _)) in gid_rows.iter().enumerate() { + let gid_u = gid as u64; + id2gids[*id_id as usize].push(gid_u); + gn2gids[*gn_id as usize].push(gid_u); + pair_entries.push((*id_id, *gn_id, gid_u)); + } + pair_entries.sort_unstable(); + + let ididx_sec = write_postings_index(&mut file, &id2gids)?; + toc.push(TocEntry { + kind: SectionKind::IdxId2Gid, + section: ididx_sec, + crc32_u32: 0, + }); + let gnidx_sec = write_postings_index(&mut file, &gn2gids)?; + toc.push(TocEntry { + kind: SectionKind::IdxGName2Gid, + section: gnidx_sec, + crc32_u32: 0, + }); + let pairidx_sec = write_pair_index(&mut file, &pair_entries)?; + toc.push(TocEntry { + kind: SectionKind::IdxPair2Gid, + section: pairidx_sec, + crc32_u32: 0, + }); + + // TOC + let toc_off = file.len(); + for e in &toc { + let mut ent = [0u8; 32]; + let kind = e.kind as u16; + ent[0..2].copy_from_slice(&kind.to_le_bytes()); + ent[4..12].copy_from_slice(&e.section.off.to_le_bytes()); + ent[12..20].copy_from_slice(&e.section.len.to_le_bytes()); + if self.opts.with_crc { + let start = e.section.off as usize; + let end = start + e.section.len as usize; + let crc = crc32_ieee(&file[start..end]); + ent[20..24].copy_from_slice(&crc.to_le_bytes()); + } + file.extend_from_slice(&ent); + } + + // Header + file[0..4].copy_from_slice(b"R5TU"); + file[4..6].copy_from_slice(&1u16.to_le_bytes()); + let mut flags: u16 = 0; + if self.opts.zstd { + flags |= 1 << 1; + } + file[6..8].copy_from_slice(&flags.to_le_bytes()); + file[8..16].copy_from_slice(&0u64.to_le_bytes()); + file[16..24].copy_from_slice(&(toc_off as u64).to_le_bytes()); + file[24..28].copy_from_slice(&(toc.len() as u32).to_le_bytes()); + file[28..32].copy_from_slice(&0u32.to_le_bytes()); + + // Footer + let crc = crc32_ieee(&file[..]); + file.extend_from_slice(&crc.to_le_bytes()); + file.extend_from_slice(b"R5TU_ENDMARK"); + + // Write + let tmp = self.path.with_extension(".tmp.r5tu"); + fs::write(&tmp, &file).map_err(R5Error::Io)?; + fs::rename(&tmp, &self.path).map_err(R5Error::Io)?; + Ok(()) + } +} + +// ---------------- Oxigraph helpers ---------------- + +#[cfg(feature = "oxigraph")] +fn term_from_ox_term_ref(t: &oxigraph::model::TermRef<'_>) -> Term { + use oxigraph::model::TermRef as TR; + match t { + TR::NamedNode(n) => Term::Iri(n.as_str().to_string()), + TR::BlankNode(b) => Term::BNode(format!("_:{}", b.as_str())), + TR::Literal(l) => { + let lex = l.value().to_string(); + if let Some(lang) = l.language() { + Term::Literal { + lex, + dt: None, + lang: 
Some(lang.to_string()), + } + } else { + Term::Literal { + lex, + dt: Some(l.datatype().as_str().to_string()), + lang: None, + } + } + } + } +} + +#[cfg(feature = "oxigraph")] +impl StreamingWriter { + pub fn add_oxigraph_graph( + &mut self, + graph: &oxigraph::model::Graph, + id: &str, + gname: &str, + ) -> Result<()> { + use oxigraph::model::NamedOrBlankNodeRef; + for t in graph.iter() { + let s = match &t.subject { + NamedOrBlankNodeRef::NamedNode(n) => Term::Iri(n.as_str().to_string()), + NamedOrBlankNodeRef::BlankNode(b) => Term::BNode(format!("_:{}", b.as_str())), + }; + let p = Term::Iri(t.predicate.as_str().to_string()); + let o = term_from_ox_term_ref(&t.object); + self.add(Quint { + id: id.to_string(), + s, + p, + o, + gname: gname.to_string(), + })?; + } + Ok(()) + } +} + +#[cfg(feature = "oxigraph")] +pub fn write_graph_from_oxigraph>( + path: P, + graph: &oxigraph::model::Graph, + id: &str, + gname: &str, + opts: WriterOptions, +) -> Result<()> { + let mut w = StreamingWriter::new(path.as_ref(), opts); + w.add_oxigraph_graph(graph, id, gname)?; + w.finalize() +} + +#[cfg(feature = "oxigraph")] +pub fn detect_graphname_from_oxigraph(graph: &oxigraph::model::Graph) -> Option { + use oxigraph::model::{NamedNode, NamedOrBlankNodeRef, TermRef}; + let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").ok()?; + let owl_ontology = NamedNode::new("http://www.w3.org/2002/07/owl#Ontology").ok()?; + for t in graph.iter() { + if t.predicate == rdf_type.as_ref() && t.object == TermRef::NamedNode(owl_ontology.as_ref()) + { + return Some(match t.subject { + NamedOrBlankNodeRef::NamedNode(n) => n.as_str().to_string(), + NamedOrBlankNodeRef::BlankNode(b) => format!("_:{}", b.as_str()), + }); + } + } + None +} + +#[cfg(feature = "oxigraph")] +pub fn write_graph_from_oxigraph_auto>( + path: P, + graph: &oxigraph::model::Graph, + opts: WriterOptions, +) -> Result<()> { + let gname = detect_graphname_from_oxigraph(graph).unwrap_or_else(|| "default".to_string()); + write_graph_from_oxigraph(path, graph, "0", &gname, opts) +} + +#[cfg(feature = "oxigraph")] +pub fn detect_graphname_from_store(store: &oxigraph::store::Store) -> Option { + use oxigraph::model::{GraphNameRef, NamedNode, TermRef}; + let rdf_type = NamedNode::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").ok()?; + let owl_ontology = NamedNode::new("http://www.w3.org/2002/07/owl#Ontology").ok()?; + let mut it = store.quads_for_pattern( + None, + Some(rdf_type.as_ref()), + Some(TermRef::NamedNode(owl_ontology.as_ref())), + Some(GraphNameRef::DefaultGraph), + ); + if let Some(Ok(q)) = it.next() { + return Some(match &q.subject { + oxigraph::model::NamedOrBlankNode::NamedNode(n) => n.as_str().to_string(), + oxigraph::model::NamedOrBlankNode::BlankNode(b) => format!("_:{}", b.as_str()), + }); + } + None +} + +fn write_str_dict(buf: &mut Vec, strings: &[String]) -> Result
{ + let off = buf.len(); + // header 52 bytes + buf.resize(buf.len() + 52, 0); + let blob_off = buf.len(); + for s in strings { + buf.extend_from_slice(s.as_bytes()); + } + let blob_len = buf.len() - blob_off; + let offs_off = buf.len(); + // offs len = (n+1) * 4 + let mut cur = 0u32; + for s in strings { + buf.extend_from_slice(&cur.to_le_bytes()); + cur = cur + .checked_add(s.len() as u32) + .ok_or_else(|| R5Error::Corrupt("blob size".into()))?; + } + buf.extend_from_slice(&cur.to_le_bytes()); + let offs_len = buf.len() - offs_off; + // build coarse index (key16 + id + padding) entries sorted by key16 then id + let mut idx_entries: Vec<([u8; 16], u32)> = Vec::with_capacity(strings.len()); + for (i, s) in strings.iter().enumerate() { + let mut key = [0u8; 16]; + for (j, b) in s + .to_ascii_lowercase() + .as_bytes() + .iter() + .take(16) + .enumerate() + { + key[j] = *b; + } + idx_entries.push((key, i as u32)); + } + idx_entries.sort_unstable_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1))); + let idx_off; + let idx_len; + if !idx_entries.is_empty() { + idx_off = buf.len(); + for (key, id) in idx_entries { + buf.extend_from_slice(&key); + buf.extend_from_slice(&id.to_le_bytes()); + buf.extend_from_slice(&0u32.to_le_bytes()); // padding to 24 bytes + } + idx_len = buf.len() - idx_off; + } else { + idx_off = 0; + idx_len = 0; + } + // fill header + let n = strings.len() as u32; + buf[off..off + 4].copy_from_slice(&n.to_le_bytes()); + buf[off + 4..off + 12].copy_from_slice(&(blob_off as u64).to_le_bytes()); + buf[off + 12..off + 20].copy_from_slice(&(blob_len as u64).to_le_bytes()); + buf[off + 20..off + 28].copy_from_slice(&(offs_off as u64).to_le_bytes()); + buf[off + 28..off + 36].copy_from_slice(&(offs_len as u64).to_le_bytes()); + buf[off + 36..off + 44].copy_from_slice(&(idx_off as u64).to_le_bytes()); + buf[off + 44..off + 52].copy_from_slice(&(idx_len as u64).to_le_bytes()); + Ok(Section { + off: off as u64, + len: (buf.len() - off) as u64, + }) +} + +fn write_term_dict(buf: &mut Vec, terms: &[Term]) -> Result
{ + let off = buf.len(); + // header 33 bytes + buf.resize(buf.len() + 33, 0); + // kinds + let kinds_off = buf.len(); + for t in terms { + buf.push(match t { + Term::Iri(_) => 0, + Term::BNode(_) => 1, + Term::Literal { .. } => 2, + }); + } + // data blob + let data_off = buf.len(); + let mut offs: Vec = Vec::with_capacity(terms.len() + 1); + let mut cur: u64 = 0; + offs.push(cur); + for t in terms { + match t { + Term::Iri(s) | Term::BNode(s) => { + buf.extend_from_slice(s.as_bytes()); + cur += s.len() as u64; + } + Term::Literal { lex, dt, lang } => { + push_uvarint(lex.len() as u64, buf); + buf.extend_from_slice(lex.as_bytes()); + match dt { + Some(d) => { + buf.push(1); + push_uvarint(d.len() as u64, buf); + buf.extend_from_slice(d.as_bytes()); + } + None => buf.push(0), + } + match lang { + Some(l) => { + buf.push(1); + push_uvarint(l.len() as u64, buf); + buf.extend_from_slice(l.as_bytes()); + } + None => buf.push(0), + } + cur = (buf.len() - data_off) as u64; + } + } + offs.push(cur); + } + // offs u64*(n+1) + let offs_off = buf.len(); + for o in offs { + buf.extend_from_slice(&o.to_le_bytes()); + } + + // fill header + buf[off] = 0; // width + buf[off + 1..off + 9].copy_from_slice(&(terms.len() as u64).to_le_bytes()); + buf[off + 9..off + 17].copy_from_slice(&(kinds_off as u64).to_le_bytes()); + buf[off + 17..off + 25].copy_from_slice(&(data_off as u64).to_le_bytes()); + buf[off + 25..off + 33].copy_from_slice(&(offs_off as u64).to_le_bytes()); + Ok(Section { + off: off as u64, + len: (buf.len() - off) as u64, + }) +} + +fn build_raw_spo(spo: &[(u64, u64, u64)]) -> Result> { + // Precondition: spo sorted by (s,p,o) + let n_t = spo.len(); + let mut out = Vec::with_capacity(n_t * 2); + // collect unique S and P structure + let mut s_vals: Vec = Vec::new(); + let mut s_heads: Vec = Vec::new(); + let mut p_vals: Vec = Vec::new(); + let mut p_heads: Vec = Vec::new(); + let mut o_vals: Vec = Vec::new(); + + let mut i = 0usize; + while i < spo.len() { + let s = spo[i].0; + s_vals.push(s); + s_heads.push(p_vals.len() as u64); + // group by s + let mut j = i; + while j < spo.len() { + if spo[j].0 != s { + break; + } + // new p run + let p = spo[j].1; + p_vals.push(p); + p_heads.push(o_vals.len() as u64); + // group by (s,p) + let mut k = j; + while k < spo.len() && spo[k].0 == s && spo[k].1 == p { + o_vals.push(spo[k].2); + k += 1; + } + j = k; + } + i = j; + } + s_heads.push(p_vals.len() as u64); + p_heads.push(o_vals.len() as u64); + + // nS, nP, nT + push_uvarint(s_vals.len() as u64, &mut out); + push_uvarint(p_vals.len() as u64, &mut out); + push_uvarint(o_vals.len() as u64, &mut out); + // S_vals delta-coded + if !s_vals.is_empty() { + let mut prev = 0u64; + for (idx, v) in s_vals.iter().enumerate() { + if idx == 0 { + push_uvarint(*v, &mut out); + prev = *v; + } else { + push_uvarint( + v.checked_sub(prev) + .ok_or_else(|| R5Error::Corrupt("s delta underflow".into()))?, + &mut out, + ); + prev = *v; + } + } + } + // S_heads + for v in &s_heads { + push_uvarint(*v, &mut out); + } + // P_vals delta-coded per S-run + for s_idx in 0..s_vals.len() { + let start = s_heads[s_idx] as usize; + let end = s_heads[s_idx + 1] as usize; + if start == end { + continue; + } + let mut prev = 0u64; + for (i, idx) in (start..end).enumerate() { + let v = p_vals[idx]; + if i == 0 { + push_uvarint(v, &mut out); + prev = v; + } else { + push_uvarint( + v.checked_sub(prev) + .ok_or_else(|| R5Error::Corrupt("p delta underflow".into()))?, + &mut out, + ); + prev = v; + } + } + } + // P_heads + for v in 
&p_heads { + push_uvarint(*v, &mut out); + } + // O_vals delta-coded per (S,P)-run + for p_idx in 0..p_vals.len() { + let start = p_heads[p_idx] as usize; + let end = p_heads[p_idx + 1] as usize; + if start == end { + continue; + } + let mut prev = 0u64; + for (i, idx) in (start..end).enumerate() { + let v = o_vals[idx]; + if i == 0 { + push_uvarint(v, &mut out); + prev = v; + } else { + push_uvarint( + v.checked_sub(prev) + .ok_or_else(|| R5Error::Corrupt("o delta underflow".into()))?, + &mut out, + ); + prev = v; + } + } + } + Ok(out) +} + +fn raw_counts(raw: &[u8]) -> Result<(usize, usize, usize)> { + let (n_s, o1) = read_uvarint(raw, 0).ok_or_else(|| R5Error::Corrupt("nS".into()))?; + let (n_p, o2) = read_uvarint(raw, o1).ok_or_else(|| R5Error::Corrupt("nP".into()))?; + let (n_t, _) = read_uvarint(raw, o2).ok_or_else(|| R5Error::Corrupt("nT".into()))?; + Ok((n_s as usize, n_p as usize, n_t as usize)) +} + +fn write_postings_index(buf: &mut Vec, lists: &[Vec]) -> Result
{ + let off = buf.len(); + buf.resize(buf.len() + 24, 0); // header + let offs_off = buf.len(); + let mut cur = 0u64; + buf.extend_from_slice(&cur.to_le_bytes()); + let mut blob = Vec::new(); + for list in lists { + // encode list + if list.is_empty() { + push_uvarint(0, &mut blob); + } else { + push_uvarint(list.len() as u64, &mut blob); + push_uvarint(list[0], &mut blob); + for w in list.windows(2) { + push_uvarint(w[1] - w[0], &mut blob); + } + } + cur += blob.len() as u64 - cur; + buf.extend_from_slice(&(blob.len() as u64).to_le_bytes()); + } + let blob_off = buf.len(); + buf.extend_from_slice(&blob); + // fill header + buf[off..off + 8].copy_from_slice(&(lists.len() as u64).to_le_bytes()); + buf[off + 8..off + 16].copy_from_slice(&(offs_off as u64).to_le_bytes()); + buf[off + 16..off + 24].copy_from_slice(&(blob_off as u64).to_le_bytes()); + Ok(Section { + off: off as u64, + len: (buf.len() - off) as u64, + }) +} + +fn write_pair_index(buf: &mut Vec, pairs: &[(u32, u32, u64)]) -> Result
{ + let off = buf.len(); + buf.extend_from_slice(&(pairs.len() as u64).to_le_bytes()); + let pairs_off = buf.len() + 8; // we will place entries after writing pairs_off + buf.extend_from_slice(&(pairs_off as u64).to_le_bytes()); + for (id_id, gn_id, gid) in pairs { + buf.extend_from_slice(&id_id.to_le_bytes()); + buf.extend_from_slice(&gn_id.to_le_bytes()); + buf.extend_from_slice(&gid.to_le_bytes()); + } + Ok(Section { + off: off as u64, + len: (buf.len() - off) as u64, + }) +} + +fn read_uvarint(buf: &[u8], mut off: usize) -> Option<(u64, usize)> { + let (mut x, mut s) = (0u64, 0u32); + for _ in 0..10 { + let b = *buf.get(off)? as u64; + off += 1; + x |= (b & 0x7f) << s; + if b & 0x80 == 0 { + return Some((x, off)); + } + s += 7; + } + None +} diff --git a/rdf5d/tests/crc_mismatch.rs b/rdf5d/tests/crc_mismatch.rs new file mode 100644 index 0000000..23c0d79 --- /dev/null +++ b/rdf5d/tests/crc_mismatch.rs @@ -0,0 +1,35 @@ +use rdf5d::{ + reader::R5tuFile, + writer::{Quint, Term, write_file}, +}; + +#[test] +fn detects_global_crc_mismatch() { + let q = Quint { + id: "X".into(), + s: Term::Iri("http://ex/s".into()), + p: Term::Iri("http://ex/p".into()), + o: Term::Literal { + lex: "v".into(), + dt: None, + lang: None, + }, + gname: "g".into(), + }; + let mut path = std::env::temp_dir(); + path.push("crc_bad.r5tu"); + write_file(&path, &[q]).unwrap(); + + // Corrupt a byte in the middle of the file (but not the header magic) + let mut bytes = std::fs::read(&path).unwrap(); + let pos = 40.min(bytes.len() - 17); // before footer + bytes[pos] ^= 0xFF; // flip + std::fs::write(&path, &bytes).unwrap(); + + let err = R5tuFile::open(&path).unwrap_err(); + let _ = std::fs::remove_file(&path); + match err { + rdf5d::reader::R5Error::Corrupt(m) => assert!(m.contains("CRC")), + _ => panic!("expected CRC mismatch error"), + } +} diff --git a/rdf5d/tests/e2e_multi_graphs.rs b/rdf5d/tests/e2e_multi_graphs.rs new file mode 100644 index 0000000..a2317e0 --- /dev/null +++ b/rdf5d/tests/e2e_multi_graphs.rs @@ -0,0 +1,59 @@ +use rdf5d::{Quint, R5tuFile, StreamingWriter, Term, writer::WriterOptions}; + +#[test] +fn end_to_end_multiple_graphs_and_indexes() { + // Build three graphs across two ids and two graphnames + let qs = vec![ + Quint { + id: "src/A".into(), + gname: "g1".into(), + s: Term::Iri("ex:s1".into()), + p: Term::Iri("ex:p".into()), + o: Term::Iri("ex:o1".into()), + }, + Quint { + id: "src/A".into(), + gname: "g2".into(), + s: Term::Iri("ex:s2".into()), + p: Term::Iri("ex:p".into()), + o: Term::Iri("ex:o2".into()), + }, + Quint { + id: "src/B".into(), + gname: "g2".into(), + s: Term::Iri("ex:s3".into()), + p: Term::Iri("ex:p".into()), + o: Term::Iri("ex:o3".into()), + }, + ]; + + let mut path = std::env::temp_dir(); + path.push("e2e_multi.r5tu"); + let mut w = StreamingWriter::new( + &path, + WriterOptions { + zstd: false, + with_crc: true, + }, + ); + for q in qs { + w.add(q).unwrap(); + } + w.finalize().unwrap(); + + let f = R5tuFile::open(&path).unwrap(); + // by id + let a = f.enumerate_by_id("src/A").unwrap(); + assert_eq!(a.len(), 2); + // by graphname + let g2 = f.enumerate_by_graphname("g2").unwrap(); + assert_eq!(g2.len(), 2); + // resolve pair + let gr = f.resolve_gid("src/B", "g2").unwrap().unwrap(); + let v: Vec<_> = f.triples_ids(gr.gid).unwrap().collect(); + assert_eq!(v.len(), 1); + let (s, _, _) = v[0]; + assert_eq!(f.term_to_string(s).unwrap(), "ex:s3"); + + let _ = std::fs::remove_file(&path); +} diff --git a/rdf5d/tests/e2e_roundtrip_mmap.rs b/rdf5d/tests/e2e_roundtrip_mmap.rs new 
file mode 100644 index 0000000..39502c4 --- /dev/null +++ b/rdf5d/tests/e2e_roundtrip_mmap.rs @@ -0,0 +1,76 @@ +#![cfg(feature = "mmap")] +use rdf5d::{Quint, R5tuFile, StreamingWriter, Term, writer::WriterOptions}; + +#[test] +fn mmap_roundtrip_two_graphs() { + // Build two graphs under same graphname, different ids + let mut quints = Vec::new(); + // Graph A: two triples + quints.push(Quint { + id: "src/A".into(), + gname: "g".into(), + s: Term::Iri("http://ex/s1".into()), + p: Term::Iri("http://ex/p1".into()), + o: Term::Iri("http://ex/o1".into()), + }); + quints.push(Quint { + id: "src/A".into(), + gname: "g".into(), + s: Term::Iri("http://ex/s1".into()), + p: Term::Iri("http://ex/p2".into()), + o: Term::Literal { + lex: "v2".into(), + dt: None, + lang: Some("en".into()), + }, + }); + // Graph B: one triple + quints.push(Quint { + id: "src/B".into(), + gname: "g".into(), + s: Term::Iri("http://ex/s2".into()), + p: Term::Iri("http://ex/p1".into()), + o: Term::Literal { + lex: "42".into(), + dt: Some("http://www.w3.org/2001/XMLSchema#integer".into()), + lang: None, + }, + }); + + // Write file + let mut path = std::env::temp_dir(); + path.push("e2e_mmap.r5tu"); + let opts = WriterOptions { + zstd: false, + with_crc: true, + }; + let mut w = StreamingWriter::new(&path, opts); + for q in quints { + w.add(q).unwrap(); + } + w.finalize().unwrap(); + + // Open via mmap + let f = R5tuFile::open_mmap(&path).expect("open_mmap"); + + // enumerate_by_id + let a = f.enumerate_by_id("src/A").unwrap(); + assert_eq!(a.len(), 1); + assert_eq!(a[0].n_triples, 2); + // enumerate_by_graphname + let g = f.enumerate_by_graphname("g").unwrap(); + assert_eq!(g.len(), 2); + // resolve_gid + iterate + let gr_b = f.resolve_gid("src/B", "g").unwrap().unwrap(); + let ts_b: Vec<_> = f.triples_ids(gr_b.gid).unwrap().collect(); + assert_eq!(ts_b.len(), 1); + let (s, p, o) = ts_b[0]; + assert_eq!(f.term_to_string(s).unwrap(), "http://ex/s2"); + assert_eq!(f.term_to_string(p).unwrap(), "http://ex/p1"); + assert_eq!( + f.term_to_string(o).unwrap(), + "\"42\"^^" + ); + + let _ = std::fs::remove_file(&path); +} diff --git a/rdf5d/tests/edge_cases.rs b/rdf5d/tests/edge_cases.rs new file mode 100644 index 0000000..5d70884 --- /dev/null +++ b/rdf5d/tests/edge_cases.rs @@ -0,0 +1,137 @@ +use rdf5d::{ + StreamingWriter, + reader::R5tuFile, + writer::{Quint, Term, WriterOptions, write_file, write_file_with_options}, +}; + +fn mk_temp(name: &str) -> std::path::PathBuf { + let mut p = std::env::temp_dir(); + p.push(name); + p +} + +#[test] +fn empty_input_produces_valid_file() { + let path = mk_temp("empty.r5tu"); + let quints: Vec = Vec::new(); + write_file(&path, &quints).expect("write empty"); + let f = R5tuFile::open(&path).expect("open"); + assert!(!f.toc().is_empty()); // has sections + // Enumerations yield empty + assert!(f.enumerate_by_graphname("g").unwrap().is_empty()); + let _ = std::fs::remove_file(&path); +} + +#[test] +fn streaming_empty_finalize() { + let path = mk_temp("empty_stream.r5tu"); + let w = StreamingWriter::new( + &path, + WriterOptions { + zstd: false, + with_crc: true, + }, + ); + w.finalize().expect("finalize empty"); + let f = R5tuFile::open(&path).expect("open"); + assert!(!f.toc().is_empty()); + let _ = std::fs::remove_file(&path); +} + +#[test] +fn postings_monotonicity_and_spo_order() { + // Build three graphs under two ids to exercise postings + let s1 = Term::Iri("http://ex/s1".into()); + let s2 = Term::Iri("http://ex/s2".into()); + let p1 = Term::Iri("http://ex/p1".into()); + let p2 = 
Term::Iri("http://ex/p2".into()); + let o1 = Term::Literal { + lex: "v1".into(), + dt: None, + lang: None, + }; + let o2 = Term::Literal { + lex: "v2".into(), + dt: None, + lang: None, + }; + let o3 = Term::Literal { + lex: "v3".into(), + dt: None, + lang: None, + }; + let qs = vec![ + Quint { + id: "A".into(), + s: s1.clone(), + p: p1.clone(), + o: o1.clone(), + gname: "g".into(), + }, + Quint { + id: "A".into(), + s: s1.clone(), + p: p2.clone(), + o: o2.clone(), + gname: "g".into(), + }, + Quint { + id: "B".into(), + s: s2.clone(), + p: p1.clone(), + o: o3.clone(), + gname: "g".into(), + }, + ]; + let path = mk_temp("mono.r5tu"); + write_file_with_options( + &path, + &qs, + WriterOptions { + zstd: false, + with_crc: true, + }, + ) + .unwrap(); + let f = R5tuFile::open(&path).unwrap(); + // Postings monotonicity via enumerate_by_graphname("g"): gids must strictly increase + let mut last_gid = None; + for gr in f.enumerate_by_graphname("g").unwrap() { + if let Some(g) = last_gid { + assert!(gr.gid > g); + } + last_gid = Some(gr.gid); + // Check SPO order non-decreasing within block and counts match + let mut prev = None; + let mut count = 0u64; + for t in f.triples_ids(gr.gid).unwrap() { + if let Some(pp) = prev { + assert!(pp <= t); + } + prev = Some(t); + count += 1; + } + assert_eq!(count, gr.n_triples); + } + let _ = std::fs::remove_file(&path); +} + +#[test] +fn long_strings_and_lookup() { + let long = "a".repeat(128); + let q = Quint { + id: long.clone(), + s: Term::Iri("http://ex/s".into()), + p: Term::Iri("http://ex/p".into()), + o: Term::Iri("http://ex/o".into()), + gname: long.clone(), + }; + let path = mk_temp("longstrs.r5tu"); + write_file(&path, &[q]).unwrap(); + let f = R5tuFile::open(&path).unwrap(); + let by_id = f.enumerate_by_id(&long).unwrap(); + assert_eq!(by_id.len(), 1); + let by_g = f.enumerate_by_graphname(&long).unwrap(); + assert_eq!(by_g.len(), 1); + let _ = std::fs::remove_file(&path); +} diff --git a/rdf5d/tests/oxigraph_conv.rs b/rdf5d/tests/oxigraph_conv.rs new file mode 100644 index 0000000..78d42e9 --- /dev/null +++ b/rdf5d/tests/oxigraph_conv.rs @@ -0,0 +1,38 @@ +#![cfg(feature = "oxigraph")] +use rdf5d::{ + reader::R5tuFile, + writer::{Quint, Term, write_file}, +}; + +#[test] +fn to_oxigraph_graph_basic() { + let s1 = Term::Iri("http://ex/s1".into()); + let p1 = Term::Iri("http://ex/p1".into()); + let o1 = Term::Literal { + lex: "v1".into(), + dt: None, + lang: Some("en".into()), + }; + let q = Quint { + id: "src/A".into(), + s: s1, + p: p1, + o: o1, + gname: "g".into(), + }; + let mut path = std::env::temp_dir(); + path.push("oxigraph_conv.r5tu"); + write_file(&path, &[q]).unwrap(); + let f = R5tuFile::open(&path).unwrap(); + let gr = f.resolve_gid("src/A", "g").unwrap().unwrap(); + let g = f.to_oxigraph_graph(gr.gid).unwrap(); + assert_eq!(g.iter().count(), 1); + // Iterator over oxigraph triples + let triples: Vec<_> = f + .oxigraph_triples(gr.gid) + .unwrap() + .collect::>() + .unwrap(); + assert_eq!(triples.len(), 1); + let _ = std::fs::remove_file(&path); +} diff --git a/rdf5d/tests/roundtrip.rs b/rdf5d/tests/roundtrip.rs new file mode 100644 index 0000000..7a16f19 --- /dev/null +++ b/rdf5d/tests/roundtrip.rs @@ -0,0 +1,82 @@ +use rdf5d::{ + reader::R5tuFile, + writer::{Quint, Term, write_file}, +}; + +#[test] +fn writer_reader_roundtrip_two_graphs() { + // Build input quints (ids: src/A, src/B; graphname: g) + // Graph A: (s1,p1,o1), (s1,p2,o2) + // Graph B: (s2,p1,o3) + let s1 = Term::Iri("http://ex/s1".into()); + let s2 = 
Term::Iri("http://ex/s2".into()); + let p1 = Term::Iri("http://ex/p1".into()); + let p2 = Term::Iri("http://ex/p2".into()); + let o1 = Term::Literal { + lex: "v1".into(), + dt: None, + lang: None, + }; + let o2 = Term::Literal { + lex: "v2".into(), + dt: None, + lang: Some("en".into()), + }; + let o3 = Term::BNode("_:b3".into()); + + let quints = vec![ + Quint { + id: "src/A".into(), + s: s1.clone(), + p: p1.clone(), + o: o1.clone(), + gname: "g".into(), + }, + Quint { + id: "src/A".into(), + s: s1.clone(), + p: p2.clone(), + o: o2.clone(), + gname: "g".into(), + }, + Quint { + id: "src/B".into(), + s: s2.clone(), + p: p1.clone(), + o: o3.clone(), + gname: "g".into(), + }, + ]; + + let mut path = std::env::temp_dir(); + path.push("roundtrip.r5tu"); + write_file(&path, &quints).expect("write"); + + let f = R5tuFile::open(&path).expect("open"); + + // enumerate_by_id("src/A") + let v = f.enumerate_by_id("src/A").expect("enum id"); + assert_eq!(v.len(), 1); + assert_eq!(v[0].n_triples, 2); + + // enumerate_by_graphname("g") => 2 graphs + let w = f.enumerate_by_graphname("g").expect("enum g"); + assert_eq!(w.len(), 2); + assert!(w.iter().any(|gr| gr.id == "src/A")); + assert!(w.iter().any(|gr| gr.id == "src/B")); + + // resolve_gid("src/B","g") + let gr = f.resolve_gid("src/B", "g").expect("resolve").expect("some"); + let triples: Vec<_> = f.triples_ids(gr.gid).expect("triples").collect(); + assert_eq!(triples.len(), 1); + let (s, p, o) = triples[0]; + // term_to_string reproduces + let ss = f.term_to_string(s).expect("s"); + let pp = f.term_to_string(p).expect("p"); + let oo = f.term_to_string(o).expect("o"); + assert_eq!(ss, "http://ex/s2"); + assert_eq!(pp, "http://ex/p1"); + assert_eq!(oo, "_:b3"); + + let _ = std::fs::remove_file(&path); +} diff --git a/rdf5d/tests/roundtrip_zstd.rs b/rdf5d/tests/roundtrip_zstd.rs new file mode 100644 index 0000000..f7b46ea --- /dev/null +++ b/rdf5d/tests/roundtrip_zstd.rs @@ -0,0 +1,80 @@ +#![cfg(feature = "zstd")] +use rdf5d::{ + reader::R5tuFile, + writer::{Quint, Term, WriterOptions, write_file_with_options}, +}; + +#[test] +fn roundtrip_with_zstd_blocks_and_crc() { + // Build input quints for two graphs under the same graphname + let s1 = Term::Iri("http://ex/s1".into()); + let s2 = Term::Iri("http://ex/s2".into()); + let p1 = Term::Iri("http://ex/p1".into()); + let p2 = Term::Iri("http://ex/p2".into()); + let o1 = Term::Literal { + lex: "v1".into(), + dt: None, + lang: None, + }; + let o2 = Term::Literal { + lex: "v2".into(), + dt: None, + lang: Some("en".into()), + }; + let o3 = Term::BNode("_:b3".into()); + + let mut quints = Vec::new(); + quints.push(Quint { + id: "src/A".into(), + s: s1.clone(), + p: p1.clone(), + o: o1.clone(), + gname: "g".into(), + }); + quints.push(Quint { + id: "src/A".into(), + s: s1.clone(), + p: p2.clone(), + o: o2.clone(), + gname: "g".into(), + }); + quints.push(Quint { + id: "src/B".into(), + s: s2.clone(), + p: p1.clone(), + o: o3.clone(), + gname: "g".into(), + }); + + let opts = WriterOptions { + zstd: true, + with_crc: true, + }; + let mut path = std::env::temp_dir(); + path.push("roundtrip_zstd.r5tu"); + write_file_with_options(&path, &quints, opts).expect("write zstd file"); + + let f = R5tuFile::open(&path).expect("open"); + // flags bit1 should be set (zstd) + assert_eq!(f.header().flags_u16 & (1 << 1), 1 << 1); + + // enumerate_by_id("src/A") → 1 graph + let v = f.enumerate_by_id("src/A").expect("enum id"); + assert_eq!(v.len(), 1); + assert_eq!(v[0].n_triples, 2); + + // enumerate_by_graphname("g") → 2 
diff --git a/rdf5d/tests/stream_writer.rs b/rdf5d/tests/stream_writer.rs
new file mode 100644
index 0000000..f0bc3cd
--- /dev/null
+++ b/rdf5d/tests/stream_writer.rs
@@ -0,0 +1,94 @@
+use rdf5d::{Quint, StreamingWriter, Term, reader::R5tuFile, writer::WriterOptions};
+
+#[test]
+fn streaming_writer_roundtrip_interleaved_order() {
+    let mut path = std::env::temp_dir();
+    path.push("stream_roundtrip.r5tu");
+    let opts = WriterOptions {
+        zstd: false,
+        with_crc: true,
+    };
+    let mut w = StreamingWriter::new(&path, opts);
+
+    // Intentionally interleave graphs and out-of-order SPO to ensure sorting at finalize
+    let s1 = Term::Iri("http://ex/s1".into());
+    let s2 = Term::Iri("http://ex/s2".into());
+    let p1 = Term::Iri("http://ex/p1".into());
+    let p2 = Term::Iri("http://ex/p2".into());
+    let o1 = Term::Literal {
+        lex: "v1".into(),
+        dt: None,
+        lang: None,
+    };
+    let o2 = Term::Literal {
+        lex: "v2".into(),
+        dt: None,
+        lang: Some("en".into()),
+    };
+    let o3 = Term::BNode("_:b3".into());
+
+    w.add(Quint {
+        id: "src/B".into(),
+        s: s2.clone(),
+        p: p1.clone(),
+        o: o3.clone(),
+        gname: "g".into(),
+    })
+    .unwrap();
+    w.add(Quint {
+        id: "src/A".into(),
+        s: s1.clone(),
+        p: p2.clone(),
+        o: o2.clone(),
+        gname: "g".into(),
+    })
+    .unwrap();
+    w.add(Quint {
+        id: "src/A".into(),
+        s: s1.clone(),
+        p: p1.clone(),
+        o: o1.clone(),
+        gname: "g".into(),
+    })
+    .unwrap();
+
+    w.finalize().expect("finalize");
+
+    let f = R5tuFile::open(&path).expect("open");
+    let v = f.enumerate_by_graphname("g").unwrap();
+    assert_eq!(v.len(), 2);
+    // Graph A: verify both triples present via strings (order-agnostic)
+    let a = f.resolve_gid("src/A", "g").unwrap().unwrap();
+    let triples_a: Vec<_> = f.triples_ids(a.gid).unwrap().collect();
+    assert_eq!(triples_a.len(), 2);
+    let mut set_a = std::collections::HashSet::new();
+    for (s, p, o) in triples_a {
+        set_a.insert((
+            f.term_to_string(s).unwrap(),
+            f.term_to_string(p).unwrap(),
+            f.term_to_string(o).unwrap(),
+        ));
+    }
+    let mut expected_a = std::collections::HashSet::new();
+    expected_a.insert((
+        "http://ex/s1".to_string(),
+        "http://ex/p1".to_string(),
+        "\"v1\"".to_string(),
+    ));
+    expected_a.insert((
+        "http://ex/s1".to_string(),
+        "http://ex/p2".to_string(),
+        "\"v2\"@en".to_string(),
+    ));
+    assert_eq!(set_a, expected_a);
+    // Graph B: verify single triple
+    let b = f.resolve_gid("src/B", "g").unwrap().unwrap();
+    let triples_b: Vec<_> = f.triples_ids(b.gid).unwrap().collect();
+    assert_eq!(triples_b.len(), 1);
+    let (s, p, o) = triples_b[0];
+    assert_eq!(f.term_to_string(s).unwrap(), "http://ex/s2");
+    assert_eq!(f.term_to_string(p).unwrap(), "http://ex/p1");
+    assert_eq!(f.term_to_string(o).unwrap(), "_:b3");
+
+    let _ = std::fs::remove_file(&path);
+}
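
The comment in streaming_writer_roundtrip_interleaved_order above says quints are deliberately interleaved "to ensure sorting at finalize". A minimal sketch of what such a finalize-time sort can look like, assuming a lexicographic (id, gname, s, p, o) key; the grouping and SPO order match what the tests assert, but the exact key the writer uses is an assumption here:

fn main() {
    // Quints as (id, gname, s, p, o); tuples compare lexicographically,
    // so a plain sort groups graphs together and puts each graph's
    // triples into SPO order.
    let mut quints = vec![
        ("src/B", "g", "s2", "p1", "o3"),
        ("src/A", "g", "s1", "p2", "o2"),
        ("src/A", "g", "s1", "p1", "o1"),
    ];
    quints.sort();
    assert_eq!(quints[0], ("src/A", "g", "s1", "p1", "o1"));
    assert_eq!(quints[2], ("src/B", "g", "s2", "p1", "o3"));
}
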
diff --git a/rdf5d/tests/update_graph.rs b/rdf5d/tests/update_graph.rs
new file mode 100644
index 0000000..ac4b2e0
--- /dev/null
+++ b/rdf5d/tests/update_graph.rs
@@ -0,0 +1,80 @@
+use rdf5d::{
+    reader::R5tuFile,
+    replace_graph,
+    writer::{Quint, Term, write_file},
+};
+
+#[test]
+fn replace_entire_graph_preserves_others() {
+    // Initial dataset: two graphs under the same graphname
+    let quints = vec![
+        Quint {
+            id: "src/A".into(),
+            gname: "g".into(),
+            s: Term::Iri("ex:s1".into()),
+            p: Term::Iri("ex:p".into()),
+            o: Term::Iri("ex:o1".into()),
+        },
+        Quint {
+            id: "src/B".into(),
+            gname: "g".into(),
+            s: Term::Iri("ex:s2".into()),
+            p: Term::Iri("ex:p".into()),
+            o: Term::Iri("ex:o2".into()),
+        },
+    ];
+
+    let mut in_path = std::env::temp_dir();
+    in_path.push("update_in.r5tu");
+    write_file(&in_path, &quints).expect("write input");
+
+    // New content for graph (src/A, g)
+    let new_triples = vec![
+        (
+            Term::Iri("ex:s1".into()),
+            Term::Iri("ex:p2".into()),
+            Term::Literal {
+                lex: "v2".into(),
+                dt: None,
+                lang: Some("en".into()),
+            },
+        ),
+        (
+            Term::Iri("ex:s3".into()),
+            Term::Iri("ex:p3".into()),
+            Term::Iri("ex:o3".into()),
+        ),
+    ];
+
+    let mut out_path = std::env::temp_dir();
+    out_path.push("update_out.r5tu");
+    replace_graph(&in_path, &out_path, "src/A", "g", &new_triples).expect("replace ok");
+
+    // Validate output
+    let f = R5tuFile::open(&out_path).expect("open out");
+    // Graphs by graphname still two
+    let gs = f.enumerate_by_graphname("g").expect("enum g");
+    assert_eq!(gs.len(), 2);
+
+    // src/A now has 2 triples with the new predicate/object
+    let a = f
+        .resolve_gid("src/A", "g")
+        .expect("resolve A")
+        .expect("some");
+    let a_triples: Vec<_> = f.triples_ids(a.gid).expect("triples A").collect();
+    assert_eq!(a_triples.len(), 2);
+
+    // src/B remains unchanged
+    let b = f
+        .resolve_gid("src/B", "g")
+        .expect("resolve B")
+        .expect("some");
+    let b_triples: Vec<_> = f.triples_ids(b.gid).expect("triples B").collect();
+    assert_eq!(b_triples.len(), 1);
+    // Check subject string for B stayed the same
+    let (s_b, _, _) = b_triples[0];
+    assert_eq!(f.term_to_string(s_b).unwrap(), "ex:s2");
+
+    let _ = std::fs::remove_file(&in_path);
+    let _ = std::fs::remove_file(&out_path);
+}
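
As exercised above, replace_graph writes a new file rather than mutating its input. A common way to layer an in-place update on top of such an API is write-then-rename; a minimal sketch under that assumption, using only the (input, output, id, graphname, triples) shape shown in the test. update_in_place and the temp-file naming are hypothetical, not part of rdf5d, and the sketch assumes the path arguments are generic over AsRef<Path> and that the error type converts into Box<dyn Error>:

use std::path::Path;

use rdf5d::{Term, replace_graph};

// Hypothetical helper: write the updated dataset next to `path`, then
// swap it in. rename is atomic when source and destination are on the
// same filesystem (POSIX), so readers see either the old or new file.
fn update_in_place(
    path: &Path,
    id: &str,
    gname: &str,
    triples: &[(Term, Term, Term)],
) -> Result<(), Box<dyn std::error::Error>> {
    let tmp = path.with_extension("r5tu.tmp");
    replace_graph(path, &tmp, id, gname, triples)?;
    std::fs::rename(&tmp, path)?;
    Ok(())
}
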