diff --git a/Cargo.lock b/Cargo.lock
index 9ef2078e64..c4188c14cc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -445,6 +445,28 @@ dependencies = [
"regex-syntax 0.8.5",
]
+[[package]]
+name = "arroy"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
+dependencies = [
+ "bytemuck",
+ "byteorder",
+ "enum-iterator",
+ "heed",
+ "memmap2 0.9.5",
+ "nohash",
+ "ordered-float 4.6.0",
+ "page_size",
+ "rand 0.8.5",
+ "rayon",
+ "roaring",
+ "tempfile",
+ "thiserror 2.0.12",
+ "tracing",
+]
+
[[package]]
name = "ascii_utils"
version = "0.9.3"
@@ -1336,6 +1358,15 @@ dependencies = [
"crossbeam-utils",
]
+[[package]]
+name = "crossbeam-queue"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115"
+dependencies = [
+ "crossbeam-utils",
+]
+
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
@@ -2031,6 +2062,15 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2"
+[[package]]
+name = "doxygen-rs"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "415b6ec780d34dcf624666747194393603d0373b7141eef01d12ee58881507d9"
+dependencies = [
+ "phf",
+]
+
[[package]]
name = "dyn-clone"
version = "1.0.19"
@@ -2078,6 +2118,26 @@ dependencies = [
"cfg-if",
]
+[[package]]
+name = "enum-iterator"
+version = "2.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c280b9e6b3ae19e152d8e31cf47f18389781e119d4013a2a2bb0180e5facc635"
+dependencies = [
+ "enum-iterator-derive",
+]
+
+[[package]]
+name = "enum-iterator-derive"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.101",
+]
+
[[package]]
name = "enum_dispatch"
version = "0.3.13"
@@ -2345,7 +2405,7 @@ dependencies = [
"libc",
"log",
"rustversion",
- "windows",
+ "windows 0.58.0",
]
[[package]]
@@ -2531,6 +2591,44 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+[[package]]
+name = "heed"
+version = "0.22.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a56c94661ddfb51aa9cdfbf102cfcc340aa69267f95ebccc4af08d7c530d393"
+dependencies = [
+ "bitflags 2.9.0",
+ "byteorder",
+ "heed-traits",
+ "heed-types",
+ "libc",
+ "lmdb-master-sys",
+ "once_cell",
+ "page_size",
+ "serde",
+ "synchronoise",
+ "url",
+]
+
+[[package]]
+name = "heed-traits"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff"
+
+[[package]]
+name = "heed-types"
+version = "0.21.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c255bdf46e07fb840d120a36dcc81f385140d7191c76a7391672675c01a55d"
+dependencies = [
+ "bincode",
+ "byteorder",
+ "heed-traits",
+ "serde",
+ "serde_json",
+]
+
[[package]]
name = "hermit-abi"
version = "0.3.9"
@@ -3145,6 +3243,17 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"
+[[package]]
+name = "lmdb-master-sys"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "864808e0b19fb6dd3b70ba94ee671b82fce17554cf80aeb0a155c65bb08027df"
+dependencies = [
+ "cc",
+ "doxygen-rs",
+ "libc",
+]
+
[[package]]
name = "lock_api"
version = "0.4.12"
@@ -3514,6 +3623,12 @@ dependencies = [
"libc",
]
+[[package]]
+name = "nohash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0f889fb66f7acdf83442c35775764b51fed3c606ab9cee51500dbde2cf528ca"
+
[[package]]
name = "nom"
version = "7.1.3"
@@ -3524,6 +3639,15 @@ dependencies = [
"minimal-lexical",
]
+[[package]]
+name = "ntapi"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4"
+dependencies = [
+ "winapi",
+]
+
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
@@ -3640,6 +3764,25 @@ dependencies = [
"rustc-hash 2.1.1",
]
+[[package]]
+name = "objc2-core-foundation"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166"
+dependencies = [
+ "bitflags 2.9.0",
+]
+
+[[package]]
+name = "objc2-io-kit"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a"
+dependencies = [
+ "libc",
+ "objc2-core-foundation",
+]
+
[[package]]
name = "object"
version = "0.36.7"
@@ -3833,6 +3976,16 @@ dependencies = [
"stable_deref_trait",
]
+[[package]]
+name = "page_size"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
[[package]]
name = "parking_lot"
version = "0.12.3"
@@ -4016,6 +4169,7 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078"
dependencies = [
+ "phf_macros",
"phf_shared",
]
@@ -4039,6 +4193,19 @@ dependencies = [
"rand 0.8.5",
]
+[[package]]
+name = "phf_macros"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216"
+dependencies = [
+ "phf_generator",
+ "phf_shared",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.101",
+]
+
[[package]]
name = "phf_shared"
version = "0.11.3"
@@ -4820,6 +4987,7 @@ dependencies = [
"arrow-ipc",
"arrow-json",
"arrow-schema",
+ "arroy",
"async-openai",
"async-trait",
"bigdecimal",
@@ -4838,6 +5006,7 @@ dependencies = [
"futures-util",
"glam",
"hashbrown 0.15.3",
+ "heed",
"indexmap 2.9.0",
"indoc",
"itertools 0.13.0",
@@ -4846,6 +5015,7 @@ dependencies = [
"memmap2 0.9.5",
"minijinja",
"minijinja-contrib",
+ "moka",
"neo4rs",
"num",
"num-bigint",
@@ -4884,6 +5054,7 @@ dependencies = [
"serde_json",
"streaming-stats",
"strsim",
+ "sysinfo",
"tantivy",
"tempfile",
"thiserror 2.0.12",
@@ -4937,6 +5108,7 @@ dependencies = [
"serde",
"sorted_vector_map",
"tempfile",
+ "tokio",
"tracing",
]
@@ -5903,6 +6075,15 @@ dependencies = [
"futures-core",
]
+[[package]]
+name = "synchronoise"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3dbc01390fc626ce8d1cffe3376ded2b72a11bb70e1c75f404a210e4daa4def2"
+dependencies = [
+ "crossbeam-queue",
+]
+
[[package]]
name = "synstructure"
version = "0.13.2"
@@ -5914,6 +6095,20 @@ dependencies = [
"syn 2.0.101",
]
+[[package]]
+name = "sysinfo"
+version = "0.35.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79251336d17c72d9762b8b54be4befe38d2db56fbbc0241396d70f173c39d47a"
+dependencies = [
+ "libc",
+ "memchr",
+ "ntapi",
+ "objc2-core-foundation",
+ "objc2-io-kit",
+ "windows 0.61.1",
+]
+
[[package]]
name = "tagptr"
version = "0.2.0"
@@ -6890,6 +7085,28 @@ dependencies = [
"windows-targets 0.52.6",
]
+[[package]]
+name = "windows"
+version = "0.61.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5ee8f3d025738cb02bad7868bbb5f8a6327501e870bf51f1b455b0a2454a419"
+dependencies = [
+ "windows-collections",
+ "windows-core 0.61.0",
+ "windows-future",
+ "windows-link",
+ "windows-numerics",
+]
+
+[[package]]
+name = "windows-collections"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8"
+dependencies = [
+ "windows-core 0.61.0",
+]
+
[[package]]
name = "windows-core"
version = "0.58.0"
@@ -6916,6 +7133,16 @@ dependencies = [
"windows-strings 0.4.0",
]
+[[package]]
+name = "windows-future"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a1d6bbefcb7b60acd19828e1bc965da6fcf18a7e39490c5f8be71e54a19ba32"
+dependencies = [
+ "windows-core 0.61.0",
+ "windows-link",
+]
+
[[package]]
name = "windows-implement"
version = "0.58.0"
@@ -6966,6 +7193,16 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38"
+[[package]]
+name = "windows-numerics"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1"
+dependencies = [
+ "windows-core 0.61.0",
+ "windows-link",
+]
+
[[package]]
name = "windows-registry"
version = "0.4.0"
diff --git a/Cargo.toml b/Cargo.toml
index 0138c07e9f..c249a716bb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -141,6 +141,9 @@ pest_derive = "2.7.8"
minijinja = "2.2.0"
minijinja-contrib = { version = "2.2.0", features = ["datetime"] }
datafusion = { version = "43.0.0" }
+arroy = "0.6.1"
+heed = "0.22.0"
+sysinfo = "0.35.1"
sqlparser = "0.51.0"
futures = "0.3"
arrow = { version = "=53.2.0" }
diff --git a/python/tests/test_base_install/test_graphql/misc/test_graphql_vectors.py b/python/tests/test_base_install/test_graphql/misc/test_graphql_vectors.py
index f346d1d1bc..57b2d4e1a5 100644
--- a/python/tests/test_base_install/test_graphql/misc/test_graphql_vectors.py
+++ b/python/tests/test_base_install/test_graphql/misc/test_graphql_vectors.py
@@ -14,28 +14,17 @@ def test_embedding():
def setup_graph(g):
- g.update_constant_properties({"name": "abb"})
g.add_node(1, "aab")
g.add_edge(1, "aab", "bbb")
def assert_correct_documents(client):
query = """{
- plugins {
- globalSearch(query: "aab", limit: 1) {
- entity {
- __typename
- ... on DocumentGraph {
- name
- }
- }
- content
- embedding
- }
- }
vectorisedGraph(path: "abb") {
- algorithms {
- similaritySearch(query:"ab", limit: 1) {
+ entitiesBySimilarity(query: "aab", limit: 1) {
+ getDocuments {
+ content
+ embedding
entity {
__typename
... on Node {
@@ -50,26 +39,15 @@ def assert_correct_documents(client):
}
}
}
- content
- embedding
}
}
}
}"""
result = client.query(query)
assert result == {
- "plugins": {
- "globalSearch": [
- {
- "entity": {"__typename": "DocumentGraph", "name": "abb"},
- "content": "abb",
- "embedding": [1.0, 2.0],
- },
- ],
- },
"vectorisedGraph": {
- "algorithms": {
- "similaritySearch": [
+ "entitiesBySimilarity": {
+ "getDocuments": [
{
"entity": {"__typename": "Node", "name": "aab"},
"content": "aab",
@@ -87,7 +65,6 @@ def setup_server(work_dir):
cache="/tmp/graph-cache",
embedding=embedding,
nodes="{{ name }}",
- graphs="{{ properties.name }}",
edges=False,
)
return server
@@ -130,7 +107,3 @@ def test_include_graph():
with server.start():
client = RaphtoryClient("http://localhost:1736")
assert_correct_documents(client)
-
-
-test_upload_graph()
-test_include_graph()
diff --git a/python/tests/test_base_install/test_vectors.py b/python/tests/test_base_install/test_vectors.py
index 2cafa316fd..9eb455eae4 100644
--- a/python/tests/test_base_install/test_vectors.py
+++ b/python/tests/test_base_install/test_vectors.py
@@ -27,11 +27,11 @@ def floats_are_equals(float1: float, float2: float) -> bool:
return float1 + 0.001 > float2 and float1 - 0.001 < float2
-# the graph generated by this function looks like this (entities labeled with (2d) have 2 documents):
+# the graph generated by this function looks like this:
#
-# edge1 (2d)
+# edge1
# ╭─────── node2
-# node1 (2d)
+# node1
# ╰─────── node3 ───── node4
# edge2 edge3
#
@@ -48,9 +48,7 @@ def create_graph() -> VectorisedGraph:
g.add_edge(3, "node1", "node3", {"name": "edge2"})
g.add_edge(4, "node3", "node4", {"name": "edge3"})
- vg = g.vectorise(
- embedding, nodes="{{ name }}", edges="{{ properties.name }}", graph=False
- )
+ vg = g.vectorise(embedding, nodes="{{ name }}", edges="{{ properties.name }}")
return vg
@@ -58,6 +56,16 @@ def create_graph() -> VectorisedGraph:
def test_selection():
vg = create_graph()
+ ################################
+ selection = vg.empty_selection()
+ nodes_to_select = ["node1", "node2"]
+ edges_to_select = [("node1", "node2"), ("node1", "node3")]
+ selection = vg.empty_selection()
+ selection.add_nodes(nodes_to_select)
+ selection.add_edges(edges_to_select)
+ nodes = selection.nodes()
+ ###########################
+
assert len(vg.empty_selection().get_documents()) == 0
assert len(vg.empty_selection().get_documents_with_scores()) == 0
@@ -105,21 +113,9 @@ def test_search():
assert edge_names_returned == [("node1", "node2")]
# TODO: same for edges ?
- (doc1, score1), (doc2, score2) = vg.documents_by_similarity(
- [1.0, 0.0, 0.0], 2
- ).get_documents_with_scores()
+ [(doc1, score1)] = vg.entities_by_similarity("node1", 1).get_documents_with_scores()
assert floats_are_equals(score1, 1.0)
assert (doc1.entity.name, doc1.content) == ("node1", "node1")
- assert (doc2.entity.src.name, doc2.entity.dst.name) == ("node1", "node2")
-
- [(doc1, score1)] = vg.entities_by_similarity(
- [1.0, 0.0, 0.0], 1
- ).get_documents_with_scores()
- assert floats_are_equals(score1, 1.0)
- assert (doc1.entity.name, doc1.content) == ("node1", "node1")
-
- docs = vg.documents_by_similarity([0.0, 0.0, 1.1], 3).get_documents()
- assert [doc.content for doc in docs] == ["node3", "edge3", "edge2"]
# chained search
node_selection = vg.nodes_by_similarity("node2", 1)
@@ -183,21 +179,12 @@ def test_windows():
contents = [doc.content for doc in selection.get_documents()]
assert contents == ["node1", "edge1", "node2"]
- selection.expand_documents_by_similarity("edge2", 100, (0, 4))
- contents = [doc.content for doc in selection.get_documents()]
- assert contents == ["node1", "edge1", "node2", "edge2", "node3"]
-
- # this should leave the selection unchanged
- selection.expand_documents_by_similarity("node1", 100, (20, 100))
- contents = [doc.content for doc in selection.get_documents()]
- assert contents == ["node1", "edge1", "node2", "edge2", "node3"]
-
- # this should also leave the selection unchanged
- selection.expand_entities_by_similarity("node1", 100, (20, 100))
+    # this leaves the selection unchanged, as only edge3 and node4 exist
+ selection.expand_entities_by_similarity("node1", 100, (4, 100))
contents = [doc.content for doc in selection.get_documents()]
- assert contents == ["node1", "edge1", "node2", "edge2", "node3"]
+ assert contents == ["node1", "edge1", "node2"]
- selection.expand(10, (4, 100))
+ selection.expand(10, (3, 100))
contents = [doc.content for doc in selection.get_documents()]
assert contents == ["node1", "edge1", "node2", "edge2", "node3", "edge3", "node4"]
@@ -219,7 +206,7 @@ def test_filtering_by_entity_type():
def constant_embedding(texts):
- return [[1, 0, 0] for text in texts]
+ return [[1.0, 0.0, 0.0] for text in texts]
def test_default_template():
@@ -239,223 +226,3 @@ def test_default_template():
edge_docs[0].content
== "There is an edge from node1 to node1 with events at:\n- Jan 1 1970 00:00\n"
)
-
-
-### MULTI-DOCUMENT VERSION TO BE RE-ENABLED
-
-# from raphtory import Graph
-# from raphtory.vectors import VectorisedGraph
-
-# embedding_map = {
-# "node1": [1.0, 0.0, 0.0],
-# "node2": [0.0, 1.0, 0.0],
-# "node3": [0.0, 0.0, 1.0],
-# "node4": [1.0, 1.0, 0.0],
-# "edge1": [1.0, 0.1, 0.0],
-# "edge2": [0.0, 1.0, 0.1],
-# "edge3": [0.0, 1.0, 1.0],
-# "node1-extra": [0.0, 1.0, 1.0],
-# "edge1-extra": [0.1, 1.0, 0.0],
-# }
-
-
-# def single_embedding(text: str):
-# try:
-# return embedding_map[text]
-# except:
-# raise Exception(f"unexpected document content: {text}")
-
-
-# def embedding(texts):
-# return [single_embedding(text) for text in texts]
-
-
-# def floats_are_equals(float1: float, float2: float) -> bool:
-# return float1 + 0.001 > float2 and float1 - 0.001 < float2
-
-
-# # the graph generated by this function looks like this (entities labeled with (2d) have 2 documents):
-# #
-# # edge1 (2d)
-# # ╭─────── node2
-# # node1 (2d)
-# # ╰─────── node3 ───── node4
-# # edge2 edge3
-# #
-# #
-# def create_graph() -> VectorisedGraph:
-# g = Graph()
-
-# g.add_node(1, "node1", {"doc": ["node1", "node1-extra"]}) # multi-document node
-# g.add_node(2, "node2", {"doc": ["node2"]})
-# g.add_node(3, "node3", {"doc": ["node3"]})
-# g.add_node(4, "node4", {"doc": ["node4"]})
-
-# g.add_edge(2, "node1", "node2", {"doc": ["edge1", "edge1-extra"]}) # multi-document edge
-# g.add_edge(3, "node1", "node3", {"doc": ["edge2"]})
-# g.add_edge(4, "node3", "node4", {"doc": ["edge3"]})
-
-# vg = g.vectorise(embedding, node_document="doc", edge_document="doc")
-
-# return vg
-
-
-# def test_selection():
-# vg = create_graph()
-
-# assert len(vg.empty_selection().get_documents()) == 0
-# assert len(vg.empty_selection().get_documents_with_scores()) == 0
-
-# nodes_to_select = ["node1", "node2"]
-# edges_to_select = [("node1", "node2"), ("node1", "node3")]
-
-# selection = vg.empty_selection()
-# selection.add_nodes(nodes_to_select)
-# nodes = selection.nodes()
-# node_names_returned = [node.name for node in nodes]
-# assert node_names_returned == nodes_to_select
-# docs = [doc.content for doc in selection.get_documents()]
-# assert docs == ["node1", "node1-extra", "node2"]
-
-# selection = vg.empty_selection()
-# selection.add_edges(edges_to_select)
-# edges = selection.edges()
-# edge_names_returned = [(edge.src.name, edge.dst.name) for edge in edges]
-# assert edge_names_returned == edges_to_select
-# docs = [doc.content for doc in selection.get_documents()]
-# assert docs == ["edge1", "edge1-extra", "edge2"]
-
-# edge_tuples = [(edge.src, edge.dst) for edge in edges]
-# selection = vg.empty_selection()
-# selection.add_nodes(nodes)
-# selection.add_edges(edge_tuples)
-# nodes_returned = selection.nodes()
-# assert nodes == nodes_returned
-# edges_returned = selection.edges()
-# assert edges == edges_returned
-
-
-# def test_search():
-# vg = create_graph()
-
-# assert len(vg.edges_by_similarity("edge1", 10).nodes()) == 0
-# assert len(vg.nodes_by_similarity("node1", 10).edges()) == 0
-
-# selection = vg.nodes_by_similarity([1.0, 0.0, 0.0], 1)
-# assert [node.name for node in selection.nodes()] == ["node1"]
-# assert [doc.content for doc in selection.get_documents()] == ["node1", "node1-extra"]
-
-# edges = vg.edges_by_similarity([1.0, 0.0, 0.0], 1).edges()
-# edge_names_returned = [(edge.src.name, edge.dst.name) for edge in edges]
-# assert edge_names_returned == [("node1", "node2")]
-# # TODO: same for edges ?
-
-# (doc1, score1), (doc2, score2) = vg.documents_by_similarity(
-# [1.0, 0.0, 0.0], 2
-# ).get_documents_with_scores()
-# assert floats_are_equals(score1, 1.0)
-# assert (doc1.entity.name, doc1.content) == ("node1", "node1")
-# assert (doc2.entity.src.name, doc2.entity.dst.name) == ("node1", "node2")
-
-# (doc1, score1), (doc2, score2) = vg.entities_by_similarity(
-# [1.0, 0.0, 0.0], 1
-# ).get_documents_with_scores()
-# assert floats_are_equals(score1, 1.0)
-# assert (doc1.entity.name, doc1.content) == ("node1", "node1")
-# assert (doc2.entity.name, doc2.content) == ("node1", "node1-extra")
-
-# docs = vg.documents_by_similarity([0.0, 0.0, 1.1], 3).get_documents()
-# assert [doc.content for doc in docs] == ["node3", "node1-extra", "edge3"]
-
-# # chained search
-# node_selection = vg.nodes_by_similarity("node2", 1);
-# edge_selection = vg.edges_by_similarity("node3", 1);
-# entity_selection = vg.entities_by_similarity("node1", 4);
-# docs = node_selection.join(edge_selection).join(entity_selection).get_documents()[:4]
-# # assert [doc.content for doc in docs] == ['node2', 'edge3', 'node1', 'edge1']
-# assert [doc.content for doc in docs] == ["node2", "edge3", "node1", "node1-extra"]
-# # the intention of this test was getting all the documents of for different entities,
-# # including at least node and one edge at the top.
-# # However, we don't have a way currently of taking the documents of the first N entities
-# # we could have a method selection.limit_entities()
-# # or we could also have a method entity.get_documents for the entities we return (not trivial)
-
-
-# def test_expansion():
-# vg = create_graph()
-
-# selection = vg.entities_by_similarity("node1", 1)
-# selection.expand(2)
-# assert len(selection.get_documents()) == 7
-# assert len(selection.nodes()) == 3
-# assert len(selection.edges()) == 2
-
-# selection = vg.entities_by_similarity("node1", 1)
-# selection.expand_entities_by_similarity("edge1", 1)
-# selection.expand_entities_by_similarity("node2", 1)
-# assert len(selection.get_documents()) == 5
-# nodes = selection.nodes()
-# node_names_returned = [node.name for node in nodes]
-# assert node_names_returned == ["node1", "node2"]
-# edges = selection.edges()
-# edge_names_returned = [(edge.src.name, edge.dst.name) for edge in edges]
-# assert edge_names_returned == [("node1", "node2")]
-
-# selection = vg.empty_selection()
-# selection.expand_entities_by_similarity("node3", 10)
-# assert len(selection.get_documents()) == 0
-
-# selection = vg.entities_by_similarity("node1", 1)
-# selection.expand_entities_by_similarity("node3", 10)
-# assert len(selection.get_documents()) == 9
-# assert len(selection.nodes()) == 4
-# assert len(selection.edges()) == 3
-# # TODO: add some expand_documents here
-
-
-# def test_windows():
-# vg = create_graph()
-
-# selection = vg.nodes_by_similarity("node1", 1, (4, 5))
-# assert [doc.content for doc in selection.get_documents()] == ["node4"]
-
-# selection = vg.nodes_by_similarity("node4", 1, (1, 2))
-# assert [doc.content for doc in selection.get_documents()] == ["node1", "node1-extra"]
-
-# selection.expand(10, (0, 3))
-# contents = [doc.content for doc in selection.get_documents()]
-# assert contents == ["node1", "node1-extra", "edge1", "edge1-extra", "node2"]
-
-# selection.expand_documents_by_similarity("edge2", 100, (0, 4))
-# contents = [doc.content for doc in selection.get_documents()]
-# assert contents == ["node1", "node1-extra", "edge1", "edge1-extra", "node2", "edge2", "node3"]
-
-# # this should leave the selection unchanged
-# selection.expand_documents_by_similarity("node1", 100, (20, 100))
-# contents = [doc.content for doc in selection.get_documents()]
-# assert contents == ["node1", "node1-extra", "edge1", "edge1-extra", "node2", "edge2", "node3"]
-
-# # this should also leave the selection unchanged
-# selection.expand_entities_by_similarity("node1", 100, (20, 100))
-# contents = [doc.content for doc in selection.get_documents()]
-# assert contents == ["node1", "node1-extra", "edge1", "edge1-extra", "node2", "edge2", "node3"]
-
-# selection.expand(10, (4, 100))
-# contents = [doc.content for doc in selection.get_documents()]
-# assert contents == ["node1", "node1-extra", "edge1", "edge1-extra", "node2", "edge2", "node3", "edge3", "node4"]
-
-
-# def test_filtering_by_entity_type():
-# vg = create_graph()
-
-# selection = vg.empty_selection()
-# selection.add_nodes(["node1"])
-# selection.expand_nodes_by_similarity("node2", 10)
-# contents = [doc.content for doc in selection.get_documents()]
-# assert contents == ["node1", "node1-extra", "node2", "node3", "node4"]
-
-# selection = vg.empty_selection()
-# selection.add_edges([("node1", "node2")])
-# selection.expand_edges_by_similarity("edge3", 10)
-# contents = [doc.content for doc in selection.get_documents()]
-# assert contents == ["edge1", "edge1-extra", "edge2", "edge3"]
diff --git a/raphtory-benchmark/Cargo.toml b/raphtory-benchmark/Cargo.toml
index 9e3589e2f8..9b21e3a42a 100644
--- a/raphtory-benchmark/Cargo.toml
+++ b/raphtory-benchmark/Cargo.toml
@@ -7,19 +7,28 @@ edition = "2021"
[dependencies]
criterion = { workspace = true }
-raphtory = { path = "../raphtory", features = ["io", "proto"], version = "0.15.1" }
+raphtory = { path = "../raphtory", features = [
+ "io",
+ "proto",
+ "vectors",
+], version = "0.15.1" }
raphtory-api = { path = "../raphtory-api", version = "0.15.1" }
sorted_vector_map = { workspace = true }
rand = { workspace = true }
rayon = { workspace = true }
tempfile = { workspace = true }
-tracing = {workspace = true}
+tracing = { workspace = true }
once_cell = { workspace = true }
-serde = { workspace = true }
-itertools = { workspace = true }
+serde = { workspace = true }
+itertools = { workspace = true }
fake = { workspace = true }
csv = { workspace = true }
chrono = { workspace = true }
+tokio = { workspace = true }
+
+[[bin]]
+name = "vectorise"
+path = "bin/vectorise.rs"
[[bench]]
name = "tgraph_benchmarks"
@@ -66,5 +75,9 @@ name = "search_bench"
harness = false
required-features = ["search"]
+[[bench]]
+name = "vectors"
+harness = false
+
[features]
search = ["raphtory/search"]
diff --git a/raphtory-benchmark/benches/vectors.rs b/raphtory-benchmark/benches/vectors.rs
new file mode 100644
index 0000000000..19d421bb3e
--- /dev/null
+++ b/raphtory-benchmark/benches/vectors.rs
@@ -0,0 +1,18 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+
+use raphtory_benchmark::common::vectors::{
+ create_graph_for_vector_bench, gen_embedding_for_bench, vectorise_graph_for_bench,
+};
+
+fn bench_search_entities(c: &mut Criterion) {
+ let g = create_graph_for_vector_bench(100_000);
+ let v = vectorise_graph_for_bench(g);
+
+ let query = gen_embedding_for_bench("0");
+ c.bench_function("semantic_search_entities", |b| {
+ b.iter(|| v.entities_by_similarity(&query, 10, None))
+ });
+}
+
+criterion_group!(vector_benches, bench_search_entities,);
+criterion_main!(vector_benches);
diff --git a/raphtory-benchmark/bin/vectorise.rs b/raphtory-benchmark/bin/vectorise.rs
new file mode 100644
index 0000000000..e69b37e2ed
--- /dev/null
+++ b/raphtory-benchmark/bin/vectorise.rs
@@ -0,0 +1,19 @@
+use std::time::SystemTime;
+
+use raphtory_benchmark::common::vectors::{
+ create_graph_for_vector_bench, vectorise_graph_for_bench,
+};
+
+fn print_time(start: SystemTime, message: &str) {
+ let duration = SystemTime::now().duration_since(start).unwrap().as_secs();
+ println!("{message} - took {duration}s");
+}
+
+fn main() {
+ for size in [1_000_000] {
+ let graph = create_graph_for_vector_bench(size);
+ let start = SystemTime::now();
+ vectorise_graph_for_bench(graph);
+ print_time(start, &format!(">>> vectorise {}k", size / 1000));
+ }
+}
diff --git a/raphtory-benchmark/src/common/mod.rs b/raphtory-benchmark/src/common/mod.rs
index caf4247db4..3c2a0f2e43 100644
--- a/raphtory-benchmark/src/common/mod.rs
+++ b/raphtory-benchmark/src/common/mod.rs
@@ -1,5 +1,7 @@
#![allow(dead_code)]
+pub mod vectors;
+
use criterion::{
black_box, measurement::WallTime, BatchSize, Bencher, BenchmarkGroup, BenchmarkId, Criterion,
};
diff --git a/raphtory-benchmark/src/common/vectors.rs b/raphtory-benchmark/src/common/vectors.rs
new file mode 100644
index 0000000000..94456443a8
--- /dev/null
+++ b/raphtory-benchmark/src/common/vectors.rs
@@ -0,0 +1,46 @@
+use std::hash::{DefaultHasher, Hash, Hasher};
+
+use rand::{rngs::StdRng, Rng, SeedableRng};
+use raphtory::{
+ prelude::{AdditionOps, Graph, NO_PROPS},
+ vectors::{
+ cache::VectorCache, embeddings::EmbeddingResult, template::DocumentTemplate,
+ vectorisable::Vectorisable, vectorised_graph::VectorisedGraph, Embedding,
+ },
+};
+use tokio::runtime::Runtime;
+
+pub fn gen_embedding_for_bench(text: &str) -> Embedding {
+ let mut hasher = DefaultHasher::new();
+ text.hash(&mut hasher);
+ let hash = hasher.finish();
+
+ let mut rng: StdRng = SeedableRng::seed_from_u64(hash);
+ (0..1024).map(|_| rng.gen()).collect()
+}
+
+async fn embedding_model(texts: Vec<String>) -> EmbeddingResult<Vec<Embedding>> {
+ Ok(texts
+ .iter()
+ .map(|text| gen_embedding_for_bench(text))
+ .collect())
+}
+
+pub fn create_graph_for_vector_bench(size: usize) -> Graph {
+ let graph = Graph::new();
+ for id in 0..size {
+ graph.add_node(0, id as u64, NO_PROPS, None).unwrap();
+ }
+ graph
+}
+
+pub fn vectorise_graph_for_bench(graph: Graph) -> VectorisedGraph<Graph> {
+ let cache = VectorCache::in_memory(embedding_model);
+ let template = DocumentTemplate {
+ node_template: Some("{{name}}".to_owned()),
+ edge_template: None,
+ };
+ let rt = Runtime::new().unwrap();
+ rt.block_on(graph.vectorise(cache, template, None, true))
+ .unwrap()
+}
diff --git a/raphtory-graphql/src/data.rs b/raphtory-graphql/src/data.rs
index 9bc5eccfbd..9bc0c3c136 100644
--- a/raphtory-graphql/src/data.rs
+++ b/raphtory-graphql/src/data.rs
@@ -1,18 +1,17 @@
use crate::{
config::app_config::AppConfig,
graph::GraphWithVectors,
- model::plugins::query_plugin::QueryPlugin,
paths::{valid_path, ExistingGraphFolder, ValidGraphFolder},
};
use itertools::Itertools;
use moka::sync::Cache;
use raphtory::{
- core::utils::errors::{GraphError, GraphResult, InvalidPathReason},
+ core::utils::errors::{GraphError, InvalidPathReason},
db::api::view::MaterializedGraph,
+ prelude::CacheOps,
vectors::{
- embedding_cache::EmbeddingCache, embeddings::openai_embedding, template::DocumentTemplate,
- vectorisable::Vectorisable, vectorised_graph::VectorisedGraph, Embedding,
- EmbeddingFunction,
+ cache::VectorCache, embeddings::openai_embedding, template::DocumentTemplate,
+ vectorisable::Vectorisable, vectorised_graph::VectorisedGraph,
},
};
use std::{
@@ -27,8 +26,7 @@ use walkdir::WalkDir;
#[derive(Clone)]
pub struct EmbeddingConf {
-    pub(crate) function: Arc<dyn EmbeddingFunction>,
- pub(crate) cache: Arc