
Commit d6083c3

Merge pull request #210 from amosproj/develop
Develop

2 parents 9b35018 + 3247592, commit d6083c3

35 files changed: +1034 additions, -1892 deletions

Project/backend/codebase/graph_analysis/graph_analysis.py

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ def analyze_graph_structure(G):
     - Here, node 0, 1 (1.0) has the highest closeness centrality because it is connected to all other nodes (node 2, 3 = 0.75)
     - Closeness Centrality show the average distance of a node to all other nodes in the network
     """
-    n = 20  # Number of top nodes to return
+    n = 20 if num_nodes > 20 else 5  # Number of top nodes to return
     # Calculate centrality measures
     degree_centrality = get_top_n_central_nodes(nx.degree_centrality(G), n)
     betweenness_centrality = get_top_n_central_nodes(nx.betweenness_centrality(G), n)
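For context, the change above caps the number of reported central nodes at 20 for larger graphs and falls back to 5 otherwise. A minimal sketch of that selection logic, assuming num_nodes comes from G.number_of_nodes() and that get_top_n_central_nodes simply sorts a centrality dict by score (its real implementation is not shown in this diff):

import networkx as nx

def get_top_n_central_nodes(centrality: dict, n: int):
    # Hypothetical helper: rank nodes by centrality score and keep the n highest.
    ranked = sorted(centrality.items(), key=lambda item: item[1], reverse=True)
    return ranked[:n]

G = nx.karate_club_graph()            # example graph, 34 nodes
num_nodes = G.number_of_nodes()       # assumed source of num_nodes
n = 20 if num_nodes > 20 else 5       # selection logic introduced by this commit
top_degree = get_top_n_central_nodes(nx.degree_centrality(G), n)
top_betweenness = get_top_n_central_nodes(nx.betweenness_centrality(G), n)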

Project/backend/codebase/graph_creator/embedding_handler.py

Lines changed: 198 additions & 281 deletions
Large diffs are not rendered by default.
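Since the refactored embedding_handler.py itself is not rendered, the sketch below reconstructs only the surface that the rest of this commit relies on. The constructor arguments and method names are taken from the call sites in graph_creator_main.py and router.py further down; the bodies are placeholders, not the actual implementation.

from graph_creator.models.graph_job import GraphJob

class embeddings_handler:
    def __init__(self, graph_job: GraphJob, lazyLoad: bool = False):
        # router.py passes lazyLoad=True when deleting a graph job
        self.graph_job = graph_job
        self.lazy_load = lazyLoad

    def generate_embeddings_and_merge_duplicates(self, combined):
        # called from create_and_store_graph in graph_creator_main.py
        ...

    def is_embedded(self) -> bool:
        # checked by the /graph_search endpoint before searching
        ...

    def search_graph(self, query: str, k: int = 4):
        # called by the /graph_search endpoint
        ...

    def delete_embeddings(self):
        # called when a graph job is deleted
        ...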

Project/backend/codebase/graph_creator/graph_creator_main.py

Lines changed: 8 additions & 1 deletion
@@ -1,6 +1,7 @@
 import logging
-
+from graph_creator.embedding_handler import embeddings_handler
 from graph_creator import graph_handler
+import os
 from graph_creator.services.llm.llama_gemini_combination import llama_gemini_combination
 from graph_creator.models.graph_job import GraphJob
 from graph_creator.services import netx_graphdb
@@ -88,6 +89,12 @@ def create_and_store_graph(uuid, entities_and_relations, chunks, llm_handler):
         chunks[i] = chunks[i].dict()
     combined = graph_handler.connect_with_llm(df_e_and_r, chunks, llm_handler)

+    # Create an instance of the embeddings handler
+    embeddings_handler_instance = embeddings_handler(GraphJob(id=uuid))
+
+    # Generate embeddings and merge duplicates
+    combined = embeddings_handler_instance.generate_embeddings_and_merge_duplicates(combined)
+
     # get graph db service
     graph_db_service = netx_graphdb.NetXGraphDB()

Project/backend/codebase/graph_creator/graph_handler.py

Lines changed: 16 additions & 11 deletions
@@ -134,7 +134,8 @@ def index_entity_relation_table(entity_and_relation_df, entities):
         entities_dict[entities[i]] = i

     relations = []
-    for i, row in entity_and_relation_df.iterrows():
+    entity_and_relation_df_withoutna = entity_and_relation_df.dropna()
+    for i, row in entity_and_relation_df_withoutna.iterrows():
         relations.append([entities_dict[row["node_1"]], entities_dict[row["node_2"]]])

     return entities_dict, relations
@@ -213,7 +214,8 @@ def get_entities_by_chunk(entity_and_relation_df, entities_dict):
        A dictionary containing all entities per chunk as ids
     """
     entities_by_chunk = {}
-    for i, row in entity_and_relation_df.iterrows():
+    entity_and_relation_df_withoutna = entity_and_relation_df.dropna()
+    for i, row in entity_and_relation_df_withoutna.iterrows():
         if row["chunk_id"] in entities_by_chunk:
             entities_by_chunk[row["chunk_id"]].append(entities_dict[row["node_1"]])
             entities_by_chunk[row["chunk_id"]].append(entities_dict[row["node_2"]])
@@ -333,15 +335,18 @@ def add_relations_to_data(entity_and_relation_df, new_relations):

     """
     for relation in new_relations:
-        node_1 = relation["node_1"]
-        node_2 = relation["node_2"]
-        edge = relation["edge"]
-        chunk_id = relation["chunk_id"]
-
-        pos = len(entity_and_relation_df.index)
-        entity_and_relation_df.loc[pos] = [node_1, node_2, edge, chunk_id]
-
-    return entity_and_relation_df
+        try:
+            node_1 = relation["node_1"]
+            node_2 = relation["node_2"]
+            edge = relation["edge"]
+            chunk_id = relation["chunk_id"]
+
+            pos = len(entity_and_relation_df.index)
+            entity_and_relation_df.loc[pos] = [node_1, node_2, edge, chunk_id]
+        except ValueError:
+            print(f"Error in add_relations_to_data: ,", node_1, node_2, edge, chunk_id)
+            pass
+    return entity_and_relation_df.dropna()


 def add_topic(data: pd.DataFrame, max_topics: int = 25) -> pd.DataFrame:
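The dropna() calls introduced above guard against relation rows the LLM left incomplete. A small self-contained illustration of the failure mode they prevent; the column names come from this diff, the data is made up:

import pandas as pd

entity_and_relation_df = pd.DataFrame(
    {
        "node_1": ["Alice", None, "Bob"],
        "node_2": ["Bob", "Carol", None],
        "edge": ["knows", "knows", None],
        "chunk_id": ["c1", "c2", "c3"],
    }
)

entities_dict = {"Alice": 0, "Bob": 1, "Carol": 2}

# Without dropna(), the None entries reach the dict lookup and raise KeyError.
clean_df = entity_and_relation_df.dropna()
relations = [
    [entities_dict[row["node_1"]], entities_dict[row["node_2"]]]
    for _, row in clean_df.iterrows()
]
print(relations)  # [[0, 1]] - only the fully populated row survives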

Project/backend/codebase/graph_creator/router.py

Lines changed: 57 additions & 0 deletions
@@ -1,3 +1,4 @@
+import json
 import logging
 import os
 import uuid
@@ -7,6 +8,8 @@
 from fastapi import UploadFile, File, HTTPException
 from starlette.responses import JSONResponse

+from graph_creator.embedding_handler import embeddings_handler
+from graph_creator.schemas.graph_query import QueryRequest
 import graph_creator.graph_creator_main as graph_creator_main
 from graph_creator.dao.graph_job_dao import GraphJobDAO
 from graph_creator.schemas.graph_job import GraphJobCreate
@@ -193,6 +196,9 @@ async def delete_graph_job(
     graph_job_id = graph_job.id
     await graph_job_dao.delete_graph_job(graph_job)
     netx_services.delete_graph(graph_job_id)
+    graphEmbeddingsHandler = embeddings_handler(graph_job, lazyLoad=True)
+    graphEmbeddingsHandler.delete_embeddings()
+


 @router.post("/create_graph/{graph_job_id}")
@@ -298,3 +304,54 @@ async def query_graph(
     graph = netx_services.load_graph(graph_job_id=graph_job_id)
     graph_keywords = analyze_graph_structure(graph)
     return graph_keywords
+
+
+@router.post("/graph_search/{graph_job_id}")
+async def query_graph(
+    graph_job_id: uuid.UUID,
+    request: QueryRequest,
+    graph_job_dao: GraphJobDAO = Depends(),
+):
+    """
+    Reads a graph job by id and tries to answer a query about the graph using embeddings
+
+    Args:
+        graph_job_id (uuid.UUID): ID of the graph job to be read.
+        request (QueryRequest): contains user query
+        graph_job_dao (GraphJobDAO): graph job database access object
+
+    Returns:
+        Answer to question from the user regarding the graph
+
+    Raises:
+        HTTPException: If there is no graph job with the given ID.
+    """
+
+    g_job = await graph_job_dao.get_graph_job_by_id(graph_job_id)
+
+    if not g_job:
+        raise HTTPException(status_code=404, detail="Graph job not found")
+    if g_job.status != GraphStatus.GRAPH_READY:
+        raise HTTPException(
+            status_code=400,
+            detail="No graph created for this job!",
+        )
+
+    user_query = request.query
+    # print(f"Received query: {user_query}")
+
+    graphEmbeddingsHandler = embeddings_handler(g_job)
+
+    if graphEmbeddingsHandler.is_embedded():
+        # do search
+        result = graphEmbeddingsHandler.search_graph(user_query, k=4)
+        # print(result)
+        answer = json.dumps(result)
+    else:
+        # can't answer because no embeddings exist
+        answer = 'No embeddings found'
+
+    return JSONResponse(
+        content={"answer": answer},
+        status_code=200,
+    )
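One possible way to exercise the new endpoint from a client, assuming the API is served locally and the route is mounted directly under the application root (the base URL, any router prefix, and the job id below are placeholders, not values taken from this commit):

import requests

BASE_URL = "http://localhost:8000"  # placeholder deployment URL
graph_job_id = "00000000-0000-0000-0000-000000000000"  # placeholder job id

response = requests.post(
    f"{BASE_URL}/graph_search/{graph_job_id}",
    json={"query": "Which entities are related to graph embeddings?"},
)
print(response.status_code)
print(response.json()["answer"])  # JSON-encoded search results, or 'No embeddings found'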
Project/backend/codebase/graph_creator/schemas/graph_query.py (new file, imported in router.py above as graph_creator.schemas.graph_query)

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+from pydantic import BaseModel
+
+class QueryRequest(BaseModel):
+    query: str
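This schema is what lets FastAPI validate the /graph_search request body. A quick sketch of the validation behaviour it gives for free:

from pydantic import BaseModel, ValidationError

class QueryRequest(BaseModel):
    query: str

print(QueryRequest(query="who is connected to node A?").query)

try:
    QueryRequest()  # missing 'query'; FastAPI would turn this into a 422 response
except ValidationError as err:
    print(err)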

Project/backend/codebase/graph_creator/services/file_handler.py

Lines changed: 39 additions & 13 deletions
@@ -2,10 +2,11 @@
 import os
 from pathlib import Path

-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
 from langchain_community.document_loaders import TextLoader
 from langchain_community.document_loaders import Docx2txtLoader
 from langchain_community.document_loaders import UnstructuredPowerPointLoader
+from langchain_core.documents import Document

 from langchain_text_splitters import (
     RecursiveCharacterTextSplitter,
@@ -18,43 +19,68 @@ class FileHandler:
     def __init__(self, file_location: str):
         self.file_location = file_location
         self.file_loader = {
-            ".pdf": PyPDFLoader,
-            ".txt": TextLoader,
-            ".docx": Docx2txtLoader,
-            ".pptx": UnstructuredPowerPointLoader,
-            ".json": RecursiveJsonSplitter,
+            ".pdf": (PyPDFLoader, {}),
+            ".txt": (TextLoader, {}),
+            ".docx": (Docx2txtLoader, {}),
+            ".pptx": (
+                UnstructuredPowerPointLoader,
+                {"mode": "elements", "strategy": "fast", "join_docs_by_page": True}
+            ),
+            ".json": (RecursiveJsonSplitter, {}),
         }

         if not os.path.isfile(self.file_location):
             raise ValueError("Invalid file path.")

     def process_file_into_chunks(self):
-        file_loader = self._get_file_loader()
+        file_loader, kwargs = self._get_file_loader()
         if file_loader == RecursiveJsonSplitter:
             return self._get_json_chunks()
-        loader = file_loader(self.file_location)
+        join_docs_by_page = kwargs.pop("join_docs_by_page", False)
+        loader = file_loader(self.file_location, **kwargs)
         docs = loader.load()
-        splits = self._process_doc_to_chunks(docs)
+        splits = self._process_doc_to_chunks(docs, join_docs_by_page=join_docs_by_page)
         return splits

     @staticmethod
-    def _process_doc_to_chunks(docs):
+    def _process_doc_to_chunks(docs, join_docs_by_page: bool):
         if not docs:
             raise ValueError("Failed to load documents.")

+        if join_docs_by_page:
+            new_docs = []
+            current_doc = Document(page_content="")
+            current_page = None
+            new_docs.append(current_doc)
+            for doc in docs:
+                if doc.page_content == "":
+                    continue
+                doc_current_page = doc.metadata.get("page_number", None)
+                # if doc_current_page is None
+                if current_page != doc_current_page and doc.metadata.get("category", None) not in ["PageBreak", None]:
+                    current_doc = Document(
+                        page_content=doc.page_content,
+                        metadata={"page": doc_current_page - 1 if doc_current_page else "No page"}
+                    )
+                    current_page = doc_current_page
+                    new_docs.append(current_doc)
+                else:
+                    current_doc.page_content += f"\n {doc.page_content}"
+        else:
+            new_docs = docs
         # splits text into chunks including metadata for mapping from chunk to pdf page (splits[0].metadata['page'])
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=os.getenv("CHUNK_SIZE", 1500), chunk_overlap=150
         )
-        splits = text_splitter.split_documents(docs)
+        splits = text_splitter.split_documents(new_docs)
         return splits

     def _get_file_loader(self):
         _, extension = os.path.splitext(self.file_location)
-        loader = self.file_loader.get(extension)
+        loader, kwargs = self.file_loader.get(extension)
         if loader is None:
             raise ValueError("File format does not have a loader!")
-        return loader
+        return loader, kwargs

     def _get_json_chunks(self):
         json_data = json.loads(Path(self.file_location).read_text())
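The heart of the .pptx change is that UnstructuredPowerPointLoader in "elements" mode yields one Document per slide element, so the new join_docs_by_page branch re-joins elements into one Document per page before chunking. A reduced sketch of that joining step with hand-made Documents; the metadata keys ("page_number", "category") match the diff, the example contents are invented:

from langchain_core.documents import Document

# Hand-made element-level docs as an elements-mode loader might return them.
docs = [
    Document(page_content="Title slide", metadata={"page_number": 1, "category": "Title"}),
    Document(page_content="First bullet", metadata={"page_number": 1, "category": "NarrativeText"}),
    Document(page_content="Second slide text", metadata={"page_number": 2, "category": "NarrativeText"}),
]

# Simplified version of the join_docs_by_page branch shown above.
new_docs = []
current_doc = Document(page_content="")
current_page = None
new_docs.append(current_doc)
for doc in docs:
    if doc.page_content == "":
        continue
    page = doc.metadata.get("page_number")
    if current_page != page and doc.metadata.get("category") not in ["PageBreak", None]:
        # start a new per-page Document, storing a zero-based page index
        current_doc = Document(page_content=doc.page_content, metadata={"page": page - 1})
        current_page = page
        new_docs.append(current_doc)
    else:
        # same page: append the element text to the current Document
        current_doc.page_content += f"\n {doc.page_content}"

for d in new_docs:
    print(d.metadata, repr(d.page_content))
# One merged Document per slide, plus the initial empty placeholder kept by the original code.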

Project/frontend/src/components/App/index.css

Lines changed: 0 additions & 19 deletions
@@ -58,22 +58,3 @@ img {
   justify-content: center;
   gap: 10px;
 }
-
-.main_wrapper {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: 20px;
-  margin: 20px;
-  min-width: 100%;
-  min-height: 100%;
-}
-
-.Appcontainer {
-  display: flex;
-  flex-direction: column;
-  align-items: center;
-  gap: 20px;
-  min-width: 100%;
-  min-height: 100%;
-}

Project/frontend/src/components/App/index.tsx

Lines changed: 1 addition & 4 deletions
@@ -5,14 +5,11 @@ import {
   Routes,
 } from 'react-router-dom';
 import {
-  AppBar,
   createTheme,
   CssBaseline,
-  Divider,
   Paper,
   Stack,
   ThemeProvider,
-  Toolbar,
   Typography,
 } from '@mui/material';

@@ -37,7 +34,7 @@ function App() {
       <Stack direction="column" flex={1}>
         <Paper
           variant="elevation"
-          elevation={0.7}
+          elevation={1}
           component={Stack}
           display={'flex'}
           flexDirection={'row'}
