Skip to content

Commit 74968ff

Browse files
author
d4rkc0de
committed
Refactor hash method to eliminate redundant file loading
1 parent 03db010 commit 74968ff

File tree

2 files changed

+17
-6
lines changed

2 files changed

+17
-6
lines changed

backend/app/run.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import os
66
import logging
77
from pathlib import Path
8+
import hashlib
89

910
from .database import SQLiteDB
1011
from .settings import CustomFormatter
@@ -23,11 +24,12 @@
2324

2425
async def summarize_document(doc: Document):
    """Summarize a text document, reusing a cached summary when one exists.

    The file's content hash serves as the cache key: when the
    (path, hash) pair is already recorded in the database the stored
    summary is returned, otherwise the model API is invoked and the
    fresh summary is persisted for next time.
    """
    logger.info(f"Processing file {doc.metadata['file_path']}")
    path = doc.metadata['file_path']
    doc_hash = get_file_hash(path)
    if not db.is_file_exist(path, doc_hash):
        # Cache miss: ask the model, then persist the result.
        summary = await model.summarize_document_api(doc.text)
        db.insert_file_summary(path, doc_hash, summary)
    else:
        summary = db.get_file_summary(path)
    return {"file_path": path, "summary": summary}
@@ -36,11 +38,12 @@ async def summarize_document(doc: Document):
3638

3739
async def summarize_image_document(doc: ImageDocument):
    """Summarize an image document, reusing a cached summary when one exists.

    Mirrors summarize_document: the image file's content hash keys the
    cache; on a miss the image-summarization API is called and the
    result stored.
    """
    logger.info(f"Processing image {doc.image_path}")
    path = doc.image_path
    image_hash = get_file_hash(path)
    if not db.is_file_exist(path, image_hash):
        # Cache miss: summarize the image via the model and persist it.
        summary = await model.summarize_image_api(image_path=path)
        db.insert_file_summary(path, image_hash, summary)
    else:
        summary = db.get_file_summary(path)
    return {"file_path": path, "summary": summary}
@@ -129,11 +132,19 @@ def update_file(root_path, item):
129132
os.makedirs(dst_dir)
130133
if os.path.isfile(src_file):
131134
shutil.move(src_file, dst_file)
132-
new_hash = SimpleDirectoryReader(input_files=[dst_file]).load_data()[0].hash
135+
new_hash = get_file_hash(dst_file)
133136
db.update_file(src_file, dst_file, new_hash)
134137

135138

136139
async def search_files(root_path: str, recursive: bool, required_exts: list, search_query: str):
137140
summaries = await get_dir_summaries(root_path, recursive, required_exts)
138141
files = await model.search_files_api(summaries, search_query)
139142
return files
143+
144+
145+
def get_file_hash(file_path, algorithm='sha256'):
    """Return the hex digest of a file's contents.

    Reads the file in 8 KiB chunks so arbitrarily large files can be
    hashed without loading them fully into memory.

    Args:
        file_path: Path of the file to hash.
        algorithm: Any digest name accepted by ``hashlib.new``
            (default ``'sha256'``, matching the previous hard-coded value).

    Returns:
        The hexadecimal digest string of the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If ``algorithm`` is not recognized by hashlib.
    """
    hash_func = hashlib.new(algorithm)
    with open(file_path, 'rb') as f:
        # read() returns b'' at EOF, which ends the walrus loop.
        while chunk := f.read(8192):
            hash_func.update(chunk)
    return hash_func.hexdigest()

backend/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ openai
44
pydantic-settings
55
llama-index
66
# needed by llama index
7-
git+https://github.com/openai/whisper.git
7+
git+https://github.com/openai/whisper.git # heavy dependency; remove if you do not need to process media (audio) files
88
pydub
99
docx2txt
1010
nbconvert

0 commit comments

Comments
 (0)