forked from marc-shade/Ollama-Workbench
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmanage_corpus.py
More file actions
93 lines (82 loc) · 4.06 KB
/
manage_corpus.py
File metadata and controls
93 lines (82 loc) · 4.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# manage_corpus.py
import streamlit as st
import os
import shutil
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
def manage_corpus():
st.header("🗂 Manage Corpus")
# Corpus folder
corpus_folder = "corpus"
if not os.path.exists(corpus_folder):
os.makedirs(corpus_folder)
# List existing corpus
corpus_list = [f for f in os.listdir(corpus_folder) if os.path.isdir(os.path.join(corpus_folder, f))]
st.subheader("⚫ Existing Corpus")
if corpus_list:
for corpus in corpus_list:
col1, col2, col3 = st.columns([2, 1, 1]) # Add a column for renaming
with col1:
st.write(corpus)
with col2:
if st.button("✏️", key=f"rename_corpus_{corpus}"):
st.session_state.rename_corpus = corpus
st.rerun()
with col3:
if st.button("🗑️", key=f"delete_corpus_{corpus}"):
shutil.rmtree(os.path.join(corpus_folder, corpus))
st.success(f"Corpus '{corpus}' deleted.")
st.rerun()
else:
st.write("No existing corpus found.")
# Handle renaming corpus
if "rename_corpus" in st.session_state and st.session_state.rename_corpus:
corpus_to_rename = st.session_state.rename_corpus
new_corpus_name = st.text_input(f"Rename corpus '{corpus_to_rename}' to:", value=corpus_to_rename, key=f"rename_corpus_input_{corpus_to_rename}")
if st.button("Confirm Rename", key=f"confirm_rename_{corpus_to_rename}"):
if new_corpus_name:
os.rename(os.path.join(corpus_folder, corpus_to_rename), os.path.join(corpus_folder, new_corpus_name))
st.success(f"Corpus renamed to '{new_corpus_name}'")
st.session_state.rename_corpus = None
st.rerun()
else:
st.error("Please enter a new corpus name.")
st.subheader("➕ Create New Corpus")
# Create corpus from files
st.write("**From Files:**")
files_folder = "files"
allowed_extensions = ['.json', '.txt']
files = [f for f in os.listdir(files_folder) if os.path.isfile(os.path.join(files_folder, f)) and os.path.splitext(f)[1].lower() in allowed_extensions]
selected_files = st.multiselect("Select files to create corpus:", files, key="create_corpus_files")
corpus_name = st.text_input("Enter a name for the corpus:", key="create_corpus_name")
if st.button("Create Corpus from Files", key="create_corpus_button"):
if selected_files and corpus_name:
create_corpus_from_files(corpus_folder, corpus_name, files_folder, selected_files)
st.success(f"Corpus '{corpus_name}' created from selected files.")
st.rerun()
else:
st.error("Please select files and enter a corpus name.")
def create_corpus_from_files(corpus_folder, corpus_name, files_folder, selected_files):
corpus_path = os.path.join(corpus_folder, corpus_name)
os.makedirs(corpus_path, exist_ok=True)
# Combine all selected file content into one text
all_text = ""
for file in selected_files:
file_path = os.path.join(files_folder, file)
with open(file_path, "r", encoding='utf-8') as f:
file_content = f.read()
all_text += file_content + "\n\n"
create_corpus_from_text(corpus_folder, corpus_name, all_text)
def create_corpus_from_text(corpus_folder, corpus_name, corpus_text):
corpus_path = os.path.join(corpus_folder, corpus_name)
os.makedirs(corpus_path, exist_ok=True)
# Create Langchain documents
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(corpus_text)
docs = [Document(page_content=t) for t in texts]
# Create and load the vector database
embeddings = OllamaEmbeddings()
db = Chroma.from_documents(docs, embeddings, persist_directory=corpus_path)
db.persist()