RAG-based-Question-Answering-System-/app.py at main · satzgits/RAG-based-Question-Answering-System- · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import streamlit as st
import sys
import os
from pathlib import Path

# Add the directory containing this file to sys.path
sys.path.append(str(Path(__file__).parent))

from rag_pipeline import query_rag
from ingest import create_vector_db

# Page configuration: browser-tab title, page icon, and wide layout.
page_settings = {
    "page_title": "RAG Question Answering System",
    "page_icon": "🤖",
    "layout": "wide",
}
st.set_page_config(**page_settings)

# Custom CSS: rules for the .main-header, .sub-header and .source-box classes.
# The stylesheet is kept in a named variable and injected as raw HTML.
custom_css = """
<style>
    .main-header {
        font-size: 2.5rem;
        font-weight: 700;
        margin-bottom: 1rem;
    }
    .sub-header {
        font-size: 1.2rem;
        color: #666;
        margin-bottom: 2rem;
    }
    .source-box {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        margin-top: 1rem;
    }
</style>
"""
st.markdown(custom_css, unsafe_allow_html=True)

# Header: the title banner followed by the tagline, each emitted as raw HTML
# so the CSS classes above apply to them.
header_html = (
    '<div class="main-header">RAG Question Answering System</div>',
    '<div class="sub-header">Ask questions about your documents using AI</div>',
)
for html_snippet in header_html:
    st.markdown(html_snippet, unsafe_allow_html=True)

# Sidebar: document upload, the ingest trigger, and a read-only settings summary.
with st.sidebar:
    st.header("📚 Document Management")

    # File upload
    uploaded_files = st.file_uploader(
        "Upload documents (TXT or PDF)",
        type=['txt', 'pdf'],
        accept_multiple_files=True
    )

    if uploaded_files:
        data_dir = Path("data")
        data_dir.mkdir(exist_ok=True)

        for uploaded_file in uploaded_files:
            # The file name is supplied by the client. Keep only its final
            # path component so a name containing directory parts (for
            # example "../app.py") cannot be written outside data_dir.
            safe_name = Path(uploaded_file.name).name
            file_path = data_dir / safe_name
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

        st.success(f"Uploaded {len(uploaded_files)} file(s)")

    # Ingest button
    if st.button("🔄 Process Documents", type="primary"):
        with st.spinner("Processing documents..."):
            success = create_vector_db()
        if success:
            # st.rerun() stops this run immediately, so a message emitted
            # right before it would not stay on screen. Record the outcome
            # in session state and display it on the rerun instead.
            st.session_state["ingest_succeeded"] = True
            st.rerun()
        else:
            st.error("Error processing documents")
    elif st.session_state.get("ingest_succeeded", False):
        # One-shot confirmation left by the previous (rerun-terminated) run.
        st.session_state["ingest_succeeded"] = False
        st.success("Documents processed successfully!")

    st.divider()

    # Settings
    # NOTE(review): these values are hard-coded display text; confirm they
    # match what the ingest and RAG pipeline modules actually use.
    st.header("⚙️ Settings")
    st.caption("Current configuration:")
    st.caption("• Chunk size: 500")
    st.caption("• Chunk overlap: 50")
    st.caption("• Top-K retrieval: 3")
    st.caption("• Model: Llama 3.1-8B")

# Main area: the question/answer tab. The "About" tab created here is filled
# in further down the script.


def _use_example_question(example_text: str) -> None:
    """Copy *example_text* into the question input box.

    Used as a button ``on_click`` callback. Streamlit runs callbacks before
    the script is re-executed, i.e. before the text input keyed
    ``question_input`` is instantiated for that run. Assigning to that key
    from the script body after the widget exists raises
    ``StreamlitAPIException``.
    """
    st.session_state.question_input = example_text


tab1, tab2 = st.tabs(["💬 Ask Questions", "📊 About"])

with tab1:
    # Check if vector store exists
    if not Path("faiss_index").exists():
        st.warning("⚠️ No documents have been processed yet. Please upload and process documents first.")
    else:
        # Question input
        question = st.text_input(
            "Enter your question:",
            placeholder="What is quantum computing?",
            key="question_input"
        )

        # Narrow first column holds the button; the wide second one is a spacer.
        col1, _ = st.columns([1, 5])
        with col1:
            ask_button = st.button("🔍 Ask", type="primary")

        if ask_button and question:
            with st.spinner("Searching documents and generating answer..."):
                answer, sources = query_rag(question)

                # Display answer
                st.markdown("### Answer:")
                st.markdown(answer)

                # Display sources
                # NOTE(review): each source is assumed to expose `.metadata`
                # (a mapping) and `.page_content` (a string) -- confirm
                # against what query_rag returns.
                if sources:
                    st.markdown("### Sources:")
                    for i, doc in enumerate(sources, 1):
                        source_file = doc.metadata.get('source', 'Unknown')
                        source_name = Path(source_file).name
                        with st.expander(f"📄 Source {i}: {source_name}"):
                            content = doc.page_content
                            # Show at most the first 500 characters, marking
                            # truncated text with an ellipsis.
                            if len(content) > 500:
                                preview = content[:500] + "..."
                            else:
                                preview = content
                            st.text(preview)

        # Example questions
        st.markdown("---")
        st.markdown("#### Example Questions:")
        example_questions = [
            "What is quantum computing?",
            "How does blockchain work?",
            "What are the main causes of climate change?",
            "What is artificial intelligence?"
        ]

        # Lay the examples out in two columns. Clicking one fills the question
        # box through the callback; the click itself triggers the rerun, so no
        # explicit st.rerun() is needed.
        cols = st.columns(2)
        for i, eq in enumerate(example_questions):
            with cols[i % 2]:
                st.button(
                    eq,
                    key=f"example_{i}",
                    on_click=_use_example_question,
                    args=(eq,),
                )

with tab2:
    # Static "About" page: what the system is, the three-step workflow,
    # the technology stack, and the feature list.
    about_markdown = """
    ## About This System

    This is a **Retrieval-Augmented Generation (RAG)** system that answers questions based on your documents.

    ### How It Works:

    1. **Upload Documents**: Add your TXT or PDF files
    2. **Process**: The system splits documents into chunks and creates vector embeddings
    3. **Ask Questions**: The system finds relevant chunks and generates accurate answers

    ### Technology Stack:

    - **LangChain**: RAG framework
    - **FAISS**: Vector database for similarity search
    - **Groq API**: Fast LLM inference (Llama 3.1-8B)
    - **HuggingFace**: Local embeddings
    - **Streamlit**: Web interface

    ### Features:

    - ✅ PDF and TXT file support
    - ✅ Intelligent chunking strategies
    - ✅ Source attribution
    - ✅ Fast and accurate responses
    - ✅ Free to use (Groq free tier)

    ---

    **Built with ❤️ using LangChain and Streamlit**
    """
    st.markdown(about_markdown)

# Footer: a horizontal rule followed by a one-line usage tip.
footer_rule = "---"
footer_tip = "💡 Tip: Upload domain-specific documents for more accurate answers in that field."
st.markdown(footer_rule)
st.caption(footer_tip)