Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
ff9554a
add studies.json
Jun 3, 2025
8298983
add extraction script
rmadupuri Jun 3, 2025
58dee88
rag v1 done
zainasir Jun 3, 2025
c4b6090
finished structuring output response
zainasir Jun 3, 2025
0f1db33
Co-authored-by: Ramya Madupuri <[email protected]>
cannin Jun 3, 2025
284c45a
Update to requirements.txt based on GSOC and hackathon code
cannin Jun 3, 2025
758b236
Ignore chromaDb sqlite file (too big)
cannin Jun 3, 2025
623a175
add fast api endpoint for cbiopubchat
zainasir Jun 3, 2025
57460c6
add dummy data for git to show empty dir
zainasir Jun 3, 2025
0adf3ac
Intermediate update to make sure only needed modules are loaded and l…
cannin Jun 3, 2025
9ab53d5
More intermediate clean up (e.g., move functions to top); added TODOs
cannin Jun 3, 2025
3fba51e
Attempt to remove indra dependency
cannin Jun 4, 2025
b629d04
ipython for testing
cannin Jun 4, 2025
93eb51d
Documentation; Intermediate cleanup edits
cannin Jun 4, 2025
3f7cab7
Working RAG example for testing
cannin Jun 4, 2025
b1dce77
Add command-line argument parsing for testing and document loading
cannin Jun 4, 2025
3a18222
Refactor get_pubmed_chain and update comments for clarity; update doc…
cannin Jun 4, 2025
9db7659
Add README to data_raw
cannin Jun 4, 2025
82284e8
Add functionality to download PubMed papers and extract text
cannin Jun 4, 2025
04211e9
Added sample xml and txt files to data raw
cannin Jun 4, 2025
b844712
Commented out debugging code; added TODO
cannin Jun 4, 2025
0866081
Refactor get_pubmed_chain and predict functions; remove unused get_re…
cannin Jun 4, 2025
2101c3b
Update default load directory for documents to data/data_raw/txt
cannin Jun 4, 2025
e2fc6d3
Add TODO to remove unnecessary imports in pubmed_data_loader.py
cannin Jun 4, 2025
160f76d
Clarify comments
cannin Jun 4, 2025
831b414
Minor name edits
cannin Jun 4, 2025
4eeb921
Fix naming bug
cannin Jun 4, 2025
eca2c86
Edit test question
cannin Jun 4, 2025
2755b23
Documentation edits
cannin Jun 4, 2025
45d8501
attach chainlit ui interface
zainasir Jun 4, 2025
1e46c86
add instructions for running the app
zainasir Jun 4, 2025
d25d8d4
fix chroma import
rmadupuri Jun 4, 2025
8e907d3
add dockerfile
zainasir Jun 4, 2025
b1623ae
txt and xml files
sbabyanusha Jun 4, 2025
5da0852
text classification
sbabyanusha Jun 4, 2025
5b9f16a
fix merge
rmadupuri Jun 4, 2025
fbd8579
resolve conflict
rmadupuri Jun 4, 2025
eb08fae
update splitter
rmadupuri Jun 4, 2025
9ea20e2
check pmcid
sbabyanusha Jun 4, 2025
c83cca0
Update classify_txt_articles.py
sbabyanusha Jun 4, 2025
b90f21b
Update check_pmcid.py
sbabyanusha Jun 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
120 changes: 120 additions & 0 deletions .chainlit/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
[project]
# Whether to enable telemetry (default: true). No personal data is collected.
enable_telemetry = true


# List of environment variables to be provided by each user to use the app.
user_env = []

# Duration (in seconds) during which the session is saved when the connection is lost
session_timeout = 3600

# Duration (in seconds) of the user session expiry
user_session_timeout = 1296000 # 15 days

# Enable third parties caching (e.g., LangChain cache)
cache = false

# Authorized origins
allow_origins = ["*"]

[features]
# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
unsafe_allow_html = false

# Process and display mathematical expressions. This can clash with "$" characters in messages.
latex = false

# Autoscroll new user messages at the top of the window
user_message_autoscroll = true

# Automatically tag threads with the current chat profile (if a chat profile is used)
auto_tag_thread = true

# Allow users to edit their own messages
edit_message = true

# Authorize users to spontaneously upload files with messages
[features.spontaneous_file_upload]
enabled = true
# Define accepted file types using MIME types
# Examples:
# 1. For specific file types:
# accept = ["image/jpeg", "image/png", "application/pdf"]
# 2. For all files of certain type:
# accept = ["image/*", "audio/*", "video/*"]
# 3. For specific file extensions:
# accept = { "application/octet-stream" = [".xyz", ".pdb"] }
# Note: Using "*/*" is not recommended as it may cause browser warnings
accept = ["*/*"]
max_files = 20
max_size_mb = 500

[features.audio]
# Sample rate of the audio
sample_rate = 24000

[features.mcp.sse]
enabled = true

[features.mcp.stdio]
enabled = true
# Only the executables in the allow list can be used for MCP stdio server.
# Only need the base name of the executable, e.g. "npx", not "/usr/bin/npx".
# Please don't comment this line for now, we need it to parse the executable name.
allowed_executables = [ "npx", "uvx" ]

[UI]
# Name of the assistant.
name = "cBioPubChat"

default_theme = "light"

# layout = "wide"

# default_sidebar_state = "open"

# Description of the assistant. This is used for HTML tags.
# description = ""

# Chain of Thought (CoT) display mode. Can be "hidden", "tool_call" or "full".
cot = "full"

# Specify a CSS file that can be used to customize the user interface.
# The CSS file can be served from the public directory or via an external link.
# custom_css = "/public/test.css"

# Specify additional attributes for a custom CSS file
# custom_css_attributes = "media=\"print\""

# Specify a JavaScript file that can be used to customize the user interface.
# The JavaScript file can be served from the public directory.
# custom_js = "/public/test.js"

# Specify additional attributes for custom JS file
# custom_js_attributes = "async type = \"module\""

# Custom login page image, relative to public directory or external URL
# login_page_image = "/public/custom-background.jpg"

# Custom login page image filter (Tailwind internal filters, no dark/light variants)
# login_page_image_filter = "brightness-50 grayscale"
# login_page_image_dark_filter = "contrast-200 blur-sm"

# Specify a custom meta image url.
# custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"

# Specify a custom build directory for the frontend.
# This can be used to customize the frontend code.
# Be careful: If this is a relative path, it should not start with a slash.
# custom_build = "./public/build"

# Specify optional one or more custom links in the header.
# [[UI.header_links]]
# name = "Issues"
# display_name = "Report Issue"
# icon_url = "https://avatars.githubusercontent.com/u/128686189?s=200&v=4"
# url = "https://github.com/Chainlit/chainlit/issues"

[meta]
generated_by = "2.5.5"
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Byte-compiled / optimized / DLL files
.DS_Store
__pycache__/
*.py[cod]
*$py.class
Expand Down Expand Up @@ -176,4 +177,10 @@ cython_debug/
.idea
.files
chainlit.md
.chainlit/translations
.chainlit

data/data_chromadb/
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ py tests/env.py
# If all is successful, the test script should put a smile on your face!
```

### Start app
```shell
chainlit run app.py -w
```

## Sample Use Case

> _“Which pathways are most commonly altered in ovarian cancer?”_
Expand Down
File renamed without changes.
Empty file added api/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions api/cbiopubchat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from fastapi import FastAPI
from pydantic import BaseModel
from backend.rag import run_rag
# FastAPI application exposing the cBioPubChat RAG pipeline over HTTP.
app = FastAPI()

class PromptRequest(BaseModel):
    """Request body for the /cbiopubchat endpoint."""
    prompt: str  # the user's free-text question

@app.post("/cbiopubchat")
def cbiopubchat_endpoint(request: PromptRequest):
    """Run the RAG pipeline on the submitted prompt.

    Returns a JSON object of the form ``{"response": <answer string>}``.
    """
    answer = run_rag(request.prompt)
    return {"response": answer}
12 changes: 12 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import chainlit as cl
from backend import rag

@cl.on_message
async def on_message(message: cl.Message):
    """Chainlit handler: answer each incoming user message via the RAG pipeline."""
    answer = rag.run_rag(message.content)

    # Relay the generated answer back to the user.
    reply = cl.Message(content=f"{answer}")
    await reply.send()
1 change: 1 addition & 0 deletions backend/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .rag import run_rag
80 changes: 80 additions & 0 deletions backend/rag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import getpass
import json
import os
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from typing_extensions import List, TypedDict
from langchain_core.documents import Document
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langgraph.graph import START, StateGraph

class State(TypedDict):
    # Shared state flowing through the LangGraph pipeline (retrieve -> generate).
    question: str  # the user's prompt
    context: List[Document]  # documents retrieved from the Chroma vector store
    answer: str  # final LLM-generated answer text

def run_rag(user_prompt: str) -> str:
    """Answer a question about cBioPortal publications via RAG.

    Retrieves similar documents from a persisted Chroma vector store, asks the
    LLM to answer using them as context, and appends a "Citations" list linking
    the unique cBioPortal studies the retrieved documents came from.

    Args:
        user_prompt: The user's free-text question.

    Returns:
        The LLM answer followed by a markdown citation list.
    """
    # TODO: the embeddings client, LLM, vector store, and graph are rebuilt on
    # every call; consider caching them at module level for performance.

    def retrieve(state: State):
        # Fetch the documents most similar to the question from Chroma.
        retrieved_docs = vector_store.similarity_search(state["question"])
        return {"context": retrieved_docs}

    def generate(state: State):
        # Concatenate retrieved documents and ask the LLM for an answer.
        docs_content = "\n\n".join(doc.page_content for doc in state["context"])
        messages = prompt.invoke({"question": state["question"], "context": docs_content})
        response = llm.invoke(messages)
        return {"answer": response.content}

    prompt = PromptTemplate.from_template(
        """
You are an assistant for question-answering tasks related to cBioPortal publications. Use the following context from cBioPortal publications to answer the question. If you don't know the answer, just say that you don't know. In your response, don't mention the word 'context' or refer to the context explicitly. Provide a concise answer.

---
Context:
{context}
---

Question:
{question}

Answer:
"""
    )
    persist_directory = "./data/vectordb/chroma/pubmed/paper_and_pdf"
    embedding_function = OpenAIEmbeddings(model="text-embedding-ada-002")
    llm = init_chat_model("gpt-4o-mini", model_provider="openai")

    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function
    )

    # Two-node pipeline: retrieve context, then generate an answer from it.
    graph_builder = StateGraph(State).add_sequence([retrieve, generate])
    graph_builder.add_edge(START, "retrieve")
    graph = graph_builder.compile()

    rag_state = graph.invoke({"question": user_prompt})

    # Collect the unique studies behind the retrieved documents, in retrieval
    # order. Documents without a studyId are skipped up front — previously the
    # URL was built before the None check, so a missing studyId raised a
    # TypeError on string concatenation.
    cbioportal_study_url = "https://www.cbioportal.org/study/summary?id="
    seen_ids = set()
    unique_studies = []
    for doc in rag_state["context"]:
        study_id = doc.metadata.get("studyId")
        if study_id and study_id not in seen_ids:
            seen_ids.add(study_id)
            unique_studies.append({
                "name": doc.metadata.get("name"),
                "studyId": study_id,
                "url": cbioportal_study_url + study_id,
            })

    # Build the markdown citation list in one pass instead of repeated
    # string concatenation.
    citations = "".join(
        f"* [{study['name']}]({study['url']})\n" for study in unique_studies
    )
    return rag_state["answer"] + "\n\nCitations:\n" + citations
3 changes: 3 additions & 0 deletions data/data_raw/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Expected filenames:
* PMC: PMC3465532.txt
* PMID: 23000897.txt
Loading