Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
ff9554a
add studies.json
Jun 3, 2025
8298983
add extraction script
rmadupuri Jun 3, 2025
58dee88
rag v1 done
zainasir Jun 3, 2025
c4b6090
finished structuring output response
zainasir Jun 3, 2025
0f1db33
Co-authored-by: Ramya Madupuri <[email protected]>
cannin Jun 3, 2025
284c45a
Update to requirements.txt based on GSOC and hackathon code
cannin Jun 3, 2025
758b236
Ignore chromaDb sqlite file (too big)
cannin Jun 3, 2025
623a175
add fast api endpoint for cbiopubchat
zainasir Jun 3, 2025
57460c6
add dummy data for git to show empty dir
zainasir Jun 3, 2025
0adf3ac
Intermediate update to make sure only needed modules are loaded and l…
cannin Jun 3, 2025
9ab53d5
More intermediate clean up (e.g., move functions to top); added TODOs
cannin Jun 3, 2025
3fba51e
Attempt to remove indra dependency
cannin Jun 4, 2025
b629d04
ipython for testing
cannin Jun 4, 2025
93eb51d
Documentation; Intermediate cleanup edits
cannin Jun 4, 2025
3f7cab7
Working RAG example for testing
cannin Jun 4, 2025
b1dce77
Add command-line argument parsing for testing and document loading
cannin Jun 4, 2025
3a18222
Refactor get_pubmed_chain and update comments for clarity; update doc…
cannin Jun 4, 2025
9db7659
Add README to data_raw
cannin Jun 4, 2025
82284e8
Add functionality to download PubMed papers and extract text
cannin Jun 4, 2025
04211e9
Added sample xml and txt files to data raw
cannin Jun 4, 2025
b844712
Commented out debugging code; added TODO
cannin Jun 4, 2025
0866081
Refactor get_pubmed_chain and predict functions; remove unused get_re…
cannin Jun 4, 2025
2101c3b
Update default load directory for documents to data/data_raw/txt
cannin Jun 4, 2025
e2fc6d3
Add TODO to remove unnecessary imports in pubmed_data_loader.py
cannin Jun 4, 2025
160f76d
Clarify comments
cannin Jun 4, 2025
831b414
Minor name edits
cannin Jun 4, 2025
4eeb921
Fix naming bug
cannin Jun 4, 2025
eca2c86
Edit test question
cannin Jun 4, 2025
2755b23
Documentation edits
cannin Jun 4, 2025
45d8501
attach chainlit ui interface
zainasir Jun 4, 2025
1e46c86
add instructions for running the app
zainasir Jun 4, 2025
d25d8d4
fix chroma import
rmadupuri Jun 4, 2025
8e907d3
add dockerfile
zainasir Jun 4, 2025
b1623ae
txt and xml files
sbabyanusha Jun 4, 2025
5da0852
text classification
sbabyanusha Jun 4, 2025
5b9f16a
fix merge
rmadupuri Jun 4, 2025
fbd8579
resolve conflict
rmadupuri Jun 4, 2025
eb08fae
update splitter
rmadupuri Jun 4, 2025
9ea20e2
check pmcid
sbabyanusha Jun 4, 2025
c83cca0
Update classify_txt_articles.py
sbabyanusha Jun 4, 2025
b90f21b
Update check_pmcid.py
sbabyanusha Jun 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
120 changes: 120 additions & 0 deletions .chainlit/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
[project]
# Whether to enable telemetry (default: true). No personal data is collected.
enable_telemetry = true


# List of environment variables to be provided by each user to use the app.
user_env = []

# Duration (in seconds) during which the session is saved when the connection is lost
session_timeout = 3600

# Duration (in seconds) of the user session expiry
user_session_timeout = 1296000 # 15 days

# Enable third parties caching (e.g., LangChain cache)
cache = false

# Authorized origins
allow_origins = ["*"]

[features]
# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
unsafe_allow_html = false

# Process and display mathematical expressions. This can clash with "$" characters in messages.
latex = false

# Autoscroll new user messages at the top of the window
user_message_autoscroll = true

# Automatically tag threads with the current chat profile (if a chat profile is used)
auto_tag_thread = true

# Allow users to edit their own messages
edit_message = true

# Authorize users to spontaneously upload files with messages
[features.spontaneous_file_upload]
enabled = true
# Define accepted file types using MIME types
# Examples:
# 1. For specific file types:
# accept = ["image/jpeg", "image/png", "application/pdf"]
# 2. For all files of certain type:
# accept = ["image/*", "audio/*", "video/*"]
# 3. For specific file extensions:
# accept = { "application/octet-stream" = [".xyz", ".pdb"] }
# Note: Using "*/*" is not recommended as it may cause browser warnings
accept = ["*/*"]
max_files = 20
max_size_mb = 500

[features.audio]
# Sample rate of the audio
sample_rate = 24000

[features.mcp.sse]
enabled = true

[features.mcp.stdio]
enabled = true
# Only the executables in the allow list can be used for MCP stdio server.
# Only need the base name of the executable, e.g. "npx", not "/usr/bin/npx".
# Please don't comment this line for now, we need it to parse the executable name.
allowed_executables = [ "npx", "uvx" ]

[UI]
# Name of the assistant.
name = "cBioPubChat"

default_theme = "light"

# layout = "wide"

# default_sidebar_state = "open"

# Description of the assistant. This is used for HTML tags.
# description = ""

# Chain of Thought (CoT) display mode. Can be "hidden", "tool_call" or "full".
cot = "full"

# Specify a CSS file that can be used to customize the user interface.
# The CSS file can be served from the public directory or via an external link.
# custom_css = "/public/test.css"

# Specify additional attributes for a custom CSS file
# custom_css_attributes = "media=\"print\""

# Specify a JavaScript file that can be used to customize the user interface.
# The JavaScript file can be served from the public directory.
# custom_js = "/public/test.js"

# Specify additional attributes for custom JS file
# custom_js_attributes = "async type = \"module\""

# Custom login page image, relative to public directory or external URL
# login_page_image = "/public/custom-background.jpg"

# Custom login page image filter (Tailwind internal filters, no dark/light variants)
# login_page_image_filter = "brightness-50 grayscale"
# login_page_image_dark_filter = "contrast-200 blur-sm"

# Specify a custom meta image url.
# custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"

# Specify a custom build directory for the frontend.
# This can be used to customize the frontend code.
# Be careful: If this is a relative path, it should not start with a slash.
# custom_build = "./public/build"

# Specify optional one or more custom links in the header.
# [[UI.header_links]]
# name = "Issues"
# display_name = "Report Issue"
# icon_url = "https://avatars.githubusercontent.com/u/128686189?s=200&v=4"
# url = "https://github.com/Chainlit/chainlit/issues"

[meta]
generated_by = "2.5.5"
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Byte-compiled / optimized / DLL files
.DS_Store
__pycache__/
*.py[cod]
*$py.class
Expand Down Expand Up @@ -176,4 +177,10 @@ cython_debug/
.idea
.files
chainlit.md
.chainlit/translations
.chainlit

data/data_chromadb/
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ py tests/env.py
# If all is successful, the test script should put a smile on your face!
```

### Start app
```shell
chainlit run app.py -w
```

## Sample Use Case

> _“Which pathways are most commonly altered in ovarian cancer?”_
Expand Down
File renamed without changes.
Empty file added api/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions api/cbiopubchat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from fastapi import FastAPI
from pydantic import BaseModel
from backend.rag import run_rag
# FastAPI application exposing the cBioPubChat RAG pipeline over HTTP.
app = FastAPI()

class PromptRequest(BaseModel):
    """Request body for the /cbiopubchat endpoint."""
    prompt: str  # the user's free-text question

@app.post("/cbiopubchat")
def cbiopubchat_endpoint(request: PromptRequest):
    """Run the RAG pipeline on the submitted prompt.

    Returns a JSON object of the form ``{"response": <answer string>}``.
    """
    answer = run_rag(request.prompt)
    return {"response": answer}
12 changes: 12 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import chainlit as cl
from backend import rag

@cl.on_message
async def on_message(message: cl.Message):
    """Chainlit handler: answer each incoming user message via the RAG pipeline."""
    answer = rag.run_rag(message.content)

    # Relay the generated answer back to the user.
    reply = cl.Message(content=f"{answer}")
    await reply.send()
1 change: 1 addition & 0 deletions backend/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .rag import run_rag
80 changes: 80 additions & 0 deletions backend/rag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import getpass
import json
import os
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from typing_extensions import List, TypedDict
from langchain_core.documents import Document
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langgraph.graph import START, StateGraph

class State(TypedDict):
    # Shared state flowing through the LangGraph pipeline (retrieve -> generate).
    question: str  # the user's prompt
    context: List[Document]  # documents retrieved from the Chroma vector store
    answer: str  # final LLM-generated answer text

def run_rag(user_prompt: str) -> str:
    """Answer a question about cBioPortal publications via RAG.

    Retrieves similar documents from a persisted Chroma vector store, asks the
    LLM to answer using them as context, and appends a "Citations" list linking
    the unique cBioPortal studies the retrieved documents came from.

    Args:
        user_prompt: The user's free-text question.

    Returns:
        The LLM answer followed by a markdown citation list.
    """
    # TODO: the embeddings client, LLM, vector store, and graph are rebuilt on
    # every call; consider caching them at module level for performance.

    def retrieve(state: State):
        # Fetch the documents most similar to the question from Chroma.
        retrieved_docs = vector_store.similarity_search(state["question"])
        return {"context": retrieved_docs}

    def generate(state: State):
        # Concatenate retrieved documents and ask the LLM for an answer.
        docs_content = "\n\n".join(doc.page_content for doc in state["context"])
        messages = prompt.invoke({"question": state["question"], "context": docs_content})
        response = llm.invoke(messages)
        return {"answer": response.content}

    prompt = PromptTemplate.from_template(
        """
You are an assistant for question-answering tasks related to cBioPortal publications. Use the following context from cBioPortal publications to answer the question. If you don't know the answer, just say that you don't know. In your response, don't mention the word 'context' or refer to the context explicitly. Provide a concise answer.

---
Context:
{context}
---

Question:
{question}

Answer:
"""
    )
    persist_directory = "./data/vectordb/chroma/pubmed/paper_and_pdf"
    embedding_function = OpenAIEmbeddings(model="text-embedding-ada-002")
    llm = init_chat_model("gpt-4o-mini", model_provider="openai")

    vector_store = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_function
    )

    # Two-node pipeline: retrieve context, then generate an answer from it.
    graph_builder = StateGraph(State).add_sequence([retrieve, generate])
    graph_builder.add_edge(START, "retrieve")
    graph = graph_builder.compile()

    rag_state = graph.invoke({"question": user_prompt})

    # Collect the unique studies behind the retrieved documents, in retrieval
    # order. Documents without a studyId are skipped up front — previously the
    # URL was built before the None check, so a missing studyId raised a
    # TypeError on string concatenation.
    cbioportal_study_url = "https://www.cbioportal.org/study/summary?id="
    seen_ids = set()
    unique_studies = []
    for doc in rag_state["context"]:
        study_id = doc.metadata.get("studyId")
        if study_id and study_id not in seen_ids:
            seen_ids.add(study_id)
            unique_studies.append({
                "name": doc.metadata.get("name"),
                "studyId": study_id,
                "url": cbioportal_study_url + study_id,
            })

    # Build the markdown citation list in one pass instead of repeated
    # string concatenation.
    citations = "".join(
        f"* [{study['name']}]({study['url']})\n" for study in unique_studies
    )
    return rag_state["answer"] + "\n\nCitations:\n" + citations
3 changes: 3 additions & 0 deletions data/data_raw/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Expected filenames:
* PMC: PMC3465532.txt
* PMID: 23000897.txt
Loading