Skip to content

Commit 0fe9e4e

Browse files
Merge pull request joanby#1 from ed-donner/main
Update
2 parents 8a8493e + d0b83a5 commit 0fe9e4e

30 files changed

+8718
-0
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<!-- Use this file to provide workspace-specific custom instructions to Copilot. For more details, visit https://code.visualstudio.com/docs/copilot/copilot-customization#_use-a-githubcopilotinstructionsmd-file -->
2+
3+
This is a Streamlit web application for clinical trial protocol summarization. Use Streamlit best practices for UI and Python for backend logic. Integrate with the ClinicalTrials.gov v2 API for study search and OpenAI for summarization.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
updates.md
2+
.env
3+
__pycache__/
4+
*.py[cod]
5+
*$py.class
6+
*.so
7+
.Python
8+
env/
9+
build/
10+
develop-eggs/
11+
dist/
12+
downloads/
13+
eggs/
14+
.eggs/
15+
lib/
16+
lib64/
17+
parts/
18+
sdist/
19+
var/
20+
*.egg-info/
21+
.installed.cfg
22+
*.egg
23+
venv/
24+
ENV/
25+
.streamlit/
26+
.idea/
27+
.vscode/
28+
*.swp
29+
*.swo
30+
.DS_Store
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Protocol Summarizer Webapp
2+
3+
A Streamlit web application for searching and summarizing clinical trial protocols from ClinicalTrials.gov using Large Language Models. This tool enables researchers and clinical professionals to quickly extract key information from clinical trial protocols.
4+
5+
## Features
6+
- Search for clinical trials by keyword
7+
- Display a list of studies with title and NCT number
8+
- Select a study to summarize
9+
- Fetch the protocol's brief summary from the ClinicalTrials.gov API
10+
- Automatically summarize the protocol using OpenAI's LLM
11+
- Extract structured information like study design, population, interventions, and endpoints
12+
13+
## Installation
14+
15+
1. Clone this repository:
16+
```sh
17+
git clone https://github.com/albertoclemente/protocol_summarizer.git
18+
cd protocol_summarizer/protocol_summarizer_webapp
19+
```
20+
21+
2. Install dependencies:
22+
```sh
23+
pip install -r requirements.txt
24+
```
25+
26+
3. Create a `.env` file in the project root with your OpenAI API key:
27+
```
28+
OPENAI_API_KEY=your_api_key_here
29+
```
30+
31+
## Usage
32+
33+
1. Run the Streamlit app:
34+
```sh
35+
streamlit run app.py
36+
```
37+
38+
2. In your browser:
39+
- Enter a disease, condition, or keyword in the search box
40+
- Select the number of results to display
41+
- Click the "Search" button
42+
- Select a study from the results
43+
- Click "Summarize Protocol" to generate a structured summary
44+
45+
## Technical Details
46+
47+
- Uses ClinicalTrials.gov API v2 to retrieve study information
48+
- Implements fallback methods to handle API changes or failures
49+
- Extracts protocol brief summaries using reliable JSON parsing
50+
- Generates structured summaries using OpenAI's GPT models
51+
52+
## Requirements
53+
54+
- Python 3.7+
55+
- Streamlit
56+
- Requests
57+
- OpenAI Python library
58+
- python-dotenv
59+
60+
## Contribution
61+
62+
Contributions are welcome! Please feel free to submit a Pull Request.
63+
64+
## License
65+
66+
MIT License
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import os
2+
from dotenv import load_dotenv
3+
import streamlit as st
4+
import requests
5+
from openai import OpenAI
6+
7+
load_dotenv()
8+
9+
# Page header and one-line description of what the app does.
st.title("Protocol Summarizer")

st.markdown(
    "\n"
    "Search for clinical trials by keyword, select a study, and generate a protocol summary using an LLM.\n"
)

# The search controls are grouped in a form so that results are shown only
# after the user submits it (pressing Enter or clicking "Search").
with st.form("search_form"):
    query = st.text_input(label="Enter a disease, study title, or keyword:")
    max_results = st.slider(
        label="Number of results",
        min_value=1,
        max_value=20,
        value=5,
    )
    submitted = st.form_submit_button(label="Search")
22+
23+
@st.cache_data(show_spinner=False)
def search_clinical_trials(query, max_results=5):
    """Search ClinicalTrials.gov (API v2) for studies matching *query*.

    Results are memoized by Streamlit through ``st.cache_data``.

    Args:
        query: Free-text search term (disease, study title, keyword). An
            empty value short-circuits to an empty list without any request.
        max_results: Page size requested from the API.

    Returns:
        A list of ``{'nct': ..., 'title': ...}`` dicts, one per study. The
        list is empty when the query is empty or the API does not answer
        with HTTP 200. Missing fields are reported as ``'N/A'``.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    if not query:
        return []

    # Let requests build the query string: user input may contain spaces,
    # '&', '#', '+', etc., which must be percent-encoded rather than pasted
    # into the URL verbatim.
    resp = requests.get(
        "https://clinicaltrials.gov/api/v2/studies",
        params={"query.term": query, "pageSize": max_results, "format": "json"},
        timeout=30,  # never block the Streamlit script indefinitely
    )
    if resp.status_code != 200:
        return []

    studies = []
    for study in resp.json().get('studies', []):
        ident = study.get('protocolSection', {}).get('identificationModule', {})
        studies.append({
            'nct': ident.get('nctId', 'N/A'),
            # Prefer the official title, then the brief title, before
            # giving up with 'N/A'.
            'title': ident.get('officialTitle') or ident.get('briefTitle') or 'N/A',
        })
    return studies
37+
38+
# Skip the search call entirely until the form holds a non-empty query.
results = search_clinical_trials(query, max_results) if query else []

if results:
    st.subheader("Search Results")
    for i, study in enumerate(results, start=1):
        st.markdown(f"**{i}. {study['title']}** (NCT: {study['nct']})")

    selected = st.number_input(
        "Select study number to summarize",
        min_value=1,
        max_value=len(results),
        value=1,
    )
    # The widget is 1-based, the list is 0-based.
    selected_study = results[selected - 1]
    st.markdown(f"### Selected Study\n**{selected_study['title']}** (NCT: {selected_study['nct']})")

    if st.button("Summarize Protocol"):
        # Only the HTML fallback below needs ``re``; it is imported here
        # because the file does not import it at the top.
        import re

        # Upper bound (seconds) for each HTTP call so a stalled connection
        # cannot hang the script run.
        request_timeout = 30

        nct_id = selected_study['nct']
        brief = ""

        # Primary source: the v2 JSON API for a single study.
        url = f"https://clinicaltrials.gov/api/v2/studies/{nct_id}?format=json"
        with st.spinner("Fetching study details..."):
            try:
                resp = requests.get(url, timeout=request_timeout)
            except requests.RequestException as e:
                # Report the failure in the UI instead of a raw traceback,
                # then fall through to the HTML fallback.
                resp = None
                st.warning(f"Could not reach the ClinicalTrials.gov API ({e}). Trying alternative method...")

        api_ok = resp is not None and resp.status_code == 200

        if api_ok:
            try:
                data = resp.json()

                # The v2 payload carries protocolSection at the root level.
                if 'protocolSection' in data:
                    desc_mod = data.get('protocolSection', {}).get('descriptionModule', {})
                    # Use briefSummary, or detailedDescription when it is empty.
                    brief = desc_mod.get('briefSummary', '') or desc_mod.get('detailedDescription', '')
            except Exception as e:
                # Best-effort parse at the UI boundary: surface the problem
                # and continue to the "nothing found" message below.
                st.error(f"Error parsing study data: {e}")

        # If the API request failed, try scraping the study HTML page instead.
        if not api_ok:
            if resp is not None:
                st.warning(f"API returned status code {resp.status_code}. Trying alternative method...")
            html_url = f"https://clinicaltrials.gov/ct2/show/{nct_id}"
            try:
                html_text = requests.get(html_url, timeout=request_timeout).text
            except requests.RequestException:
                html_text = ""

            # Take up to 1000 characters right after the marker. Splitting
            # exactly at the end of the marker (rather than at a fixed
            # offset) keeps the '<' of an immediately following tag, so the
            # tag-stripping regex below can remove that tag completely.
            _, marker_found, after_marker = html_text.partition("Brief Summary:")
            if marker_found:
                excerpt = after_marker[:1000]

                # Clean up HTML: drop tags, then collapse whitespace runs.
                excerpt = re.sub(r'<[^<]+?>', ' ', excerpt)
                excerpt = re.sub(r'\s+', ' ', excerpt)
                brief = excerpt.strip()

        if not brief:
            st.error("No brief summary or detailed description found for this study.")
            st.stop()

        # Now we have the brief summary, send it to the LLM.
        def user_prompt_for_protocol_brief(brief_text):
            """Build the user message asking the LLM to extract protocol details from *brief_text*."""
            return (
                "Extract the following details from the clinical trial brief summary in markdown format with clear section headings (e.g., ## Study Design, ## Population, etc.):\n"
                "- Study design\n"
                "- Population\n"
                "- Interventions\n"
                "- Primary and secondary endpoints\n"
                "- Study duration\n\n"
                f"Brief summary text:\n{brief_text}"
            )

        system_prompt = "You are a clinical research assistant. Extract and list the requested protocol details in markdown format with clear section headings."
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt_for_protocol_brief(brief)},
        ]
        with st.spinner("Summarizing with LLM..."):
            try:
                # Creating the client inside the try lets a client-construction
                # failure (e.g. missing credentials) be reported through the
                # same error path as a failed completion call.
                openai = OpenAI()
                response = openai.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=messages,
                )
                summary = response.choices[0].message.content
                st.markdown(summary)
            except Exception as e:
                st.error(f"LLM call failed: {e}")
elif query:
    # A query was submitted but the search returned nothing.
    st.info("No results found. Try a different keyword.")
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
streamlit
2+
openai
3+
requests
4+
python-dotenv

0 commit comments

Comments
 (0)