Skip to content

Commit 865cb16

Browse files
committed
INITIAL RUN COMPLETE AND 100 CASES MANUALLY VERIFIED
1 parent e998163 commit 865cb16

9 files changed

+1005
-682
lines changed

data/radiology_db.csv

Lines changed: 469 additions & 6 deletions
Large diffs are not rendered by default.

notebooks/db_analysis.ipynb

Lines changed: 497 additions & 662 deletions
Large diffs are not rendered by default.

radiology_dataset_db/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ def get_model() -> str:
5656
DATASET_AVAILABILITY_INSTRUCTIONS = (
5757
"Determine whether the paper indicates that its dataset is publicly available.\n"
5858
"Return is_publicly_available = true if there is direct evidence in the provided text such as:\n"
59-
"- explicit language like 'open', 'publicly available', 'public', 'available', 'release', or a data availability statement\n"
60-
"- a non-DOI URL (e.g., GitHub, Zenodo, institutional repository, challenge site, or dataset website)\n"
59+
"- explicit language like 'open', 'publicly available', 'public', 'available', 'release', 'accessible', 'open-access', or a data availability statement\n"
60+
"- a non-DOI URL (e.g., GitHub, Zenodo, Kaggle, institutional repository, challenge site, or dataset website)\n"
6161
"- wording that readers can access or download the dataset\n\n"
6262
"Return false if:\n"
6363
"- the text only says data are available on request\n"

radiology_dataset_db/db_validation.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -102,12 +102,12 @@ def compare_dbs(
102102
right_titles = _unique_title_set(df_right, merge_col)
103103

104104
report = {
105-
"unique_titles_left": len(left_titles),
106-
"unique_titles_right": len(right_titles),
107-
"unique_titles_combined": len(left_titles | right_titles),
105+
"titles_left": len(left_titles),
106+
"titles_right": len(right_titles),
107+
"titles_union": len(left_titles | right_titles),
108108
"titles_only_left": len(left_titles - right_titles),
109109
"titles_only_right": len(right_titles - left_titles),
110-
"titles_in_both": len(left_titles & right_titles),
110+
"titles_intersection": len(left_titles & right_titles),
111111
}
112112

113113
# merged_df = df_left.merge(
@@ -119,7 +119,6 @@ def compare_dbs(
119119
# )
120120
return report
121121

122-
123122
def verified_unverified_report(
124123
df_left: pd.DataFrame,
125124
df_right: pd.DataFrame,
@@ -137,7 +136,7 @@ def verified_unverified_report(
137136

138137
def build_sets(df: pd.DataFrame) -> Tuple[Set[str], Set[str]]:
139138
valid = df[[merge_col, verified_col]].copy()
140-
valid["_verified_bool"] = valid[verified_col].apply(_to_bool)
139+
valid["_verified_bool"] = valid[verified_col].apply(_to_bool).astype("boolean")
141140
valid = valid.dropna(subset=[merge_col, "_verified_bool"])
142141
valid[merge_col] = valid[merge_col].astype(str).str.strip()
143142
valid = valid[valid[merge_col] != ""]
@@ -150,16 +149,22 @@ def build_sets(df: pd.DataFrame) -> Tuple[Set[str], Set[str]]:
150149
right_verified, right_unverified = build_sets(df_right)
151150

152151
return {
153-
"verified_unique_left": len(left_verified),
154-
"verified_unique_right": len(right_verified),
155-
"verified_unique_combined": len(left_verified | right_verified),
152+
"total_left": len(left_verified) + len(left_unverified),
153+
"total_right": len(right_verified) + len(right_unverified),
154+
"verified_left": len(left_verified),
155+
"verified_right": len(right_verified),
156+
"verified_union": len(left_verified | right_verified),
156157
"verified_only_left": len(left_verified - right_verified),
157158
"verified_only_right": len(right_verified - left_verified),
158-
"unverified_unique_left": len(left_unverified),
159-
"unverified_unique_right": len(right_unverified),
160-
"unverified_unique_combined": len(left_unverified | right_unverified),
159+
"verified_intersection": len(left_verified & right_verified),
160+
"unverified_left": len(left_unverified),
161+
"unverified_right": len(right_unverified),
162+
"unverified_union": len(left_unverified | right_unverified),
161163
"unverified_only_left": len(left_unverified - right_unverified),
162164
"unverified_only_right": len(right_unverified - left_unverified),
165+
"unverified_intersection": len(left_unverified & right_unverified),
166+
"fraction_verified_left": len(left_verified) / (len(left_verified) + len(left_unverified)) if (len(left_verified) + len(left_unverified)) > 0 else float("nan"), # aka PPV
167+
"fraction_verified_right": len(right_verified) / (len(right_verified) + len(right_unverified)) if (len(right_verified) + len(right_unverified)) > 0 else float("nan"), # aka PPV
163168
}
164169

165170

radiology_dataset_db/extract_bulk_genomics_dataset_information_llm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class BulkGenomicsDataset(BaseModel):
5151
pmid: Optional[str] = None
5252
paper_citation_count: Optional[int] = None
5353
mesh_terms: List[str] = Field(default_factory=list)
54+
keywords: List[str] = Field(default_factory=list)
5455
pubmed_matches: Optional[List[List[str]]] = None
5556

5657

@@ -121,6 +122,7 @@ async def extract_bulk_genomics_dataset_info_with_agent(
121122
output.pmid = publication_metadata.get("pmid")
122123
output.paper_citation_count = publication_metadata.get("citation_count")
123124
output.mesh_terms = publication_metadata.get("mesh_terms")
125+
output.keywords = publication_metadata.get("keywords")
124126
output.pubmed_matches = publication_metadata.get("pubmed_matches")
125127

126128
if not output.name:

radiology_dataset_db/extract_radiology_dataset_information_llm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ class RadiologyDataset(BaseModel):
7070
pmid: Optional[str] = None
7171
paper_citation_count: Optional[int] = None
7272
mesh_terms: List[str] = Field(default_factory=list)
73+
keywords: List[str] = Field(default_factory=list)
7374
pubmed_matches: Optional[List[List[str]]] = None
7475

7576

@@ -193,6 +194,7 @@ async def extract_radiology_dataset_info_with_agent(
193194
output.pmid = publication_metadata.get("pmid")
194195
output.paper_citation_count = publication_metadata.get("citation_count")
195196
output.mesh_terms = publication_metadata.get("mesh_terms")
197+
output.keywords = publication_metadata.get("keywords")
196198
output.pubmed_matches = publication_metadata.get("pubmed_matches")
197199

198200
if not output.name:

radiology_dataset_db/extract_scrnaseq_dataset_information_llm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ class ScRNASeqDataset(BaseModel):
6060
pmid: Optional[str] = None
6161
paper_citation_count: Optional[int] = None
6262
mesh_terms: List[str] = Field(default_factory=list)
63+
keywords: List[str] = Field(default_factory=list)
6364
pubmed_matches: Optional[List[List[str]]] = None
6465

6566

@@ -128,6 +129,7 @@ async def extract_scrnaseq_dataset_info_with_agent(
128129
output.pmid = publication_metadata.get("pmid")
129130
output.paper_citation_count = publication_metadata.get("citation_count")
130131
output.mesh_terms = publication_metadata.get("mesh_terms")
132+
output.keywords = publication_metadata.get("keywords")
131133
output.pubmed_matches = publication_metadata.get("pubmed_matches")
132134

133135
if not output.name:

radiology_dataset_db/extract_spatial_transcriptomics_dataset_information_llm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ class SpatialTranscriptomicsDataset(BaseModel):
6464
pmid: Optional[str] = None
6565
paper_citation_count: Optional[int] = None
6666
mesh_terms: List[str] = Field(default_factory=list)
67+
keywords: List[str] = Field(default_factory=list)
6768
pubmed_matches: Optional[List[List[str]]] = None
6869

6970

@@ -134,6 +135,7 @@ async def extract_spatial_transcriptomics_dataset_info_with_agent(
134135
output.pmid = publication_metadata.get("pmid")
135136
output.paper_citation_count = publication_metadata.get("citation_count")
136137
output.mesh_terms = publication_metadata.get("mesh_terms")
138+
output.keywords = publication_metadata.get("keywords")
137139
output.pubmed_matches = publication_metadata.get("pubmed_matches")
138140

139141
if not output.name:

radiology_dataset_db/pubmed_utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ def search_pubmed(pubmed_query: str, max_results: Optional[int] = None, batch_si
217217

218218
def fetch_pubmed_details(id_list, batch_size=200):
219219
results = []
220+
id_list = [str(id) for id in id_list] # ensure all IDs are strings
220221
for batch in tqdm(list(chunked(id_list, batch_size)), desc="Fetching PubMed details"):
221222
handle = Entrez.efetch(
222223
db="pubmed",
@@ -360,6 +361,16 @@ def _extract_mesh_terms(article) -> List[str]:
360361
except Exception:
361362
return []
362363

364+
def _extract_keywords(article) -> List[str]:
365+
try:
366+
keyword_list = article["MedlineCitation"].get("KeywordList", [])
367+
keywords = []
368+
for kw_list in keyword_list:
369+
keywords.extend([str(k) for k in kw_list])
370+
return keywords
371+
except Exception:
372+
return []
373+
363374
def _extract_pmid(article) -> Optional[str]:
364375
try:
365376
pmid = article["MedlineCitation"]["PMID"]
@@ -603,6 +614,7 @@ def extract_pubmed_metadata(article, citation_counts_by_pmid: Optional[Dict[str,
603614
"authors": _extract_authors(article),
604615
"journal": _extract_journal(article),
605616
"mesh_terms": mesh_terms,
617+
"keywords": _extract_keywords(article),
606618
"pmid": pmid,
607619
"citation_count": citation_counts_by_pmid.get(pmid) if pmid else None,
608620
"pubmed_matches": pubmed_matches,

0 commit comments

Comments
 (0)