Skip to content

Commit cc8cfb4

Browse files
committed
X
Signed-off-by: Jonathan Mitchell <jomitchell@nvidia.com>
1 parent 199c342 commit cc8cfb4

File tree

1 file changed

+43
-12
lines changed

1 file changed

+43
-12
lines changed

bionemo-recipes/recipes/esm2_minifold_te/data/prepare_pdb_dataset_large.py

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,13 @@ def query_rcsb(max_resolution=2.5, min_length=50, max_length=300, max_results=15
9292
- Polymer entity length between min_length and max_length
9393
- Protein entity type
9494
95+
Paginates automatically (RCSB caps at 10,000 rows per request).
96+
9597
Returns list of PDB IDs (4-letter codes, uppercase).
9698
"""
97-
query = {
99+
PAGE_SIZE = 10000 # RCSB maximum rows per request
100+
101+
base_query = {
98102
"query": {
99103
"type": "group",
100104
"logical_operator": "and",
@@ -139,7 +143,6 @@ def query_rcsb(max_resolution=2.5, min_length=50, max_length=300, max_results=15
139143
},
140144
"return_type": "entry",
141145
"request_options": {
142-
"paginate": {"start": 0, "rows": max_results},
143146
"results_content_type": ["experimental"],
144147
"sort": [{"sort_by": "rcsb_entry_info.resolution_combined", "direction": "asc"}],
145148
},
@@ -153,18 +156,46 @@ def query_rcsb(max_resolution=2.5, min_length=50, max_length=300, max_results=15
153156
max_results,
154157
)
155158

156-
req = Request(
157-
RCSB_SEARCH_URL,
158-
data=json.dumps(query).encode("utf-8"),
159-
headers={"Content-Type": "application/json"},
160-
)
159+
pdb_ids = []
160+
start = 0
161+
total_count = None
162+
163+
while len(pdb_ids) < max_results:
164+
rows = min(PAGE_SIZE, max_results - len(pdb_ids))
165+
query = {
166+
**base_query,
167+
"request_options": {
168+
**base_query["request_options"],
169+
"paginate": {"start": start, "rows": rows},
170+
},
171+
}
172+
173+
req = Request(
174+
RCSB_SEARCH_URL,
175+
data=json.dumps(query).encode("utf-8"),
176+
headers={"Content-Type": "application/json"},
177+
)
178+
179+
with urlopen(req, timeout=60) as resp:
180+
data = json.loads(resp.read().decode("utf-8"))
181+
182+
if total_count is None:
183+
total_count = data.get("total_count", 0)
184+
185+
page_ids = [r["identifier"] for r in data.get("result_set", [])]
186+
if not page_ids:
187+
break
188+
189+
pdb_ids.extend(page_ids)
190+
start += len(page_ids)
191+
logger.info(
192+
"RCSB page: fetched %d (total so far: %d, available: %d)", len(page_ids), len(pdb_ids), total_count
193+
)
161194

162-
with urlopen(req, timeout=60) as resp:
163-
data = json.loads(resp.read().decode("utf-8"))
195+
if start >= total_count:
196+
break
164197

165-
total_count = data.get("total_count", 0)
166-
pdb_ids = [r["identifier"] for r in data.get("result_set", [])]
167-
logger.info("RCSB returned %d results (total available: %d)", len(pdb_ids), total_count)
198+
logger.info("RCSB query complete: %d results (total available: %d)", len(pdb_ids), total_count)
168199
return pdb_ids
169200

170201

0 commit comments

Comments
 (0)