@@ -92,9 +92,13 @@ def query_rcsb(max_resolution=2.5, min_length=50, max_length=300, max_results=15
9292 - Polymer entity length between min_length and max_length
9393 - Protein entity type
9494
95+ Paginates automatically (RCSB caps at 10,000 rows per request).
96+
9597 Returns list of PDB IDs (4-letter codes, uppercase).
9698 """
97- query = {
99+ PAGE_SIZE = 10000 # RCSB maximum rows per request
100+
101+ base_query = {
98102 "query" : {
99103 "type" : "group" ,
100104 "logical_operator" : "and" ,
@@ -139,7 +143,6 @@ def query_rcsb(max_resolution=2.5, min_length=50, max_length=300, max_results=15
139143 },
140144 "return_type" : "entry" ,
141145 "request_options" : {
142- "paginate" : {"start" : 0 , "rows" : max_results },
143146 "results_content_type" : ["experimental" ],
144147 "sort" : [{"sort_by" : "rcsb_entry_info.resolution_combined" , "direction" : "asc" }],
145148 },
@@ -153,18 +156,46 @@ def query_rcsb(max_resolution=2.5, min_length=50, max_length=300, max_results=15
153156 max_results ,
154157 )
155158
156- req = Request (
157- RCSB_SEARCH_URL ,
158- data = json .dumps (query ).encode ("utf-8" ),
159- headers = {"Content-Type" : "application/json" },
160- )
159+ pdb_ids = []
160+ start = 0
161+ total_count = None
162+
163+ while len (pdb_ids ) < max_results :
164+ rows = min (PAGE_SIZE , max_results - len (pdb_ids ))
165+ query = {
166+ ** base_query ,
167+ "request_options" : {
168+ ** base_query ["request_options" ],
169+ "paginate" : {"start" : start , "rows" : rows },
170+ },
171+ }
172+
173+ req = Request (
174+ RCSB_SEARCH_URL ,
175+ data = json .dumps (query ).encode ("utf-8" ),
176+ headers = {"Content-Type" : "application/json" },
177+ )
178+
179+ with urlopen (req , timeout = 60 ) as resp :
180+ data = json .loads (resp .read ().decode ("utf-8" ))
181+
182+ if total_count is None :
183+ total_count = data .get ("total_count" , 0 )
184+
185+ page_ids = [r ["identifier" ] for r in data .get ("result_set" , [])]
186+ if not page_ids :
187+ break
188+
189+ pdb_ids .extend (page_ids )
190+ start += len (page_ids )
191+ logger .info (
192+ "RCSB page: fetched %d (total so far: %d, available: %d)" , len (page_ids ), len (pdb_ids ), total_count
193+ )
161194
162- with urlopen ( req , timeout = 60 ) as resp :
163- data = json . loads ( resp . read (). decode ( "utf-8" ))
195+ if start >= total_count :
196+ break
164197
165- total_count = data .get ("total_count" , 0 )
166- pdb_ids = [r ["identifier" ] for r in data .get ("result_set" , [])]
167- logger .info ("RCSB returned %d results (total available: %d)" , len (pdb_ids ), total_count )
198+ logger .info ("RCSB query complete: %d results (total available: %d)" , len (pdb_ids ), total_count )
168199 return pdb_ids
169200
170201
0 commit comments