Commit 5f3835c

Merge pull request #27 from microbiomedata/22-create-a-general-batch-query-function
22 create a general batch query function
2 parents 8d28e75 + 15a3f4e · commit 5f3835c

File tree

6 files changed (+147, -6 lines)

nmdc_api_utilities/collection_search.py

Lines changed: 36 additions & 4 deletions
@@ -175,7 +175,7 @@ def get_record_by_id(
         results = response.json()
         return results

-    def check_ids_exist(self, ids: list, chunk_size=100) -> bool:
+    def check_ids_exist(self, ids: list, chunk_size=100, return_missing_ids=False) -> bool:
         """
         Check if the IDs exist in the collection.

@@ -187,6 +187,8 @@ def check_ids_exist(self, ids: list, chunk_size=100) -> bool:
             A list of IDs to check if they exist in the collection.
         chunk_size : int
             The number of IDs to check in each query. Default is 100.
+        return_missing_ids : bool
+            If True, and if IDs are missing in the collection, return the list of IDs that do not exist in the collection. Default is False.
         Returns
         -------
         bool

@@ -201,16 +203,46 @@ def check_ids_exist(self, ids: list, chunk_size=100) -> bool:
         # to avoid the maximum URL length limit
         ids_test = list(set(ids))
         for i in range(0, len(ids_test), chunk_size):
-            chunk = ids[i:i + chunk_size]
+            chunk = ids_test[i:i + chunk_size]
             filter_dict = {
                 "id": {"$in": chunk}
             }
             filter_json_string = json.dumps(filter_dict, separators=(',', ':'))

             results = self.get_records(filter=filter_json_string, max_page_size=len(chunk), fields="id")
-            if len(results) != len(chunk):
-                raise ValueError(f"IDs not found in collection: {set(chunk) - set([r['id'] for r in results])}")
+            if len(results) != len(chunk) and return_missing_ids:
+                missing_ids = list(set(chunk) - set([record["id"] for record in results]))
+                return False, missing_ids
+            elif len(results) != len(chunk) and not return_missing_ids:
+                return False
         return True
+
+    def get_batch_records(self, id_list: list, search_field: str, chunk_size=100, fields="") -> list:
+        """
+        Get a batch of records from the collection by a list of input IDs. This method is used to identify records that include any of the IDs from the input list, matching the search_field.
+        This uses the MongoDB filter keyword $in to identify records that include the input IDs.
+        params:
+            id_list: list
+                A list of IDs to get records for.
+            search_field: str
+                The field to search on. This must match a field from the NMDC Schema.
+            chunk_size: int
+                The number of IDs to query in each chunk. Default is 100.
+            fields: str
+                The fields to return. Default is all fields.
+        returns:
+            list: A list of records.
+        """
+        dp = DataProcessing()
+        results = []
+        id_list = list(set(id_list))
+        chunks = dp.split_list(input_list=id_list, chunk_size=chunk_size)
+        for chunk in chunks:
+            chunk = dp._string_mongo_list(data=chunk)
+            filter = f'{{"{search_field}": {{"$in": {chunk}}}}}'
+            res = self.get_records(filter=filter, max_page_size=len(chunk), fields=fields, all_pages=True)
+            results += res
+        return results


 if __name__ == "__main__":

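For orientation, a minimal usage sketch of the two methods this diff touches. The collection name, the IDs, and the bare constructor call are illustrative (the test suite constructs CollectionSearch("biosample_set", env=ENV)). Note that check_ids_exist returns a plain bool when everything is found but a (bool, list) tuple on a miss when return_missing_ids=True, so callers should check before unpacking:

from nmdc_api_utilities.collection_search import CollectionSearch

collection = CollectionSearch("biosample_set")

# True when every ID exists; (False, missing) when some do not
result = collection.check_ids_exist(
    ["nmdc:bsm-11-002vgm56", "nmdc:bsm-11-006pnx90"],
    return_missing_ids=True,
)
if result is True:
    print("all IDs found")
else:
    exists, missing = result
    print(f"missing IDs: {missing}")

# get_batch_records chunks the ID list and issues one $in query per chunk,
# returning every record whose search_field matches an input ID
records = collection.get_batch_records(
    id_list=["nmdc:bsm-11-002vgm56", "nmdc:bsm-11-006pnx90"],
    search_field="id",
    fields="id",
)
print(len(records))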
nmdc_api_utilities/data_processing.py

Lines changed: 36 additions & 1 deletion
@@ -24,7 +24,22 @@ def convert_to_df(self, data: list) -> pd.DataFrame:
             A list of dictionaries.
         """
         return pd.DataFrame(data)
-
+    def split_list(self, input_list: list, chunk_size: int = 100) -> list:
+        """
+        Split a list into chunks of a specified size.
+        params:
+            input_list: list
+                The list to split.
+            chunk_size: int
+                The size of each chunk.
+        returns:
+            list: A list of lists.
+        """
+        result = []
+        for i in range(0, len(input_list), chunk_size):
+            result.append(input_list[i:i + chunk_size])
+
+        return result
     def rename_columns(self, df: pd.DataFrame, new_col_names: list) -> pd.DataFrame:
         """
         Rename columns in a pandas dataframe.

@@ -117,3 +132,23 @@ def build_filter(self, attributes, exact_match=False):
         clean = self._string_mongo_list(filter_dict)
         logging.debug(f"Filter cleaned: {clean}")
         return clean
+
+    def extract_field(self, api_results: list, field_name: str) -> list:
+        """
+        This function is used to extract a field from the API results.
+        params:
+            api_results: list
+                A list of dictionaries.
+            field_name: str
+                The name of the field to extract.
+        returns:
+            list: A list of IDs.
+        """
+        field_list = []
+        for item in api_results:
+            if type(item[field_name]) == str:
+                field_list.append(item[field_name])
+            elif type(item[field_name]) == list:
+                for another_item in item[field_name]:
+                    field_list.append(another_item)
+        return field_list
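The two helpers above compose with get_batch_records: split_list produces the query chunks, and extract_field flattens results back into a flat ID list. A small behavior sketch, inferred from the code in this diff (the record dicts are made up):

from nmdc_api_utilities.data_processing import DataProcessing

dp = DataProcessing()

# split_list chunks a list into sublists of at most chunk_size items
print(dp.split_list([1, 2, 3, 4, 5], chunk_size=2))
# [[1, 2], [3, 4], [5]]

# extract_field pulls a named field out of every record, flattening
# list-valued fields into the output
records = [
    {"id": "nmdc:dobj-1", "has_output": "nmdc:wfe-1"},
    {"id": "nmdc:dobj-2", "has_output": ["nmdc:wfe-2", "nmdc:wfe-3"]},
]
print(dp.extract_field(records, "has_output"))
# ['nmdc:wfe-1', 'nmdc:wfe-2', 'nmdc:wfe-3']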
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+from nmdc_api_utilities.data_processing import DataProcessing
+from nmdc_api_utilities.data_object_search import DataObjectSearch
+from nmdc_api_utilities.workflow_execution_search import WorkflowExecutionSearch
+
+dos_client = DataObjectSearch()
+
+dp_client = DataProcessing()
+
+# Using the DataObjectSearch class to get records from the DataObject collection. We are looking for records with the attribute 'data_object_type' equal to 'FT ICR-MS Analysis Results'.
+# We request up to 100 records per page, include only the fields 'id', 'md5_checksum', and 'url', and fetch all pages of results.
+processed_nom = dos_client.get_record_by_attribute(attribute_name='data_object_type', attribute_value='FT ICR-MS Analysis Results', max_page_size=100, fields='id,md5_checksum,url', all_pages=True)
+# clarify names
+for dataobject in processed_nom:
+    dataobject["processed_nom_id"] = dataobject.pop("id")
+    dataobject["processed_nom_md5_checksum"] = dataobject.pop("md5_checksum")
+    dataobject["processed_nom_url"] = dataobject.pop("url")
+
+# convert to df
+processed_nom_df = dp_client.convert_to_df(processed_nom)
+print(processed_nom_df.head())
+# Next, we query the WorkflowExecution collection. To do so, we create an instance of its search class.
+we_client = WorkflowExecutionSearch()
+# use utility function to get a list of the ids from processed_nom
+result_ids = dp_client.extract_field(processed_nom, "processed_nom_id")
+# Using the WorkflowExecutionSearch class to get records from the WorkflowExecution collection. We are looking for records whose 'has_output' field includes any of the ids from the previous step.
+# We use the get_batch_records method to identify records that include any of the ids from the input list, matching the 'has_output' field.
+analysis_dataobj = we_client.get_batch_records(id_list=result_ids, search_field="has_output", fields="id,has_input,has_output", chunk_size=100)
+
+# clarify names
+for dataobject in analysis_dataobj:
+    dataobject["analysis_id"] = dataobject.pop("id")
+    dataobject["analysis_has_input"] = dataobject.pop("has_input")
+    dataobject["analysis_has_output"] = dataobject.pop("has_output")
+
+# convert to data frame
+analysis_dataobj_df = dp_client.convert_to_df(analysis_dataobj)
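A hypothetical follow-up, not part of this commit: because analysis_has_output holds lists of data object IDs, pandas' explode plus merge links each workflow execution back to the NOM data object it produced:

linked_df = analysis_dataobj_df.explode("analysis_has_output").merge(
    processed_nom_df,
    left_on="analysis_has_output",
    right_on="processed_nom_id",
    how="inner",
)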

nmdc_api_utilities/test/test_collection.py

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ def test_check_ids_exist(self):
     def test_check_ids_exist_multiple(self):
         # simple test to check if the check_ids_exist method returns a boolean
         ids = ["nmdc:bsm-11-002vgm56","nmdc:bsm-11-006pnx90","nmdc:bsm-11-00dkyf35","nmdc:bsm-11-00hrxp98","nmdc:bsm-11-00m15h97","nmdc:bsm-11-00yhef97","nmdc:bsm-11-011z7z70","nmdc:bsm-11-0169zs66","nmdc:bsm-11-01bbrr08","nmdc:bsm-11-01f6m423","nmdc:bsm-11-01g9wf51","nmdc:bsm-11-01jah904","nmdc:bsm-11-01teww33","nmdc:bsm-11-01vt2q72","nmdc:bsm-11-024rsd62","nmdc:bsm-11-02kcw433","nmdc:bsm-11-02n85875","nmdc:bsm-11-02v78297","nmdc:bsm-11-02x97z84","nmdc:bsm-11-034x5t48"]
+        # ids = ['nmdc:bsm-11-002vgm56','nmdc:bsm-11-006pnx90']
         collection = CollectionSearch("biosample_set", env=ENV)
         results = collection.check_ids_exist(ids)
         assert results == True
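A companion test the new flag suggests (hypothetical, not in this commit; the second ID is deliberately bogus):

    def test_check_ids_exist_missing(self):
        # one real ID plus one invented ID that should come back as missing
        ids = ["nmdc:bsm-11-002vgm56", "nmdc:bsm-11-notreal00"]
        collection = CollectionSearch("biosample_set", env=ENV)
        exists, missing = collection.check_ids_exist(ids, return_missing_ids=True)
        assert exists == False
        assert "nmdc:bsm-11-notreal00" in missing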
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+from nmdc_api_utilities.data_processing import DataProcessing
+from nmdc_api_utilities.data_object_search import DataObjectSearch
+from nmdc_api_utilities.workflow_execution_search import WorkflowExecutionSearch
+def test_nom_notebook():
+
+    dos_client = DataObjectSearch()
+
+    dp_client = DataProcessing()
+    processed_nom = dos_client.get_record_by_attribute(attribute_name='data_object_type', attribute_value='FT ICR-MS Analysis Results', max_page_size=100, fields='id,md5_checksum,url', all_pages=True)
+    # clarify names
+    for dataobject in processed_nom:
+        dataobject["processed_nom_id"] = dataobject.pop("id")
+        dataobject["processed_nom_md5_checksum"] = dataobject.pop("md5_checksum")
+        dataobject["processed_nom_url"] = dataobject.pop("url")
+
+    # convert to df
+    processed_nom_df = dp_client.convert_to_df(processed_nom)
+
+    # since we are querying the WorkflowExecution collection, we need to create an instance of it
+    we_client = WorkflowExecutionSearch()
+    # use utility function to get a list of the ids from processed_nom
+    result_ids = dp_client.extract_field(processed_nom, "processed_nom_id")
+    # get the analysis data objects
+    analysis_dataobj = we_client.get_batch_records(id_list=result_ids, search_field="has_output", fields="id,has_input,has_output", chunk_size=100)
+
+    # clarify names
+    for dataobject in analysis_dataobj:
+        dataobject["analysis_id"] = dataobject.pop("id")
+        dataobject["analysis_has_input"] = dataobject.pop("has_input")
+        dataobject["analysis_has_output"] = dataobject.pop("has_output")
+
+    # convert to data frame
+    analysis_dataobj_df = dp_client.convert_to_df(analysis_dataobj)
+    assert analysis_dataobj_df.shape[0] > 2000
+test_nom_notebook()

pyproject.toml

Lines changed: 3 additions & 1 deletion
@@ -4,7 +4,9 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "nmdc_api_utilities"
-version = "0.3.6"
+
+version = "0.3.7"
+
 description = "A Python library for general research functions using NMDC APIs"
 authors = [
     { name = "Olivia Hess", email = "[email protected]" },
