Skip to content

Add unrestricted_use_only and surveillance_use_only constructor params #724

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 21 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
fc6c2ba
WIP: dev support for unrestricted_use_only, surveillance_use_only
leehart Feb 7, 2025
dfdd4e2
Add test sample sets for Af1 with unrestricted_use_only. Add relevant…
leehart Feb 18, 2025
1b6cbb9
Update comment re skipping test due to lack of relevant fixtures
leehart Feb 18, 2025
02921c9
Add surveillance flags to sample_metadata(). Add tests.
leehart Feb 20, 2025
d4af40a
Merge branch 'master' into GH716_add_constructor_params
leehart Feb 20, 2025
d4e7e70
Merge branch 'master' into GH716_add_constructor_params
leehart Mar 18, 2025
19902b0
WIP: add _prep_sample_query_param() stub where _prep_sample_set_param()
leehart Mar 20, 2025
d7b8383
Add logic to _prep_sample_query_param() to honour self._surveillance_…
leehart Mar 21, 2025
de0daf8
Merge branch 'master' into GH716_add_constructor_params
leehart Mar 21, 2025
435e8a7
Allow _prep_sample_query_param() to return None
leehart Mar 21, 2025
bde3d4e
Return consistent data type from _prep_sample_query_param()
leehart Mar 21, 2025
bfed3f4
Merge branch 'master' into GH716_add_constructor_params
leehart Apr 8, 2025
78d26d1
Merge branch 'master' into GH716_add_constructor_params
leehart Apr 8, 2025
6396126
Merge branch 'master' into GH716_add_constructor_params
leehart Apr 24, 2025
50b3f5c
Add new public_url param to sample_metadata tests
leehart Apr 24, 2025
23e9012
Merge branch 'master' into GH716_add_constructor_params
leehart Apr 29, 2025
ea950fc
Merge branch 'master' into GH716_add_constructor_params
leehart May 1, 2025
fdebfd4
WIP: dev support for unrestricted_use_only, surveillance_use_only params
leehart May 23, 2025
62a848e
Merge branch 'master' into GH716_add_constructor_params
leehart May 23, 2025
d125707
WIP: amend data types
leehart May 23, 2025
a9f44c4
Add doc for _surveillance_flags sample_sets param
leehart May 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 33 additions & 8 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
unrestricted_use_only=False,
surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
Expand Down Expand Up @@ -127,18 +129,23 @@ def __init__(
virtual_contigs=None,
gene_names=None,
inversion_tag_path=None,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
)

def __repr__(self):
text = (
f"<MalariaGEN Af1 API client>\n"
f"Storage URL : {self._url}\n"
f"Data releases available : {', '.join(self.releases)}\n"
f"Results cache : {self._results_cache}\n"
f"Cohorts analysis : {self._cohorts_analysis}\n"
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Storage URL : {self._url}\n"
f"Data releases available : {', '.join(self._available_releases)}\n"
f"Results cache : {self._results_cache}\n"
f"Cohorts analysis : {self._cohorts_analysis}\n"
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"Relevant data releases : {', '.join(self.releases)}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
Expand Down Expand Up @@ -172,7 +179,7 @@ def _repr_html_(self):
<th style="text-align: left">
Data releases available
</th>
<td>{', '.join(self.releases)}</td>
<td>{', '.join(self._available_releases)}</td>
</tr>
<tr>
<th style="text-align: left">
Expand Down Expand Up @@ -204,6 +211,24 @@ def _repr_html_(self):
</th>
<td>{self.client_location}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for unrestricted use only
</th>
<td>{self._unrestricted_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for surveillance use only
</th>
<td>{self._surveillance_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Relevant data releases
</th>
<td>{', '.join(self.releases)}</td>
</tr>
</tbody>
</table>
"""
Expand Down
73 changes: 63 additions & 10 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ def __init__(
discordant_read_calls_analysis=None,
pre=False,
tqdm_class=None,
unrestricted_use_only=False,
surveillance_use_only=False,
**storage_options, # used by fsspec via init_filesystem()
):
super().__init__(
Expand Down Expand Up @@ -195,6 +197,8 @@ def __init__(
virtual_contigs=VIRTUAL_CONTIGS,
gene_names=GENE_NAMES,
inversion_tag_path=INVERSION_TAG_PATH,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
)

# set up caches
Expand All @@ -206,21 +210,24 @@ def v3_wild(self):
3.0 release, excluding the lab crosses."""
return [
x
for x in self.sample_sets(release="3.0")["sample_set"].tolist()
for x in self._available_sample_sets(release="3.0")["sample_set"].tolist()
if x != "AG1000G-X"
]

def __repr__(self):
text = (
f"<MalariaGEN Ag3 API client>\n"
f"Storage URL : {self._url}\n"
f"Data releases available : {', '.join(self.releases)}\n"
f"Results cache : {self._results_cache}\n"
f"Cohorts analysis : {self._cohorts_analysis}\n"
f"AIM analysis : {self._aim_analysis}\n"
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Storage URL : {self._url}\n"
f"Data releases available : {', '.join(self._available_releases)}\n"
f"Results cache : {self._results_cache}\n"
f"Cohorts analysis : {self._cohorts_analysis}\n"
f"AIM analysis : {self._aim_analysis}\n"
f"Site filters analysis : {self._site_filters_analysis}\n"
f"Software version : malariagen_data {malariagen_data.__version__}\n"
f"Client location : {self.client_location}\n"
f"Data filtered to unrestricted use only: {self._unrestricted_use_only}\n"
f"Data filtered to surveillance use only: {self._surveillance_use_only}\n"
f"Relevant data releases : {', '.join(self.releases)}\n"
f"---\n"
f"Please note that data are subject to terms of use,\n"
f"for more information see https://www.malariagen.net/data\n"
Expand Down Expand Up @@ -254,7 +261,7 @@ def _repr_html_(self):
<th style="text-align: left">
Data releases available
</th>
<td>{', '.join(self.releases)}</td>
<td>{', '.join(self._available_releases)}</td>
</tr>
<tr>
<th style="text-align: left">
Expand Down Expand Up @@ -292,6 +299,24 @@ def _repr_html_(self):
</th>
<td>{self.client_location}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for unrestricted use only
</th>
<td>{self._unrestricted_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Data filtered for surveillance use only
</th>
<td>{self._surveillance_use_only}</td>
</tr>
<tr>
<th style="text-align: left">
Relevant data releases
</th>
<td>{', '.join(self.releases)}</td>
</tr>
</tbody>
</table>
"""
Expand Down Expand Up @@ -339,6 +364,34 @@ def cross_metadata(self):
debug("drop 'phenotype' column, not used")
df.drop("phenotype", axis="columns", inplace=True)

# Identify the crosses sample set.
# Note: this sample set identifier is also hard-coded in `v3_wild()`.
crosses_sample_set = "AG1000G-X"

# If `_unrestricted_use_only` is `True`, then only return data if the crosses sample set has `unrestricted_use` set to `True`.
if (
self._unrestricted_use_only
and not self._sample_set_has_unrestricted_use(
sample_set=crosses_sample_set
)
):
# Remove all the data from the DataFrame and reset its index.
df = df.iloc[0:0].reset_index(drop=True)

# If `_surveillance_use_only` is `True`, then only return samples that have `is_surveillance` set to `True`.
if self._surveillance_use_only:
crosses_surveillance_flags_df = self._surveillance_flags(
sample_sets=[crosses_sample_set]
)
df = df.merge(
crosses_surveillance_flags_df[["sample_id", "is_surveillance"]],
on="sample_id",
how="left",
)
df = df[df["is_surveillance"]]
df = df.drop(columns=["is_surveillance"])

# Cache the cross metadata.
self._cache_cross_metadata = df

return self._cache_cross_metadata.copy()
Expand Down
69 changes: 51 additions & 18 deletions malariagen_data/anoph/aim_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,31 +138,64 @@ def aim_calls(
) -> xr.Dataset:
self._require_aim_analysis()

# Normalise parameters.
aims = self._prep_aims_param(aims=aims)
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
# Prepare parameters.
prepared_aims = self._prep_aims_param(aims=aims)
del aims
prepared_sample_sets = self._prep_sample_sets_param(sample_sets=sample_sets)
del sample_sets

# Access SNP calls and concatenate multiple sample sets and/or regions.
ly = []
for s in sample_sets_prepped:
y = self._aim_calls_dataset(
aims=aims,
sample_set=s,
prepared_sample_query = self._prep_sample_query_param(sample_query=sample_query)
del sample_query

# Start a list of AIM calls Datasets, one for each sample set.
aim_calls_datasets = []

# For each sample set...
for sample_set in prepared_sample_sets:
# Get the AIM calls for all samples in the set, as an Xarray Dataset.
aim_calls_dataset = self._aim_calls_dataset(
aims=prepared_aims,
sample_set=sample_set,
)
ly.append(y)

# Add this Dataset to the list.
aim_calls_datasets.append(aim_calls_dataset)

# Concatenate data from multiple sample sets.
ds = simple_xarray_concat(ly, dim=DIM_SAMPLE)
ds = simple_xarray_concat(aim_calls_datasets, dim=DIM_SAMPLE)

# Handle sample query.
if sample_query is not None:
df_samples = self.sample_metadata(sample_sets=sample_sets_prepped)
# If there's a sample query...
if prepared_sample_query is not None:
# Get the relevant sample metadata.
df_samples = self.sample_metadata(sample_sets=prepared_sample_sets)

# If there are no sample query options, then default to an empty dict.
sample_query_options = sample_query_options or {}
loc_samples = df_samples.eval(sample_query, **sample_query_options).values

# Determine which samples match the sample query.
loc_samples = df_samples.eval(
prepared_sample_query, **sample_query_options
).values

# Raise an error if no samples match the sample query.
if np.count_nonzero(loc_samples) == 0:
raise ValueError(f"No samples found for query {sample_query!r}")
ds = ds.isel(samples=loc_samples)
raise ValueError(
f"No samples found for query {prepared_sample_query!r}"
)

# Get the relevant sample ids from the sample metadata DataFrame, using the boolean mask.
relevant_sample_ids = df_samples.loc[loc_samples, "sample_id"].values

# Get all the sample ids from the unfiltered AIM calls Dataset.
ds_sample_ids = ds.coords["sample_id"].values

# Get the indices of samples in the AIM calls Dataset that match the relevant sample ids.
# Note: we use `[0]` to get the first element of the tuple returned by `np.where`.
relevant_sample_indices = np.where(
np.isin(ds_sample_ids, relevant_sample_ids)
)[0]

# Select only the relevant samples from the AIM calls Dataset.
ds = ds.isel(samples=relevant_sample_indices)

return ds

Expand Down
Loading
Loading