Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/resource_sources.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ gnomAD data is available through `multiple cloud providers' public datasets prog
The functions in the :doc:`gnomad.resources </api_reference/resources/index>` package can be configured to load data from different sources.

If Hail determines that it is running in a cloud provider's Spark environment, resources will default to being read from that cloud provider's datasets program.
For example, resources will be read from Azure Open Datasets if Hail determines that it is running on an Azure HDInsight cluster.
Otherwise, resources will default to being read from Google Cloud Public Datasets.
This can be configured using the ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` environment variable.

Expand Down
3 changes: 0 additions & 3 deletions gnomad/resources/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ class GnomadPublicResourceSource(Enum):
GNOMAD = "gnomAD"
GOOGLE_CLOUD_PUBLIC_DATASETS = "Google Cloud Public Datasets"
REGISTRY_OF_OPEN_DATA_ON_AWS = "Registry of Open Data on AWS"
AZURE_OPEN_DATASETS = "Azure Open Datasets"


def get_default_public_resource_source() -> Union[GnomadPublicResourceSource, str]:
Expand All @@ -25,7 +24,6 @@ def get_default_public_resource_source() -> Union[GnomadPublicResourceSource, st

- If the ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` environment variable is set, use the source configured there.
- Otherwise, if Hail determines that it is running in a cloud provider's Spark environment, use the source from that cloud provider.
For example, use Azure Open Datasets if running on an Azure HDInsight cluster.
- Otherwise, use Google Cloud Public Datasets.

:returns: Default resource source
Expand Down Expand Up @@ -54,7 +52,6 @@ def get_default_public_resource_source() -> Union[GnomadPublicResourceSource, st
cloud_spark_provider = guess_cloud_spark_provider()
default_resource_sources_by_provider = {
"dataproc": GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS,
"hdinsight": GnomadPublicResourceSource.AZURE_OPEN_DATASETS,
}
if cloud_spark_provider:
try:
Expand Down
3 changes: 0 additions & 3 deletions gnomad/resources/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,9 +592,6 @@ def _get_path(self) -> str:
if resource_source == GnomadPublicResourceSource.REGISTRY_OF_OPEN_DATA_ON_AWS:
return f"s3a://gnomad-public-us-east-1{relative_path}"

if resource_source == GnomadPublicResourceSource.AZURE_OPEN_DATASETS:
return f"wasbs://dataset@datasetgnomad.blob.core.windows.net{relative_path}"

return (
f"{resource_source.rstrip('/')}{relative_path}" # pylint: disable=no-member
)
Expand Down
18 changes: 1 addition & 17 deletions tests/resources/test_resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,6 @@ class TestDefaultPublicResourceSource:
GnomadPublicResourceSource.REGISTRY_OF_OPEN_DATA_ON_AWS,
"s3a://gnomad-public-us-east-1/example.ht",
),
(
GnomadPublicResourceSource.AZURE_OPEN_DATASETS,
"wasbs://dataset@datasetgnomad.blob.core.windows.net/example.ht",
),
(
"gs://my-bucket/gnomad-resources",
"gs://my-bucket/gnomad-resources/example.ht",
Expand Down Expand Up @@ -154,7 +150,6 @@ def test_read_from_default_source(self, default_source, expected_path):
"Registry of Open Data on AWS",
GnomadPublicResourceSource.REGISTRY_OF_OPEN_DATA_ON_AWS,
),
("Azure Open Datasets", GnomadPublicResourceSource.AZURE_OPEN_DATASETS),
("gs://my-bucket/gnomad-resources", "gs://my-bucket/gnomad-resources"),
],
)
Expand All @@ -172,7 +167,6 @@ def test_get_default_source_from_environment(
"cloud_spark_provider,expected_default_source",
[
("dataproc", GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS),
("hdinsight", GnomadPublicResourceSource.AZURE_OPEN_DATASETS),
("unknown", GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS),
(None, GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious why the None is being dropped too since it looks like we would default to google in this case?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops, I thought this was something Cursor added -- will add back

],
Expand All @@ -198,7 +192,7 @@ def test_default_source_from_environment_overrides_cloud_spark_provider(self):
with (
patch(
"hail.utils.guess_cloud_spark_provider",
return_value="hdinsight",
return_value="dataproc",
create=True,
),
patch.dict(
Expand Down Expand Up @@ -255,16 +249,6 @@ def gnomad_public_resource_test_parameters(
GnomadPublicResourceSource.REGISTRY_OF_OPEN_DATA_ON_AWS,
f"s3a://gnomad-public-us-east-1{path}",
),
(
f"gs://gnomad-public{path}",
GnomadPublicResourceSource.AZURE_OPEN_DATASETS,
f"wasbs://dataset@datasetgnomad.blob.core.windows.net{path}",
),
(
f"gs://gnomad-public-requester-pays{path}",
GnomadPublicResourceSource.AZURE_OPEN_DATASETS,
f"wasbs://dataset@datasetgnomad.blob.core.windows.net{path}",
),
(
f"gs://gnomad-public{path}",
"gs://my-bucket/gnomad-resources",
Expand Down