Skip to content

ENH: Enable output datasets to store checkpoints #911

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
24 changes: 24 additions & 0 deletions hi-ml-azure/.vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,30 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Output datasets",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/testazure/job_with_output_dataset.py",
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Use output datasets",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/testazure/use_run_with_output_dataset.py",
"console": "integratedTerminal",
"justMyCode": false,
},
{
"name": "Run example script in AzureML",
"type": "python",
Expand Down
5 changes: 5 additions & 0 deletions hi-ml-azure/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,8 @@ test_all: pip_test call_flake8 call_mypy call_pytest_and_coverage
example: pip_local
echo 'edit src/health/azure/examples/elevate_this.py to reference your compute_cluster_name'
cd src/health/azure/examples; python elevate_this.py --azureml --message 'running example from makefile'

# Create conda environment
env:
conda env remove -n himl-azure
conda env create -f environment.yml
17 changes: 17 additions & 0 deletions hi-ml-azure/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,20 @@ name: himl-azure
dependencies:
- pip=20.1.1
- python=3.7.3
- pip:
- azure-ai-ml>=1.1.1
- azureml-core>=1.42.0
- azureml-dataset-runtime[fuse]>=1.42.0
- azureml-mlflow>=1.42.0
- azure-storage-blob==12.10.0
- azureml-tensorboard>=1.42.0
- azureml-train-core>=1.42.0
- conda-merge>=0.1.5
- mlflow>=1.29.0
- pandas>=1.3.4
- param>=1.12
- protobuf<4.0
- pysocks>=1.5.8
- ruamel.yaml>=0.16.12
- tensorboard>=2.6.0
- typing-extensions>=4.3.0
9 changes: 7 additions & 2 deletions hi-ml-azure/src/health_azure/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ def __init__(
use_mounting: Optional[bool] = None,
target_folder: Optional[PathOrString] = None,
local_folder: Optional[PathOrString] = None,
register_on_job_completion: bool = True,
):
"""
:param name: The name of the dataset, as it was registered in the AzureML workspace. For output datasets,
Expand All @@ -338,6 +339,9 @@ def __init__(
:param local_folder: The folder on the local machine at which the dataset is available. This
is used only for runs outside of AzureML. If this is empty then the target_folder will be used to
mount or download the dataset.
:param register_on_job_completion: Only for output datasets when using AML SDK v1: If this flag is True, the
dataset will be registered in the AML portal after the job has completed and visible in the "Data" section.
If this flag is False, the dataset will be visible for the job, but not in the AML portal "Data" section.
"""
# This class would be a good candidate for a dataclass, but having an explicit constructor makes
# documentation tools in the editor work nicer.
Expand All @@ -354,6 +358,7 @@ def __init__(
if str(self.target_folder) == ".":
raise ValueError("Can't mount or download a dataset to the current working directory.")
self.local_folder = Path(local_folder) if local_folder else None
self.register_on_job_completion = register_on_job_completion

def to_input_dataset_local(
self,
Expand Down Expand Up @@ -463,8 +468,8 @@ def to_output_dataset(self, workspace: Workspace, dataset_index: int) -> OutputF
dataset = OutputFileDatasetConfig(
name=_output_dataset_key(index=dataset_index), destination=(datastore, self.name + "/")
)
# TODO: Can we get tags into here too?
dataset = dataset.register_on_complete(name=self.name)
if self.register_on_job_completion:
dataset = dataset.register_on_complete(name=self.name)
if self.target_folder:
raise ValueError("Output datasets can't have a target_folder set.")
use_mounting = True if self.use_mounting is None else self.use_mounting
Expand Down
26 changes: 17 additions & 9 deletions hi-ml-azure/src/health_azure/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from azure.ai.ml.entities import Job
from azure.ai.ml.entities import Workspace as WorkspaceV2
from azure.ai.ml.entities import Environment as EnvironmentV2
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
from azure.core.credentials import TokenCredential
from azure.core.exceptions import ClientAuthenticationError, ResourceNotFoundError
from azure.identity import (
Expand Down Expand Up @@ -1917,7 +1918,7 @@ def _get_legitimate_interactive_browser_credential() -> Optional[TokenCredential
return None


def get_credential() -> Optional[TokenCredential]:
def get_credential() -> TokenCredential:
"""
Get a credential for authenticating with Azure. There are multiple ways to retrieve a credential.
If environment variables pertaining to details of a Service Principal are available, those will be used
Expand All @@ -1926,9 +1927,10 @@ def get_credential() -> Optional[TokenCredential]:
device code (which requires the user to visit a link and enter a provided code). If this fails, or if running in
Azure, DefaultAzureCredential will be used which iterates through a number of possible authentication methods
including identifying an Azure managed identity, cached credentials from VS code, Azure CLI, Powershell etc.
Otherwise returns None.
    If none of those works, a ValueError is raised.

:return: Any of the aforementioned credentials if available, else None.
:raises ValueError: If no credential can be retrieved.
"""
service_principal_id = get_secret_from_environment(ENV_SERVICE_PRINCIPAL_ID, allow_missing=True)
tenant_id = get_secret_from_environment(ENV_TENANT_ID, allow_missing=True)
Expand All @@ -1938,17 +1940,23 @@ def get_credential() -> Optional[TokenCredential]:
return _get_legitimate_service_principal_credential(tenant_id, service_principal_id, service_principal_password)

try:
# When running in AzureML, this will also try managed identity.
cred = _get_legitimate_default_credential()
if cred is not None:
return cred
except ClientAuthenticationError:
cred = _get_legitimate_device_code_credential()
if cred is not None:
return cred

cred = _get_legitimate_interactive_browser_credential()
if cred is not None:
return cred
if is_running_in_azure_ml():
# In AzureML, we can try the AzureMLOnBehalfOfCredential credential. This credential does not need
# to be validated (in fact, it raises errors when we try to validate it by getting a token)
return AzureMLOnBehalfOfCredential()
else:
# Outside of AzureML, try any of the interactive authentication methods
cred = _get_legitimate_device_code_credential()
if cred is not None:
return cred
cred = _get_legitimate_interactive_browser_credential()
if cred is not None:
return cred

raise ValueError(
"Unable to generate and validate a credential. Please see Azure ML documentation"
Expand Down
41 changes: 41 additions & 0 deletions hi-ml-azure/testazure/job_with_output_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from datetime import datetime
from pathlib import Path
import sys
import uuid


src_root = Path(__file__).parents[1] / "src"
sys.path.append(str(src_root))

from health_azure import submit_to_azure_if_needed, DatasetConfig


def main():
    """Submit a job to AzureML that creates an output dataset and writes a file into it.

    Uses the AML SDK v1 submission path (``strictly_aml_v1=True``). When running inside
    AzureML, ``run_info.output_datasets[0]`` is the local folder backing the output dataset.
    """
    # BUGFIX: timestamp/suffix were previously assigned *inside* the DatasetConfig(...)
    # argument list, which is a syntax error. Compute them first, then pass the name.
    # A timestamp plus a short random suffix makes the dataset name unique per run.
    timestamp = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
    suffix = uuid.uuid4().hex[:6]

    # Define the output dataset
    output_dataset = DatasetConfig(
        # The dataset name will also be the name of the folder in the datastore
        name=f"joboutputs-{timestamp}-{suffix}",
        datastore='workspaceblobstore',
    )

    # Submit the script to Azure if needed. Outside AzureML this submits the job and
    # exits; inside AzureML it returns run info with the mounted output dataset folders.
    run_info = submit_to_azure_if_needed(
        snapshot_root_directory=Path(__file__).parents[1],
        output_datasets=[output_dataset],
        compute_cluster_name="lite-testing-ds2",
        submit_to_azureml=True,
        strictly_aml_v1=True,
    )

    # Write a marker file into the output dataset folder to prove the roundtrip works.
    output_folder = run_info.output_datasets[0]
    print(f"Output folder: {output_folder}")
    output_file = output_folder / "output.txt"
    output_file.write_text('Hello, world!')

    print("Done!")


if __name__ == "__main__":
    main()
77 changes: 77 additions & 0 deletions hi-ml-azure/testazure/use_run_with_output_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
from pathlib import Path
import sys

from azure.ai.ml import MLClient
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential

src_root = Path(__file__).parents[1] / "src"
sys.path.append(str(src_root))

from health_azure.himl import submit_to_azure_if_needed
from health_azure.utils import get_ml_client, get_workspace, get_credential
from azure.storage.blob import BlobServiceClient


def main() -> None:
    """Inspect the output dataset of a previous AzureML job and list its files in blob storage.

    Intended to run *inside* an AzureML job, where the on-behalf-of credential and the
    MLFLOW_TRACKING_URI environment variable are available.
    """
    # Check out if we can get the credential
    credential = AzureMLOnBehalfOfCredential()
    try:
        credential.get_token("https://management.azure.com/.default")
    except Exception:
        # Best-effort probe only: the failure is printed, and the same credential type
        # is still used below. NOTE(review): deliberate for this experiment script.
        print("Failed to get the credential")
    # Recover workspace coordinates by parsing the MLflow tracking URI that AzureML
    # sets for the job, e.g. .../subscriptions/<id>/resourceGroups/<rg>/.../workspaces/<ws>.
    # Raises KeyError if the variable is unset (i.e. when not running in AzureML).
    uri = os.environ["MLFLOW_TRACKING_URI"]
    uri_segments = uri.split("/")
    subscription_id = uri_segments[uri_segments.index("subscriptions") + 1]
    resource_group_name = uri_segments[uri_segments.index("resourceGroups") + 1]
    workspace_name = uri_segments[uri_segments.index("workspaces") + 1]
    credential = AzureMLOnBehalfOfCredential()
    # Build an SDK v2 client from the parsed coordinates and the on-behalf-of credential.
    ml_client = MLClient(
        credential=credential,
        subscription_id=subscription_id,
        resource_group_name=resource_group_name,
        workspace_name=workspace_name,
    )
    print("Got the client")

    # Hard-coded ID of a previously completed job that produced an output dataset.
    run_id = "sincere_yacht_xjz95gwvq8"
    # First try the SDK v1 route: v1 Run objects expose output_datasets directly.
    workspace = get_workspace()
    run = workspace.get_run(run_id)
    if hasattr(run, "output_datasets"):
        print(run.output_datasets)
    else:
        print("No output datasets")

    # Then the SDK v2 route: read the job's first output binding.
    # "OUTPUT_0" is presumably the key produced by _output_dataset_key(index=0) — verify.
    job = ml_client.jobs.get(run_id)
    output_dataset = job.outputs["OUTPUT_0"]

    # Resolve the datastore to get the storage account/container backing the output.
    datastore = ml_client.datastores.get("workspaceblobstore")
    print(datastore.account_name)
    print(datastore.container_name)
    account_url = f"{datastore.protocol}://{datastore.account_name}.blob.{datastore.endpoint}"
    print(f"{output_dataset.path}")

    # Access blob storage with the same on-behalf-of credential.
    blob_client = BlobServiceClient(account_url=account_url, credential=credential)
    container_client = blob_client.get_container_client(datastore.container_name)

    # List all blobs (files) inside a specific folder (prefix)
    # The dataset URI has the form ".../paths/<folder>"; take the part after "/paths/".
    paths_parts = output_dataset.path.split("/paths/")
    assert len(paths_parts) == 2
    folder_name = paths_parts[1]
    blob_list = [blob.name for blob in container_client.list_blobs(name_starts_with=folder_name)]
    print(f"Files in {folder_name}:")
    for blob_name in blob_list:
        print(blob_name)

    # Get the client without further authentication.
    ml_client2 = get_ml_client()


if __name__ == "__main__":
submit_to_azure_if_needed(
snapshot_root_directory=Path(__file__).parents[2],
compute_cluster_name="lite-testing-ds2",
strictly_aml_v1=True,
submit_to_azureml=True,
)
main()