Skip to content

ENH: Enable output datasets to store checkpoints #911

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
24 changes: 24 additions & 0 deletions hi-ml-azure/.vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,30 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Output datasets",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/testazure/job_with_output_dataset.py",
"console": "integratedTerminal",
"justMyCode": false
},
{
"name": "Use output datasets",
"type": "python",
"request": "launch",
"program": "${workspaceFolder}/testazure/use_run_with_output_dataset.py",
"console": "integratedTerminal",
"justMyCode": false,
},
{
"name": "Run example script in AzureML",
"type": "python",
Expand Down
5 changes: 5 additions & 0 deletions hi-ml-azure/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,8 @@ test_all: pip_test call_flake8 call_mypy call_pytest_and_coverage
example: pip_local
echo 'edit src/health/azure/examples/elevate_this.py to reference your compute_cluster_name'
cd src/health/azure/examples; python elevate_this.py --azureml --message 'running example from makefile'

# Create conda environment
env:
conda env remove -n himl-azure
conda env create -f environment.yml
17 changes: 17 additions & 0 deletions hi-ml-azure/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,20 @@ name: himl-azure
dependencies:
- pip=20.1.1
- python=3.7.3
- pip:
- azure-ai-ml>=1.1.1
- azureml-core>=1.42.0
- azureml-dataset-runtime[fuse]>=1.42.0
- azureml-mlflow>=1.42.0
- azure-storage-blob==12.10.0
- azureml-tensorboard>=1.42.0
- azureml-train-core>=1.42.0
- conda-merge>=0.1.5
- mlflow>=1.29.0
- pandas>=1.3.4
- param>=1.12
- protobuf<4.0
- pysocks>=1.5.8
- ruamel.yaml>=0.16.12
- tensorboard>=2.6.0
- typing-extensions>=4.3.0
9 changes: 7 additions & 2 deletions hi-ml-azure/src/health_azure/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ def __init__(
use_mounting: Optional[bool] = None,
target_folder: Optional[PathOrString] = None,
local_folder: Optional[PathOrString] = None,
register_on_job_completion: bool = True,
):
"""
:param name: The name of the dataset, as it was registered in the AzureML workspace. For output datasets,
Expand All @@ -338,6 +339,9 @@ def __init__(
:param local_folder: The folder on the local machine at which the dataset is available. This
is used only for runs outside of AzureML. If this is empty then the target_folder will be used to
mount or download the dataset.
:param register_on_job_completion: Only for output datasets when using AML SDK v1: If this flag is True, the
dataset will be registered in the AML portal after the job has completed and visible in the "Data" section.
If this flag is False, the dataset will be visible for the job, but not in the AML portal "Data" section.
"""
# This class would be a good candidate for a dataclass, but having an explicit constructor makes
# documentation tools in the editor work nicer.
Expand All @@ -354,6 +358,7 @@ def __init__(
if str(self.target_folder) == ".":
raise ValueError("Can't mount or download a dataset to the current working directory.")
self.local_folder = Path(local_folder) if local_folder else None
self.register_on_job_completion = register_on_job_completion

def to_input_dataset_local(
self,
Expand Down Expand Up @@ -463,8 +468,8 @@ def to_output_dataset(self, workspace: Workspace, dataset_index: int) -> OutputF
dataset = OutputFileDatasetConfig(
name=_output_dataset_key(index=dataset_index), destination=(datastore, self.name + "/")
)
# TODO: Can we get tags into here too?
dataset = dataset.register_on_complete(name=self.name)
if self.register_on_job_completion:
dataset = dataset.register_on_complete(name=self.name)
if self.target_folder:
raise ValueError("Output datasets can't have a target_folder set.")
use_mounting = True if self.use_mounting is None else self.use_mounting
Expand Down
26 changes: 17 additions & 9 deletions hi-ml-azure/src/health_azure/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from azure.ai.ml.entities import Job
from azure.ai.ml.entities import Workspace as WorkspaceV2
from azure.ai.ml.entities import Environment as EnvironmentV2
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential
from azure.core.credentials import TokenCredential
from azure.core.exceptions import ClientAuthenticationError, ResourceNotFoundError
from azure.identity import (
Expand Down Expand Up @@ -1917,7 +1918,7 @@ def _get_legitimate_interactive_browser_credential() -> Optional[TokenCredential
return None


def get_credential() -> Optional[TokenCredential]:
def get_credential() -> TokenCredential:
"""
Get a credential for authenticating with Azure. There are multiple ways to retrieve a credential.
If environment variables pertaining to details of a Service Principal are available, those will be used
Expand All @@ -1926,9 +1927,10 @@ def get_credential() -> Optional[TokenCredential]:
device code (which requires the user to visit a link and enter a provided code). If this fails, or if running in
Azure, DefaultAzureCredential will be used which iterates through a number of possible authentication methods
including identifying an Azure managed identity, cached credentials from VS code, Azure CLI, Powershell etc.
Otherwise returns None.
    If none of those works, a ValueError is raised.

:return: Any of the aforementioned credentials if available, else None.
:raises ValueError: If no credential can be retrieved.
"""
service_principal_id = get_secret_from_environment(ENV_SERVICE_PRINCIPAL_ID, allow_missing=True)
tenant_id = get_secret_from_environment(ENV_TENANT_ID, allow_missing=True)
Expand All @@ -1938,17 +1940,23 @@ def get_credential() -> Optional[TokenCredential]:
return _get_legitimate_service_principal_credential(tenant_id, service_principal_id, service_principal_password)

try:
# When running in AzureML, this will also try managed identity.
cred = _get_legitimate_default_credential()
if cred is not None:
return cred
except ClientAuthenticationError:
cred = _get_legitimate_device_code_credential()
if cred is not None:
return cred

cred = _get_legitimate_interactive_browser_credential()
if cred is not None:
return cred
if is_running_in_azure_ml():
# In AzureML, we can try the AzureMLOnBehalfOfCredential credential. This credential does not need
# to be validated (in fact, it raises errors when we try to validate it by getting a token)
return AzureMLOnBehalfOfCredential()
else:
# Outside of AzureML, try any of the interactive authentication methods
cred = _get_legitimate_device_code_credential()
if cred is not None:
return cred
cred = _get_legitimate_interactive_browser_credential()
if cred is not None:
return cred

raise ValueError(
"Unable to generate and validate a credential. Please see Azure ML documentation"
Expand Down
41 changes: 41 additions & 0 deletions hi-ml-azure/testazure/job_with_output_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from datetime import datetime
from pathlib import Path
import sys
import uuid


src_root = Path(__file__).parents[1] / "src"
sys.path.append(str(src_root))

from health_azure import submit_to_azure_if_needed, DatasetConfig


def main():
    """Submit a job to AzureML that creates an output dataset and writes a file into it.

    Uses the AML SDK v1 submission path (``strictly_aml_v1=True``). When running inside
    AzureML, ``run_info.output_datasets[0]`` is the local folder backing the output dataset.
    """
    # BUGFIX: timestamp/suffix were previously assigned *inside* the DatasetConfig(...)
    # argument list, which is a syntax error. Compute them first, then pass the name.
    # A timestamp plus a short random suffix makes the dataset name unique per run.
    timestamp = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
    suffix = uuid.uuid4().hex[:6]

    # Define the output dataset
    output_dataset = DatasetConfig(
        # The dataset name will also be the name of the folder in the datastore
        name=f"joboutputs-{timestamp}-{suffix}",
        datastore='workspaceblobstore',
    )

    # Submit the script to Azure if needed. Outside AzureML this submits the job and
    # exits; inside AzureML it returns run info with the mounted output dataset folders.
    run_info = submit_to_azure_if_needed(
        snapshot_root_directory=Path(__file__).parents[1],
        output_datasets=[output_dataset],
        compute_cluster_name="lite-testing-ds2",
        submit_to_azureml=True,
        strictly_aml_v1=True,
    )

    # Write a marker file into the output dataset folder to prove the roundtrip works.
    output_folder = run_info.output_datasets[0]
    print(f"Output folder: {output_folder}")
    output_file = output_folder / "output.txt"
    output_file.write_text('Hello, world!')

    print("Done!")


if __name__ == "__main__":
    main()
77 changes: 77 additions & 0 deletions hi-ml-azure/testazure/use_run_with_output_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
from pathlib import Path
import sys

from azure.ai.ml import MLClient
from azure.ai.ml.identity import AzureMLOnBehalfOfCredential

src_root = Path(__file__).parents[1] / "src"
sys.path.append(str(src_root))

from health_azure.himl import submit_to_azure_if_needed
from health_azure.utils import get_ml_client, get_workspace, get_credential
from azure.storage.blob import BlobServiceClient


def main() -> None:
    """Inspect the output dataset of a previous AzureML job and list its files in blob storage.

    Intended to run *inside* an AzureML job, where the on-behalf-of credential and the
    MLFLOW_TRACKING_URI environment variable are available.
    """
    # Check out if we can get the credential
    credential = AzureMLOnBehalfOfCredential()
    try:
        credential.get_token("https://management.azure.com/.default")
    except Exception:
        # Best-effort probe only: the failure is printed, and the same credential type
        # is still used below. NOTE(review): deliberate for this experiment script.
        print("Failed to get the credential")
    # Recover workspace coordinates by parsing the MLflow tracking URI that AzureML
    # sets for the job, e.g. .../subscriptions/<id>/resourceGroups/<rg>/.../workspaces/<ws>.
    # Raises KeyError if the variable is unset (i.e. when not running in AzureML).
    uri = os.environ["MLFLOW_TRACKING_URI"]
    uri_segments = uri.split("/")
    subscription_id = uri_segments[uri_segments.index("subscriptions") + 1]
    resource_group_name = uri_segments[uri_segments.index("resourceGroups") + 1]
    workspace_name = uri_segments[uri_segments.index("workspaces") + 1]
    credential = AzureMLOnBehalfOfCredential()
    # Build an SDK v2 client from the parsed coordinates and the on-behalf-of credential.
    ml_client = MLClient(
        credential=credential,
        subscription_id=subscription_id,
        resource_group_name=resource_group_name,
        workspace_name=workspace_name,
    )
    print("Got the client")

    # Hard-coded ID of a previously completed job that produced an output dataset.
    run_id = "sincere_yacht_xjz95gwvq8"
    # First try the SDK v1 route: v1 Run objects expose output_datasets directly.
    workspace = get_workspace()
    run = workspace.get_run(run_id)
    if hasattr(run, "output_datasets"):
        print(run.output_datasets)
    else:
        print("No output datasets")

    # Then the SDK v2 route: read the job's first output binding.
    # "OUTPUT_0" is presumably the key produced by _output_dataset_key(index=0) — verify.
    job = ml_client.jobs.get(run_id)
    output_dataset = job.outputs["OUTPUT_0"]

    # Resolve the datastore to get the storage account/container backing the output.
    datastore = ml_client.datastores.get("workspaceblobstore")
    print(datastore.account_name)
    print(datastore.container_name)
    account_url = f"{datastore.protocol}://{datastore.account_name}.blob.{datastore.endpoint}"
    print(f"{output_dataset.path}")

    # Access blob storage with the same on-behalf-of credential.
    blob_client = BlobServiceClient(account_url=account_url, credential=credential)
    container_client = blob_client.get_container_client(datastore.container_name)

    # List all blobs (files) inside a specific folder (prefix)
    # The dataset URI has the form ".../paths/<folder>"; take the part after "/paths/".
    paths_parts = output_dataset.path.split("/paths/")
    assert len(paths_parts) == 2
    folder_name = paths_parts[1]
    blob_list = [blob.name for blob in container_client.list_blobs(name_starts_with=folder_name)]
    print(f"Files in {folder_name}:")
    for blob_name in blob_list:
        print(blob_name)

    # Get the client without further authentication.
    ml_client2 = get_ml_client()


if __name__ == "__main__":
submit_to_azure_if_needed(
snapshot_root_directory=Path(__file__).parents[2],
compute_cluster_name="lite-testing-ds2",
strictly_aml_v1=True,
submit_to_azureml=True,
)
main()