diff --git a/.github/workflows/full-stack-test.yml b/.github/workflows/full-stack-test.yml index 9c86181df81..28cb5623dd1 100644 --- a/.github/workflows/full-stack-test.yml +++ b/.github/workflows/full-stack-test.yml @@ -11,6 +11,7 @@ on: jobs: test: runs-on: ubuntu-latest + timeout-minutes: 30 steps: - name: Check out source @@ -21,23 +22,28 @@ jobs: python -m pip install --upgrade pip pip install . kubernetes - - name: Bring up the environment run: | echo "Starting environment in the background..." MINIKUBE_CPUS=2 metaflow-dev all-up & - # Give time to spin up. Adjust as needed: - WAIT_TIMEOUT=600 metaflow-dev wait-until-ready + WAIT_TIMEOUT=900 metaflow-dev wait-until-ready - name: Wait & run flow run: | - # When the environment is up, metaflow-dev shell will wait for readiness - # and then drop into a shell. We feed commands via a heredoc: cat </dev/null || true + echo "=== Recent tilt logs (last 50 lines) ===" + tail -50 /tmp/tilt.log 2>/dev/null || true + - name: Tear down environment + if: always() run: | metaflow-dev down diff --git a/.github/workflows/ux-tests.yml b/.github/workflows/ux-tests.yml index c092480cbf4..c094bd8560e 100644 --- a/.github/workflows/ux-tests.yml +++ b/.github/workflows/ux-tests.yml @@ -112,6 +112,12 @@ jobs: workers: 1 memory: 7168 timeout: 1200 + - backend: gcs-local + services: "minio,postgresql,metadata-service,fake-gcs-server" + workers: 4 + memory: 6144 + timeout: 900 + extra_args: '-k "not conda"' - backend: sfn-batch services: "minio,postgresql,metadata-service,localbatch,ddb-local,sfn-local" workers: 2 @@ -151,6 +157,10 @@ jobs: pip install --upgrade pip pip install -e ".[dev]" + - name: Install GCS dependencies + if: matrix.backend == 'gcs-local' + run: pip install google-cloud-storage + - name: Set up minikube uses: medyagh/setup-minikube@aba8d5ff1666d19b9549133e3b92e70d4fc52cb7 with: @@ -226,7 +236,7 @@ jobs: SERVICES: ${{ matrix.services }} - name: Pre-pull python:3.9 into minikube - if: matrix.backend != 'local' && matrix.backend != 'sfn-batch' + if: matrix.backend != 'local' && matrix.backend != 'sfn-batch' && matrix.backend != 'gcs-local' run: minikube image pull python:3.9 - name: Save minikube images to cache @@ -253,6 +263,12 @@ jobs: if: matrix.backend == 'airflow-kubernetes' run: devtools/ci/wait-airflow-api.sh + - name: Set GCS emulator environment + if: matrix.backend == 'gcs-local' + run: | + echo "METAFLOW_DEFAULT_DATASTORE=gs" >> $GITHUB_ENV + echo "STORAGE_EMULATOR_HOST=http://localhost:4443" >> $GITHUB_ENV + - name: Clean up completed pods and start background cleanup if: matrix.backend == 'airflow-kubernetes' run: | @@ -283,7 +299,8 @@ jobs: --cov-report=xml:coverage.xml \ --cov-report=html:htmlcov \ --cov-branch \ - --junit-xml=junit-${{ matrix.backend }}.xml + --junit-xml=junit-${{ matrix.backend }}.xml \ + ${{ matrix.extra_args || '' }} - name: Upload coverage data if: always() diff --git a/devtools/Tiltfile b/devtools/Tiltfile index cd903f51a69..7f64a11a377 100644 --- a/devtools/Tiltfile +++ b/devtools/Tiltfile @@ -30,6 +30,7 @@ components = { "ddb-local": [], "sfn-local": ["ddb-local"], "airflow": ["postgresql"], + "fake-gcs-server": [], } # --------------------------------------------------------------------------- @@ -93,6 +94,7 @@ load('./tilt/localbatch.tiltfile', 'setup_localbatch') load('./tilt/ddb_local.tiltfile', 'setup_ddb_local') load('./tilt/sfn_local.tiltfile', 'setup_sfn_local') load('./tilt/airflow.tiltfile', 'setup_airflow') +load('./tilt/fake_gcs_server.tiltfile', 'setup_fake_gcs_server') _SETUP = { "minio": setup_minio, @@ -104,6 +106,7 @@ _SETUP = { "ddb-local": setup_ddb_local, "sfn-local": setup_sfn_local, "airflow": setup_airflow, + "fake-gcs-server": setup_fake_gcs_server, } # --------------------------------------------------------------------------- diff --git a/devtools/tilt/fake_gcs_server.tiltfile b/devtools/tilt/fake_gcs_server.tiltfile new file mode 100644 index 00000000000..5cd6ecb08ae --- /dev/null +++ b/devtools/tilt/fake_gcs_server.tiltfile @@ -0,0 +1,22 @@ +load('./_result.tiltfile', 'new_result') + +def setup_fake_gcs_server(ctx): + k8s_yaml(read_file('./tilt/k8s/fake-gcs-server.yaml')) + k8s_yaml(read_file('./tilt/k8s/gcs-bucket-init-job.yaml')) + k8s_yaml(read_file('./tilt/k8s/fake-gcs-secret.yaml')) + + k8s_resource( + 'fake-gcs-server', + port_forwards=['4443:4443'], + links=[link('http://localhost:4443/storage/v1/b', 'fake-gcs-server buckets')], + labels=['fake-gcs-server'], + ) + + k8s_resource('gcs-bucket-init', resource_deps=['fake-gcs-server'], labels=['fake-gcs-server']) + + return new_result( + config={"METAFLOW_DATASTORE_SYSROOT_GS": "gs://metaflow-test/metaflow"}, + shell_env={"STORAGE_EMULATOR_HOST": "http://localhost:4443"}, + config_resources=['gcs-bucket-init'], + k8s_secrets=['fake-gcs-secret'], + ) diff --git a/devtools/tilt/k8s/fake-gcs-secret.yaml b/devtools/tilt/k8s/fake-gcs-secret.yaml new file mode 100644 index 00000000000..d499f10e238 --- /dev/null +++ b/devtools/tilt/k8s/fake-gcs-secret.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: fake-gcs-secret +type: Opaque +stringData: + STORAGE_EMULATOR_HOST: http://fake-gcs-server:4443 diff --git a/devtools/tilt/k8s/fake-gcs-server.yaml b/devtools/tilt/k8s/fake-gcs-server.yaml new file mode 100644 index 00000000000..2e09916e6d5 --- /dev/null +++ b/devtools/tilt/k8s/fake-gcs-server.yaml @@ -0,0 +1,40 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fake-gcs-server +spec: + replicas: 1 + selector: + matchLabels: + app: fake-gcs-server + template: + metadata: + labels: + app: fake-gcs-server + spec: + containers: + - name: fake-gcs-server + image: fsouza/fake-gcs-server:latest + # Port 4443 is the fake-gcs-server default; we keep it for + # compatibility even though we use -scheme http (not HTTPS). + args: ["-scheme", "http", "-host", "0.0.0.0", "-port", "4443"] + ports: + - containerPort: 4443 + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi +--- +apiVersion: v1 +kind: Service +metadata: + name: fake-gcs-server +spec: + selector: + app: fake-gcs-server + ports: + - port: 4443 + targetPort: 4443 diff --git a/devtools/tilt/k8s/gcs-bucket-init-job.yaml b/devtools/tilt/k8s/gcs-bucket-init-job.yaml new file mode 100644 index 00000000000..421b0e56ee5 --- /dev/null +++ b/devtools/tilt/k8s/gcs-bucket-init-job.yaml @@ -0,0 +1,27 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: gcs-bucket-init +spec: + ttlSecondsAfterFinished: 120 + template: + spec: + restartPolicy: OnFailure + containers: + - name: init + image: curlimages/curl:latest + command: ["/bin/sh", "-ec"] + args: + - | + curl -sf -X POST \ + http://fake-gcs-server:4443/storage/v1/b \ + -H "Content-Type: application/json" \ + -d '{"name":"metaflow-test"}' + echo "Bucket 'metaflow-test' created successfully" + resources: + requests: + cpu: 25m + memory: 32Mi + limits: + cpu: 100m + memory: 64Mi diff --git a/metaflow/plugins/gcp/gs_storage_client_factory.py b/metaflow/plugins/gcp/gs_storage_client_factory.py index 1ec528a5a61..191a1a99d3f 100644 --- a/metaflow/plugins/gcp/gs_storage_client_factory.py +++ b/metaflow/plugins/gcp/gs_storage_client_factory.py @@ -12,12 +12,19 @@ def _get_gs_storage_client_default(): cache_key = _get_cache_key() if cache_key not in _client_cache: from google.cloud import storage - import google.auth - credentials, project_id = google.auth.default(scopes=storage.Client.SCOPE) - _client_cache[cache_key] = storage.Client( - credentials=credentials, project=project_id - ) + if os.environ.get("STORAGE_EMULATOR_HOST"): + # When a storage emulator is configured, create a plain Client() + # which auto-detects the emulator and uses anonymous credentials. + # Calling google.auth.default() would fail without real GCP creds. + _client_cache[cache_key] = storage.Client() + else: + import google.auth + + credentials, project_id = google.auth.default(scopes=storage.Client.SCOPE) + _client_cache[cache_key] = storage.Client( + credentials=credentials, project=project_id + ) return _client_cache[cache_key] diff --git a/test/ux/core/conftest.py b/test/ux/core/conftest.py index db1bc908567..1121cff985f 100644 --- a/test/ux/core/conftest.py +++ b/test/ux/core/conftest.py @@ -47,9 +47,37 @@ def _set_devstack_env(): os.environ.setdefault("AWS_ENDPOINT_URL_EVENTBRIDGE", "http://localhost:7777") +def _setup_gcs_emulator(): + """Configure the GCS client factory to use anonymous credentials. + + When STORAGE_EMULATOR_HOST is set, the google-cloud-storage Client + automatically uses anonymous credentials and routes requests to the + emulator -- but only when no explicit credentials are passed. + Metaflow's default GCP client provider calls google.auth.default() + first, which fails without real GCP credentials. We monkey-patch + the factory to return a plain Client() that auto-detects the emulator. + """ + if not os.environ.get("STORAGE_EMULATOR_HOST"): + return + + try: + from google.cloud import storage + from metaflow.plugins.gcp import gs_storage_client_factory as factory + + _emulator_client = storage.Client() + + def _get_emulator_client(): + return _emulator_client + + factory.get_gs_storage_client = _get_emulator_client + except ImportError: + pass + + def pytest_configure(config): """ Called early by pytest (before collection) so env vars are set before metaflow is imported at module level by the test files. """ _set_devstack_env() + _setup_gcs_emulator() diff --git a/test/ux/core/test_utils.py b/test/ux/core/test_utils.py index 031d25e90b0..db4e9153378 100644 --- a/test/ux/core/test_utils.py +++ b/test/ux/core/test_utils.py @@ -265,9 +265,15 @@ def verify_run_provenance(run: Run, decospecs: Any) -> None: start_task = run["start"].task ds_type = start_task.metadata_dict.get("ds-type") - # Only enforce the S3 check when the test environment uses a remote datastore. + # Only enforce the remote datastore check when the test environment uses one. # Local-only CI environments (METAFLOW_DEFAULT_DATASTORE=local) do not have MinIO. - if os.environ.get("METAFLOW_DEFAULT_DATASTORE", "") != "local": + default_ds = os.environ.get("METAFLOW_DEFAULT_DATASTORE", "") + if default_ds == "gs": + assert ds_type == "gs", ( + f"Expected datastore type 'gs' (GCS), got {ds_type!r}. " + f"Artifacts may be stored locally — check METAFLOW_HOME / METAFLOW_PROFILE." + ) + elif default_ds != "local": assert ds_type == "s3", ( f"Expected datastore type 's3' (MinIO), got {ds_type!r}. " f"Artifacts may be stored locally — check METAFLOW_HOME / METAFLOW_PROFILE." diff --git a/test/ux/ux_test_config.yaml b/test/ux/ux_test_config.yaml index 00b9e74cd1e..f46090d3948 100644 --- a/test/ux/ux_test_config.yaml +++ b/test/ux/ux_test_config.yaml @@ -22,6 +22,14 @@ backends: decospec: null enabled: true + # GCS local: runs flows locally but uses Google Cloud Storage (fake-gcs-server) + # instead of S3/MinIO for the datastore. Tests the GCS datastore backend. + - name: gcs-local + scheduler_type: null + cluster: null + decospec: null + enabled: true + # Argo Workflows + Kubernetes (devstack: minikube + argo-workflows) - name: argo-kubernetes scheduler_type: argo-workflows