Changes from all commits
84 commits
48748f1
Compile clay model encoder
srmsoumya Jul 15, 2024
97eb19a
Add benchmark & test files for the compiled clay encoder
srmsoumya Jul 18, 2024
eba1867
Revert changes to Encoder, don't change the API
srmsoumya Jul 24, 2024
73171dd
Add embedder to load clay encoder & save in onnx/ep format
srmsoumya Jul 24, 2024
1f2fcc9
Remove files from src, fix utils to run everything on same device
srmsoumya Jul 25, 2024
0a3ce9d
Bump torch==2.3.1 & torchvision==0.18.1, add onnx & onnxscript as dep…
srmsoumya Jul 25, 2024
37503fe
Release a few constraints on env
srmsoumya Jul 25, 2024
db8a3f2
Add notebook to show how to embed using compiled embedders
srmsoumya Jul 25, 2024
c0552bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 25, 2024
1e97506
Clear outputs from the notebook
srmsoumya Jul 25, 2024
0c37c58
Merge branch 'compile' of https://github.com/Clay-foundation/model in…
srmsoumya Jul 25, 2024
1803954
Add torchdata as a pip dependency
srmsoumya Jul 25, 2024
fdf4e80
Randomly pass time & latlon as zeros 20% of time
srmsoumya Aug 6, 2024
519a171
Add modis band info to metadata.yaml
srmsoumya Aug 6, 2024
a9efd2b
Add prefetch factor as an arg to DataModule
srmsoumya Aug 6, 2024
6434473
Change wavelength to millimeter scale for modis
srmsoumya Aug 6, 2024
7021ab4
change batch_first to True
srmsoumya Aug 6, 2024
5770cd6
Add transformer code from vit_pytorch as module
srmsoumya Aug 6, 2024
6bbc902
Add MRL
srmsoumya Aug 6, 2024
94f2b75
SAM as teacher, MRL, split code into modules
srmsoumya Aug 6, 2024
08df662
Remove outdated README, all in docs
srmsoumya Aug 6, 2024
c6df3e6
Fix trainer
srmsoumya Aug 7, 2024
c6151c6
Fix mean, std for s1
srmsoumya Aug 7, 2024
a8cee0f
update config for 1 node run
srmsoumya Aug 7, 2024
4dceba3
Modify Sentinel 1 from raw pixels to dB scale
srmsoumya Aug 8, 2024
fe01673
Cluster template for multi-node training
srmsoumya Aug 8, 2024
940fbab
Fix distributed DataLoader, add new env & slurm script
srmsoumya Aug 12, 2024
f12e221
Fix docs
srmsoumya Aug 12, 2024
ea4136b
Scale down reconstruction loss for MODIS, change alpha to 0.9 for rec/…
srmsoumya Aug 13, 2024
224c547
Use groups in wandb
srmsoumya Aug 21, 2024
79712a7
Add random dropping for channels
srmsoumya Aug 21, 2024
380e814
Add script to check sanity of npz files
srmsoumya Aug 21, 2024
4fe7138
Add script to split npz files of batch 128 to 32
srmsoumya Aug 21, 2024
bb6d13c
Adapt script to large model size
srmsoumya Sep 20, 2024
c638972
Modify classify, segment examples for clay v1.5
srmsoumya Nov 4, 2024
05c167e
Check non MRL loss for the model (#331)
srmsoumya Nov 4, 2024
38a3c27
Document MODIS data sampling
yellowcap Jul 28, 2024
4f464b9
intermediate commit
yellowcap Sep 26, 2024
8ecff64
intermediate
yellowcap Sep 30, 2024
e701052
Update to v1.5 and add logging
yellowcap Oct 1, 2024
d46522a
Fix for v1.5 module input
yellowcap Oct 1, 2024
c7283d7
Update adding sentinel-2
yellowcap Oct 14, 2024
c71bebf
Upgrade stacchip
yellowcap Oct 14, 2024
97404e8
Update adding sentinel-2
yellowcap Oct 14, 2024
308eb80
Update docker file
yellowcap Oct 14, 2024
c9af8e6
Update docker file
yellowcap Oct 14, 2024
8ce1515
Update docker file
yellowcap Oct 14, 2024
300d978
update dockerfile
yellowcap Oct 14, 2024
9b5b7a6
Fix import path
yellowcap Oct 15, 2024
d79f93e
Fix paths again
yellowcap Oct 15, 2024
4d8a1a5
Fix paths in docker file
yellowcap Oct 15, 2024
70030ba
Fix docker file
yellowcap Oct 15, 2024
7beb600
Fix docker file
yellowcap Oct 15, 2024
d442661
Use relative path
yellowcap Oct 15, 2024
dd0d0f9
Make checkpoint path relative
yellowcap Oct 15, 2024
40659da
Make path relative
yellowcap Oct 15, 2024
2b1a567
Make metadata dynamic
yellowcap Oct 15, 2024
3ad580c
Make metadata dynamic fix
yellowcap Oct 15, 2024
c08926e
Log device early
yellowcap Oct 15, 2024
c14f8a1
Use pip instead of conda to avoid re-install of torch
yellowcap Oct 16, 2024
9952c4e
Use pip instead of conda to avoid re-install of torch
yellowcap Oct 16, 2024
f2773ee
Use pip instead of conda to avoid re-install of torch
yellowcap Oct 16, 2024
cdf6e41
Use pip instead of conda to avoid re-install of torch
yellowcap Oct 16, 2024
bdb2ca3
Use pip instead of conda to avoid re-install of torch
yellowcap Oct 16, 2024
9146bdd
Remove patch level embeddings storing
yellowcap Oct 16, 2024
b093ec5
Flatten path to mirror naip-analytic bucket
yellowcap Oct 16, 2024
ab703ce
Add option to limit to state
yellowcap Oct 17, 2024
e902414
Remove stale files
yellowcap Nov 5, 2024
2f33b44
Update model checkpoint name
yellowcap Nov 5, 2024
d85a664
Update model checkpoint and s3 sign strategy
yellowcap Nov 5, 2024
85f0544
Adapt to rio-stac 0.10.0 and pin requirement
yellowcap Nov 5, 2024
89f1be1
Fix datetime bug for files that have date stamps in them
yellowcap Nov 5, 2024
eefae20
Add check for files that are already processed
yellowcap Nov 6, 2024
2496953
Move embeddings to CPU early
yellowcap Nov 6, 2024
097e291
Remove patch embedding extraction
yellowcap Nov 6, 2024
0a6f476
Sentinel-2 2024 run preparation
yellowcap Nov 12, 2024
6c867ee
Fix sentinel paths for output
yellowcap Nov 12, 2024
ac1434b
Pre-download S2 scene, batch pixel load and embedding generation
yellowcap Nov 22, 2024
c270a4c
Use torch tensor for normalization
yellowcap Nov 22, 2024
88ed3c0
Use custom endpoint on demand
yellowcap Nov 22, 2024
ceecb61
Check exists for Sentinel-2 process
yellowcap Nov 24, 2024
55a82a8
Make Dockerfile cacheable
yellowcap Nov 26, 2024
085f371
Update all-sentinel.py
MaceGrim Dec 21, 2024
2a48007
Update utils.py
MaceGrim Dec 21, 2024
61 changes: 61 additions & 0 deletions cluster/ml-cluster.yaml.template
@@ -0,0 +1,61 @@
Region: us-east-2

# DL AMI
Image:
  Os: ubuntu2004
  CustomAmi: <ami-id>

# FSx LUSTRE SHARED STORAGE
SharedStorage:
  - MountDir: /fsx
    Name: fsx
    StorageType: FsxLustre
    FsxLustreSettings:
      FileSystemId: <fsx-id>

# HEAD NODE
HeadNode:
  InstanceType: c5.12xlarge
  Networking:
    SubnetId: <subnet-public-id>
    SecurityGroups:
      - <sg-id> # EFA enabled SG
  Ssh:
    KeyName: <ssh-key>
  LocalStorage:
    RootVolume:
      Size: 200
  Iam:
    S3Access:
      - BucketName: <read-data-mount>
        EnableWriteAccess: false
      - BucketName: <write-data-mount>
        EnableWriteAccess: true


# SCHEDULER
Scheduling:
  Scheduler: slurm
  SlurmQueues:
    - Name: gpu-queue
      ComputeResources:
        - Name: <g-series or p-series>
          Instances:
            - InstanceType: <type>
          MinCount: 0
          MaxCount: 8
          Efa:
            Enabled: true
      Networking:
        SubnetIds:
          - <subnet-private-id>
        SecurityGroups:
          - <sg-id> # EFA enabled SG
        PlacementGroup:
          Enabled: true
      Iam:
        S3Access:
          - BucketName: <read-data-mount>
            EnableWriteAccess: false
          - BucketName: <write-data-mount>
            EnableWriteAccess: true
7 changes: 4 additions & 3 deletions configs/classify_eurosat.yaml
@@ -2,12 +2,12 @@
seed_everything: 42
data:
  metadata_path: configs/metadata.yaml
  batch_size: 256
  batch_size: 128
  num_workers: 8
model:
  num_classes: 10
  ckpt_path: checkpoints/clay-v1-base.ckpt
  lr: 1e-4
  ckpt_path: checkpoints/v1.5.0-no-mrl-dinov2/mae_v1.5.0_epoch-07_val-loss-0.1718.ckpt
  lr: 5e-5
  wd: 0.05
  b1: 0.9
  b2: 0.95
@@ -28,6 +28,7 @@ trainer:
      init_args:
        entity: developmentseed
        project: clay-classify
        group: v1.5-test
        log_model: false
  callbacks:
    - class_path: lightning.pytorch.callbacks.ModelCheckpoint
45 changes: 26 additions & 19 deletions configs/config.yaml
@@ -1,56 +1,63 @@
# lightning.pytorch==2.1.2
seed_everything: 42
seed_everything: 108
data:
  data_dir: data
  size: 224
  data_dir: /fsx
  size: 256
  metadata_path: configs/metadata.yaml
  platforms:
    - landsat-c2l1
    - landsat-c2l2-sr
    - linz
    - modis
    - naip
    - sentinel-1-rtc
    - sentinel-2-l2a
  batch_size: 8
  num_workers: 8
  batch_size: 1
  num_workers: 12
model:
  model_size: base
  model_size: large
  mask_ratio: 0.75
  norm_pix_loss: True
  norm_pix_loss: False
  patch_size: 8
  shuffle: True
  metadata_path: configs/metadata.yaml
  teacher: vit_base_patch16_224.dino
  lr: 1e-5
  teacher: vit_large_patch14_reg4_dinov2.lvd142m
  dolls: [16, 32, 64, 128, 256, 768, 1024]
  doll_weights: [1, 1, 1, 1, 1, 1, 1]
  lr: 5e-6
  wd: 0.05
  b1: 0.9
  b2: 0.95
  embeddings_level: mean
trainer:
  accelerator: auto
  accelerator: gpu
  strategy: ddp
  devices: auto
  num_nodes: 1
  devices: 8
  num_nodes: 48
  precision: bf16-mixed
  log_every_n_steps: 10
  max_epochs: 200
  log_every_n_steps: 1
  max_epochs: 1000
  accumulate_grad_batches: 1
  default_root_dir: s3://clay-model-ckpt/v1.0.0/
  default_root_dir: checkpoints/v1.5.0/
  fast_dev_run: False
  num_sanity_val_steps: 0
  use_distributed_sampler: False
  limit_train_batches: 0.99
  limit_val_batches: 0.99
  logger:
    - class_path: lightning.pytorch.loggers.WandbLogger
      init_args:
        entity: developmentseed
        project: clay
        group: v1.5-nomrl-dinov2
        id: 0uy3in7l
        resume: must
        log_model: false
  callbacks:
    - class_path: lightning.pytorch.callbacks.ModelCheckpoint
      init_args:
        dirpath: s3://clay-model-ckpt/v1.0.0/
        dirpath: checkpoints/v1.5.0/
        auto_insert_metric_name: False
        filename: mae_v1.0.0_epoch-{epoch:02d}_val-loss-{val/loss:.4f}
        filename: mae_v1.5.0_epoch-{epoch:02d}_val-loss-{val/loss:.4f}
        monitor: val/loss
        mode: min
        save_last: True
@@ -63,4 +70,4 @@ trainer:
    - class_path: src.callbacks_wandb.LogIntermediatePredictions
  plugins:
    - class_path: lightning.pytorch.plugins.io.AsyncCheckpointIO
ckpt_path: null
ckpt_path: checkpoints/v1.5.0/last.ckpt
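The new `dolls` and `doll_weights` keys configure Matryoshka-style (MRL) losses over nested prefixes of the embedding. The training code itself is not part of this diff; the sketch below only illustrates how such a weighted nested-embedding loss is commonly computed, and every name in it (`mrl_loss`, `embedding`, `target`) is hypothetical.

```python
import torch
import torch.nn.functional as F

# Values copied from the config above.
dolls = [16, 32, 64, 128, 256, 768, 1024]
doll_weights = [1, 1, 1, 1, 1, 1, 1]


def mrl_loss(embedding: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Weighted sum of losses over nested embedding prefixes (illustrative only)."""
    total = embedding.new_zeros(())
    for dim, weight in zip(dolls, doll_weights):
        # Each "doll" is scored on only the first `dim` embedding dimensions.
        total = total + weight * F.mse_loss(embedding[:, :dim], target[:, :dim])
    return total / sum(doll_weights)


# Toy example: a batch of 4 embeddings of width 1024.
emb = torch.randn(4, 1024)
tgt = torch.randn(4, 1024)
print(mrl_loss(emb, tgt))
```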
47 changes: 43 additions & 4 deletions configs/metadata.yaml
@@ -176,11 +176,50 @@ sentinel-1-rtc:
  gsd: 10
  bands:
    mean:
      vv: 0.123273
      vh: 0.027337
      vv: -12.113
      vh: -18.673
    std:
      vv: 1.492154
      vh: 0.122182
      vv: 8.314
      vh: 8.017
    wavelength:
      vv: 3.5
      vh: 4.0
modis:
  band_order:
    - sur_refl_b01
    - sur_refl_b02
    - sur_refl_b03
    - sur_refl_b04
    - sur_refl_b05
    - sur_refl_b06
    - sur_refl_b07
  rgb_indices:
    - 0
    - 3
    - 2
  gsd: 500
  bands:
    mean:
      sur_refl_b01: 1072.
      sur_refl_b02: 1624.
      sur_refl_b03: 931.
      sur_refl_b04: 1023.
      sur_refl_b05: 1599.
      sur_refl_b06: 1404.
      sur_refl_b07: 1051.
    std:
      sur_refl_b01: 1643.
      sur_refl_b02: 1878.
      sur_refl_b03: 1449.
      sur_refl_b04: 1538.
      sur_refl_b05: 1763.
      sur_refl_b06: 1618.
      sur_refl_b07: 1396.
    wavelength:
      sur_refl_b01: .645
      sur_refl_b02: .858
      sur_refl_b03: .469
      sur_refl_b04: .555
      sur_refl_b05: 1.240
      sur_refl_b06: 1.640
      sur_refl_b07: 2.130
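These per-band `mean`/`std` entries are the normalization statistics applied before patches reach the encoder (for Sentinel-1 they are now expressed in dB, i.e. `10 * log10` of the linear backscatter, rather than raw pixel values). Below is a minimal sketch of normalizing a MODIS chip from this metadata; the chip and variable names are placeholders, not the repository's actual loading code.

```python
import numpy as np
import yaml

with open("configs/metadata.yaml") as f:
    metadata = yaml.safe_load(f)

bands = metadata["modis"]["band_order"]
mean = np.array([metadata["modis"]["bands"]["mean"][b] for b in bands])
std = np.array([metadata["modis"]["bands"]["std"][b] for b in bands])

# `chip` stands in for a (bands, height, width) MODIS surface reflectance array.
chip = np.random.randint(0, 5000, size=(len(bands), 256, 256)).astype("float32")

# Standardize each band with the statistics from metadata.yaml.
normalized = (chip - mean[:, None, None]) / std[:, None, None]
print(normalized.mean(axis=(1, 2)))
```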
9 changes: 5 additions & 4 deletions configs/segment_chesapeake.yaml
@@ -6,17 +6,17 @@ data:
  val_chip_dir: data/cvpr/ny/val/chips/
  val_label_dir: data/cvpr/ny/val/labels/
  metadata_path: configs/metadata.yaml
  batch_size: 40
  batch_size: 16
  num_workers: 8
  platform: naip
model:
  num_classes: 7
  feature_maps:
    - 3
    - 5
    - 7
    - 11
  ckpt_path: checkpoints/clay-v1-base.ckpt
    - 15
    - 23
  ckpt_path: checkpoints/v1.5.0-no-mrl-dinov2/mae_v1.5.0_epoch-05_val-loss-0.1734.ckpt
  lr: 1e-5
  wd: 0.05
  b1: 0.9
@@ -38,6 +38,7 @@ trainer:
      init_args:
        entity: developmentseed
        project: clay-segment
        group: v1.5-test
        log_model: false
  callbacks:
    - class_path: lightning.pytorch.callbacks.ModelCheckpoint
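The `feature_maps` indices pick which encoder transformer blocks feed the segmentation decoder; with the larger v1.5 encoder the later indices move deeper into the (deeper) block stack. The sketch below only illustrates collecting intermediate block outputs at those indices; the class and the toy encoder are hypothetical, not the repository's segmentation model.

```python
import torch
import torch.nn as nn

feature_maps = [3, 5, 15, 23]  # block indices taken from the config above


class IntermediateCollector(nn.Module):
    """Run a stack of blocks and keep the outputs of selected ones (illustrative)."""

    def __init__(self, blocks: nn.ModuleList, indices: list[int]):
        super().__init__()
        self.blocks = blocks
        self.indices = set(indices)

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        collected = []
        for i, block in enumerate(self.blocks):
            x = block(x)
            if i in self.indices:
                collected.append(x)
        return collected


# Toy stand-in for a 24-block encoder operating on patch tokens.
blocks = nn.ModuleList([nn.Linear(64, 64) for _ in range(24)])
collector = IntermediateCollector(blocks, feature_maps)
features = collector(torch.randn(2, 16, 64))
print([f.shape for f in features])  # four feature maps for the decoder
```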
18 changes: 18 additions & 0 deletions copy_data.sh
@@ -0,0 +1,18 @@
#!/bin/bash

# Define source and destination directories
src="/fsx"
dest="data/pretrain"

# Create the destination directory if it doesn't exist
mkdir -p "$dest"

# Find all directories in the source directory
find "$src" -type d -print0 | while IFS= read -r -d '' dir; do
    # Create corresponding directory in the destination
    newdir="$dest${dir#$src}"
    mkdir -p "$newdir"

    # Copy the first 100 files from the source directory to the new directory
    find "$dir" -maxdepth 1 -type f -print0 | head -z -n 100 | xargs -0 -I{} cp {} "$newdir"
done
62 changes: 62 additions & 0 deletions docs/release-notes/data_sampling.md
@@ -112,6 +112,67 @@
We selected the latest imagery for each of the available regions
of New Zealand. The list of catalogs is in the linz processor file.

### MODIS sampling strategy

For MODIS we used the [Surface Reflectance 8-Day (500m)](https://planetarycomputer.microsoft.com/dataset/modis-09A1-061)
product. The data is distributed in SIN grid tiles, and we included all SIN
grid tiles that contain no nodata pixels. The selected SIN grid tiles are then
reprojected to EPSG:3857. This introduces some variation in the nominal
resolution, even though the original resolution in the SIN projection is
500 meters. For input to the model, we treated 500 m as a fixed resolution
for all tiles.
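As a minimal illustration of that reprojection step (not the pipeline's actual code, and with a placeholder asset URL), a SIN-grid asset can be warped to EPSG:3857 on the fly with rasterio:

```python
import rasterio
from rasterio.enums import Resampling
from rasterio.vrt import WarpedVRT

# Placeholder href for a single MODIS surface reflectance asset.
href = "https://example.com/modis/sur_refl_b01.tif"

with rasterio.open(href) as src:
    # Warp the SIN-grid tile to Web Mercator on the fly; the nominal pixel
    # size varies slightly after reprojection, but 500 m is assumed as the
    # fixed input resolution for the model.
    with WarpedVRT(src, crs="EPSG:3857", resampling=Resampling.nearest) as vrt:
        data = vrt.read(1)
        print(vrt.transform, data.shape)
```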

The algorithm used to determine which tiles contain no nodata is shown in the
code block below. It selected 233 SIN grid tiles. For each of these we sampled
the first STAC search result for every month of every year from 2018 through
2023, resulting in 72 (`6 years * 12 months`) separate scenes for each of the
233 SIN grid tiles.

Script for selecting the SIN grid tiles included in the sampling (the STAC API
endpoint, collection id, and SIN grid ranges are assumed values):

```python
from multiprocessing import Pool

import numpy as np
import planetary_computer as pc
import pystac_client
import rasterio

# Assumed values: the Planetary Computer STAC API, the MODIS collection linked
# above, and the full MODIS SIN grid (v: 0-17, h: 0-35).
STAC_API = "https://planetarycomputer.microsoft.com/api/stac/v1"
COLLECTION = "modis-09A1-061"
SIN_VERTICAL_RANGE = range(18)
SIN_HORIZONTAL_RANGE = range(36)

SIN_GRID_TILES = []
for i in SIN_VERTICAL_RANGE:
    for j in SIN_HORIZONTAL_RANGE:
        SIN_GRID_TILES.append((i, j))


def evaluate_nodata(i, j):
    # Fetch a single item for the SIN grid tile (v=i, h=j).
    catalog = pystac_client.Client.open(STAC_API, modifier=pc.sign_inplace)
    items = catalog.search(
        collections=[COLLECTION],
        query={
            "modis:vertical-tile": {
                "eq": i,
            },
            "modis:horizontal-tile": {
                "eq": j,
            },
        },
        max_items=1,
    )
    item = list(items.item_collection())[0]

    with rasterio.open(item.assets["sur_refl_b01"].href) as src:
        data = src.read()

    # Count nodata pixels (-28672 is the nodata value of the product).
    nodata = np.sum(data == -28672)

    if nodata == 0:
        print(i, j)
        return i, j


if __name__ == '__main__':
    with Pool(16) as p:
        indexes = p.starmap(evaluate_nodata, SIN_GRID_TILES)
        print("done")
        print(indexes)
```
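The per-month scene selection described above is not included in the script; the following is a minimal sketch of how it could look, reusing the same STAC API and collection and querying a one-month datetime window per tile (names and structure are illustrative):

```python
from datetime import datetime

import planetary_computer as pc
import pystac_client

STAC_API = "https://planetarycomputer.microsoft.com/api/stac/v1"
COLLECTION = "modis-09A1-061"


def first_scene_per_month(i, j):
    """Yield the first STAC item for tile (v=i, h=j) for every month 2018-2023."""
    catalog = pystac_client.Client.open(STAC_API, modifier=pc.sign_inplace)
    for year in range(2018, 2024):
        for month in range(1, 13):
            # One-month window; December rolls over into January of next year.
            start = datetime(year, month, 1)
            end = datetime(year + (month == 12), month % 12 + 1, 1)
            search = catalog.search(
                collections=[COLLECTION],
                datetime=f"{start:%Y-%m-%d}/{end:%Y-%m-%d}",
                query={
                    "modis:vertical-tile": {"eq": i},
                    "modis:horizontal-tile": {"eq": j},
                },
                max_items=1,
            )
            items = list(search.item_collection())
            if items:
                yield items[0]
```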

## Data preparation

@@ -136,6 +197,7 @@ Using stacchip, we created a dataset with a size of 33.8 TB of imagery, with abo
| Landsat-c2l1 | 5827333 |
| Landsat-c2l2-sr | 5790651 |
| Sentinel-1-rtc | 16133394 |
| MODIS | 1350864 |

# Older versions

45 changes: 45 additions & 0 deletions embeddings/Dockerfile
@@ -0,0 +1,45 @@
FROM 763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-inference:2.3.0-gpu-py311-cu121-ubuntu20.04-ec2

WORKDIR /model

RUN git clone -b all-of-naip https://github.com/Clay-foundation/model.git .

RUN aws s3 cp --no-sign-request s3://clay-model-ckpt/v1.5.0-no-mrl-dinov2/mae_v1.5.0_epoch-07_val-loss-0.1718.ckpt data/mae_v1.5.0_epoch-07_val-loss-0.1718.ckpt
RUN aws s3 cp --no-sign-request s3://clay-mgrs-samples/naip-manifest.txt.zip data/naip-manifest.txt.zip
RUN aws s3 cp --no-sign-request s3://clay-mgrs-samples/element84-tiles-2023.gz data/element84-tiles-2023.gz

RUN pip install \
    einops~=0.7.0 \
    fiona~=1.9.5 \
    geopandas~=0.14.1 \
    jsonargparse~=4.27.0 \
    lightning~=2.1.0 \
    matplotlib~=3.9.0 \
    planetary-computer~=1.0.0 \
    python-box~=7.1.0 \
    pyarrow~=15.0.2 \
    rasterio~=1.3.10 \
    s3fs~=2024.6.0 \
    boto3~=1.34.122 \
    botocore~=1.34.122 \
    scikit-image~=0.22.0 \
    scikit-learn~=1.4.0 \
    stackstac~=0.5.0 \
    timm~=0.9.16 \
    transformers~=4.35.2 \
    typeshed-client~=2.4.0 \
    vit-pytorch~=1.6.4 \
    zarr~=2.16.1 \
    geoarrow-pyarrow==0.1.2 \
    torchdata==0.7.1 \
    stacchip==0.1.35 \
    wandb==0.17.5 \
    rio_stac~=0.10.0

RUN git pull && git checkout ceecb6138705cb28a5f4d3f61f22b19a2f625edb

# Move file to home directory so that relative imports work
RUN cp embeddings/all-naip.py .
RUN cp embeddings/all-sentinel.py .

ENTRYPOINT ["python"]