Skip to content

Commit e2ac458

Browse files
committed
Deployed bfe0b7c with MkDocs version: 1.6.1
1 parent 6bc4c34 commit e2ac458

File tree

17 files changed

+1556
-933
lines changed

17 files changed

+1556
-933
lines changed

404.html

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,11 +67,7 @@
6767
</li>
6868

6969
<li>
70-
<a href="/user/illumination-correction/" class="dropdown-item">Illumination Correction</a>
71-
</li>
72-
73-
<li>
74-
<a href="/user/segmentation-check/" class="dropdown-item">Segmentation Check</a>
70+
<a href="/user/example-pipeline-cli/" class="dropdown-item">Example CLI Pipeline</a>
7571
</li>
7672
</ul>
7773
</li>

assets/create_starrynight_example.sh

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,123 +4,137 @@ PROJECT=XXXXXX
44
BATCH=XXXXXX
55

66
export S3_PATH="s3://${BUCKET}/projects/${PROJECT}/${BATCH}"
7+
export INPUT_DIR='./scratch/starrynight_example_input'
8+
export OUTPUT_BASELINE_DIR='./scratch/starrynight_example_output_baseline'
79

810
# Inputs
911

1012
## SBS images
1113

12-
parallel mkdir -p scratch/starrynight_example/Source1/Batch1/images/Plate1/20X_c{1}_SBS-{1}/ ::: 1 2 3 4 5 6 7 8 9 10
14+
parallel mkdir -p ${INPUT_DIR}/Source1/Batch1/images/Plate1/20X_c{1}_SBS-{1}/ ::: 1 2 3
1315

1416
parallel --match '.*' --match '(.*) (.*) (.*)' \
1517
aws s3 cp "${S3_PATH}/images/Plate1/20X_c{1}_SBS-{1}/Well{2.1}_Point{2.1}_{2.2}_ChannelC,A,T,G,DAPI_Seq{2.3}.ome.tiff" \
16-
"scratch/starrynight_example/Source1/Batch1/images/Plate1/20X_c{1}_SBS-{1}/" ::: \
18+
"${INPUT_DIR}/Source1/Batch1/images/Plate1/20X_c{1}_SBS-{1}/" ::: \
1719
1 2 3 ::: \
1820
"A1 0000 0000" "A1 0001 0001" "A2 0000 1025" "A2 0001 1026" "B1 0000 3075" "B1 0001 3076"
1921

2022
## Cell Painting images
2123

22-
mkdir -p scratch/starrynight_example/Source1/Batch1/images/20X_CP_Plate1_20240319_122800_179
24+
mkdir -p ${INPUT_DIR}/Source1/Batch1/images/20X_CP_Plate1_20240319_122800_179
2325

2426
parallel --match '(.*) (.*) (.*)' \
2527
aws s3 cp "${S3_PATH}/images/Plate1/20X_CP_Plate1_20240319_122800_179/Well{1.1}_Point{1.1}_{1.2}_ChannelPhalloAF750,ZO1-AF488,DAPI_Seq{1.3}.ome.tiff" \
26-
"scratch/starrynight_example/Source1/Batch1/images/Plate1/20X_CP_Plate1_20240319_122800_179/" ::: \
28+
"${INPUT_DIR}/Source1/Batch1/images/Plate1/20X_CP_Plate1_20240319_122800_179/" ::: \
2729
"A1 0000 0000" "A1 0001 0001" "A2 0000 1025" "A2 0001 1026" "B1 0000 3075" "B1 0001 3076"
2830

2931
# Outputs
3032

3133
## Illumination correction images
3234

33-
mkdir -p scratch/starrynight_example/Source1/Batch1/illum/Plate1
35+
mkdir -p ${OUTPUT_BASELINE_DIR}/Source1/Batch1/illum/Plate1
3436
parallel \
35-
aws s3 cp "${S3_PATH}/illum/Plate1/Plate1_Cycle{1}_Illum{2}.npy" "scratch/starrynight_example/Source1/Batch1/illum/Plate1/" ::: \
37+
aws s3 cp "${S3_PATH}/illum/Plate1/Plate1_Cycle{1}_Illum{2}.npy" "${OUTPUT_BASELINE_DIR}/Source1/Batch1/illum/Plate1/" ::: \
3638
1 2 3 ::: \
3739
DNA A T G C
3840

41+
parallel \
42+
aws s3 cp "${S3_PATH}/illum/Plate1/Plate1_Illum{1}.npy" "${OUTPUT_BASELINE_DIR}/Source1/Batch1/illum/Plate1/" ::: \
43+
DNA Phalloidin ZO1
3944

4045
## Cell Painting images: Illumination corrected
4146

4247
parallel \
4348
aws s3 cp "${S3_PATH}/images_corrected/painting/Plate1-Well{1}/Plate_Plate1_Well_Well{1}_Site_{2}_Corr{3}.tiff" \
44-
"scratch/starrynight_example/Source1/Batch1/images_corrected/painting/Plate1-Well{1}/" ::: \
49+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_corrected/painting/Plate1-Well{1}/" ::: \
4550
A1 A2 B1 ::: 0 1 ::: DNA Phalloidin ZO1
4651

47-
4852
parallel \
4953
aws s3 cp "${S3_PATH}/images_corrected/painting/Plate1-Well{1}/PaintingIllumApplication_{2}.csv" \
50-
"scratch/starrynight_example/Source1/Batch1/images_corrected/painting/Plate1-Well{1}/" ::: \
54+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_corrected/painting/Plate1-Well{1}/" ::: \
5155
A1 A2 B1 ::: Cells ConfluentRegions Experiment Image Nuclei
5256

5357
# SBS images: Illumination aligned
5458

5559
parallel \
5660
aws s3 cp "${S3_PATH}/images_aligned/barcoding/Plate1-Well{1}-{2}/Plate_Plate1_Well_{1}_Site_{2}_Cycle0{3}_{4}.tiff" \
57-
"scratch/starrynight_example/Source1/Batch1/images_aligned/barcoding/Plate1-Well{1}-{2}/" ::: \
61+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_aligned/barcoding/Plate1-Well{1}-{2}/" ::: \
5862
A1 A2 B1 ::: 0 1 ::: 1 2 3 ::: A T G C DAPI
5963

6064

6165
parallel \
6266
aws s3 cp "${S3_PATH}/images_aligned/barcoding/Plate1-Well{1}-{2}/BarcodingApplication_{3}.csv" \
63-
"scratch/starrynight_example/Source1/Batch1/images_aligned/barcoding/Plate1-Well{1}-{2}/" ::: \
67+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_aligned/barcoding/Plate1-Well{1}-{2}/" ::: \
6468
A1 A2 B1 ::: 0 1 ::: Experiment Image
6569

6670
## SBS images: Illumination corrected
6771

6872
parallel \
6973
aws s3 cp "${S3_PATH}/images_corrected/barcoding/Plate1-Well{1}-{2}/Plate_Plate1_Well_{1}_Site_{2}_Cycle0{3}_{4}.tiff" \
70-
"scratch/starrynight_example/Source1/Batch1/images_corrected/barcoding/Plate1-Well{1}-{2}/" ::: \
74+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_corrected/barcoding/Plate1-Well{1}-{2}/" ::: \
7175
A1 A2 B1 ::: 0 1 ::: 1 2 3 ::: A T G C
7276

7377
parallel \
7478
aws s3 cp "${S3_PATH}/images_corrected/barcoding/Plate1-Well{1}-{2}/Plate_Plate1_Well_{1}_Site_{2}_Cycle0{3}_{4}.tiff" \
75-
"scratch/starrynight_example/Source1/Batch1/images_corrected/barcoding/Plate1-Well{1}-{2}/" ::: \
79+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_corrected/barcoding/Plate1-Well{1}-{2}/" ::: \
7680
A1 A2 B1 ::: 0 1 ::: 1 ::: DAPI
77-
# DAPI is only present in the first cycle
81+
# DAPI is present only in the first cycle
7882

7983

8084
parallel \
8185
aws s3 cp "${S3_PATH}/images_corrected/barcoding/Plate1-Well{1}-{2}/BarcodePreprocessing_{3}.csv" \
82-
"scratch/starrynight_example/Source1/Batch1/images_corrected/barcoding/Plate1-Well{1}-{2}/" ::: \
86+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_corrected/barcoding/Plate1-Well{1}-{2}/" ::: \
8387
A1 A2 B1 ::: 0 1 ::: BarcodeFoci PreFoci Experiment Image Nuclei
8488

8589
## Segmentation images
8690

8791
parallel \
8892
aws s3 cp "${S3_PATH}/images_segmentation/Plate1/Plate_Plate1_Well_Well{1}_Site_{2}_Corr{3}_SegmentCheck.png" \
89-
"scratch/starrynight_example/Source1/Batch1/images_segmentation/Plate1/" ::: \
93+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_segmentation/Plate1/" ::: \
9094
A1 A2 B1 ::: 0 ::: DNA
91-
# Notice the odd naming of Well
9295

9396
parallel \
9497
aws s3 cp "${S3_PATH}/images_segmentation/Plate1/Plate_Plate1_Well_Well{1}_Site_{2}_Corr{3}_SegmentCheck.png" \
95-
"scratch/starrynight_example/Source1/Batch1/images_segmentation/Plate1/" ::: \
98+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_segmentation/Plate1/" ::: \
9699
A1 A2 B1 ::: 0 ::: DNA
97100

98101
parallel \
99102
aws s3 cp "${S3_PATH}/images_segmentation/Plate1/SegmentationCheck_{1}.csv" \
100-
"scratch/starrynight_example/Source1/Batch1/images_segmentation/Plate1/" ::: \
103+
"${OUTPUT_BASELINE_DIR}/Source1/Batch1/images_segmentation/Plate1/" ::: \
101104
Experiment Image Nuclei Cells PreCells ConfluentRegions
102105

103-
104106
## Load Data CSVs
105107

106108
export S3_PATH_WORKSPACE="s3://${BUCKET}/projects/${PROJECT}/workspace"
107109

108110
aws s3 sync \
109111
"${S3_PATH_WORKSPACE}/load_data_csv/${BATCH}/Plate1/" \
110-
"scratch/starrynight_example/Source1/workspace_example/load_data_csv/Batch1/Plate1/"
111-
112+
"${OUTPUT_BASELINE_DIR}/Source1/workspace_example/load_data_csv/Batch1/Plate1/"
112113

113114
## Analysis CSVs
114115

115116
parallel \
116117
aws s3 sync \
117118
"${S3_PATH_WORKSPACE}/analysis/${BATCH}/Plate1-Well{1}-{2}/" \
118-
"scratch/starrynight_example/Source1/workspace_example/analysis/${BATCH}/Plate1-Well{1}-{2}/" \
119+
"${OUTPUT_BASELINE_DIR}/Source1/workspace_example/analysis/${BATCH}/Plate1-Well{1}-{2}/" \
119120
--exclude \""*.csv\"" ::: \
120121
A1 A2 B1 ::: 0 1
121122

122123

123124
parallel \
124-
aws s3 sync "${S3_PATH_WORKSPACE}/analysisfix/${BATCH}/Plate1-Well{1}-{2}/" \
125-
"scratch/starrynight_example/Source1/workspace_example/analysis/${BATCH}/Plate1-Well{1}-{2}/" ::: \
125+
aws s3 sync \
126+
"${S3_PATH_WORKSPACE}/analysisfix/${BATCH}/Plate1-Well{1}-{2}/" \
127+
"${OUTPUT_BASELINE_DIR}/Source1/workspace_example/analysis/${BATCH}/Plate1-Well{1}-{2}/" ::: \
126128
A1 A2 B1 ::: 0 1
129+
# Note that the analysis files are synced from two different locations: analysis and analysisfix.
130+
# analysisfix was a rerun of analysis
131+
132+
# Compress files to reduce disk usage after downloading
133+
echo "Compressing files to reduce disk usage..."
134+
135+
## Compress all TIFF files
136+
find ${OUTPUT_BASELINE_DIR} -type f -name "*.tiff" | parallel 'magick {} -compress jpeg -quality 80 {}'
137+
find ${INPUT_DIR} -type f -name "*.tiff" | parallel 'magick {} -compress jpeg -quality 80 {}'
138+
139+
# Compress CSV files
140+
find ${OUTPUT_BASELINE_DIR} -type f -name "*.csv" | parallel 'gzip -9 {}'

assets/parse_yaml.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import yaml
2+
import os
3+
import csv
4+
5+
6+
def get_file_size(path):
    """Return the size of *path* in bytes, or ``None`` if unavailable.

    Parameters
    ----------
    path : str or os.PathLike
        File whose size should be reported.

    Returns
    -------
    int or None
        Size in bytes, or ``None`` when the file is missing or unreadable.
    """
    # EAFP: call getsize() directly instead of exists()+getsize().
    # The two-call form stats the file twice and is racy -- the file
    # could disappear between the existence check and the size read.
    try:
        return os.path.getsize(path)
    except FileNotFoundError:
        # Missing file is the expected "no size" case; stay silent,
        # matching the original exists()-check behavior.
        return None
    except OSError as e:
        # Anything else (permissions, broken symlink, ...) is reported
        # but never propagated -- callers treat None as "size unknown".
        print(f"Error getting file size for {path}: {e}")
        return None
13+
14+
15+
def read_csv_headers(file_path, max_headers=20):
    """Return the header row of a CSV file, truncated to *max_headers* items.

    If the file has more than ``max_headers`` columns, the first
    ``max_headers - 1`` names are returned and the final entry is a
    summary string such as ``"6 more columns"``.

    Parameters
    ----------
    file_path : str or os.PathLike
        CSV file to read.
    max_headers : int
        Maximum length of the returned list.

    Returns
    -------
    list[str]
        Header names (possibly truncated), or ``[]`` on any read error.
    """
    # Keep the try body minimal: only the I/O that can actually fail.
    # Narrow exceptions -- a broad `except Exception` would also hide
    # programming errors (e.g. a bad argument type).
    try:
        with open(file_path, "r", newline="") as csvfile:
            headers = next(csv.reader(csvfile), [])
    except (OSError, csv.Error, UnicodeDecodeError) as e:
        # Best-effort: a missing or corrupt file yields an empty header
        # list so the caller can keep processing remaining entries.
        print(f"Error reading CSV headers from {file_path}: {e}")
        return []

    if len(headers) <= max_headers:
        return headers
    # Keep max_headers - 1 real names; the last slot summarizes the rest.
    remaining = len(headers) - (max_headers - 1)
    return headers[: max_headers - 1] + [f"{remaining} more columns"]
32+
33+
34+
def _describe_file(folder_path, file_name, is_csv):
    """Build a {path, size[, headers]} record for one file."""
    full_path = os.path.join(folder_path, file_name)
    info = {"path": full_path, "size": get_file_size(full_path)}
    if is_csv:
        # Only CSVs carry a "headers" key, to keep the output compact.
        info["headers"] = read_csv_headers(full_path)
    return info


def _process_folder(base_path, folder_item):
    """Resolve one folder entry into full paths with sizes/headers."""
    folder_path = os.path.join(base_path, folder_item["folder"])
    processed = {"folder": folder_item["folder"], "files": []}
    for file_item in folder_item["files"]:
        if isinstance(file_item, dict):
            # Files grouped by type, e.g. {"csv": [...], "tiff": [...]}.
            processed["files"].append(
                {
                    file_type: [
                        _describe_file(
                            folder_path,
                            name,
                            file_type.lower() == "csv"
                            or name.lower().endswith(".csv"),
                        )
                        for name in file_list
                    ]
                    for file_type, file_list in file_item.items()
                }
            )
        else:
            # A bare file-name string.
            processed["files"].append(
                _describe_file(
                    folder_path,
                    file_item,
                    file_item.lower().endswith(".csv"),
                )
            )
    return processed


def build_file_paths(yaml_data):
    """Process YAML and return a structure with full paths and file sizes.

    For every section, each listed file is resolved against the section's
    ``path`` and annotated with its on-disk size; CSV files additionally
    get a (truncated) list of their header columns.

    Parameters
    ----------
    yaml_data : dict
        Parsed YAML mapping section names to ``{"path": ..., "files": ...}``
        entries. The input is NOT modified.

    Returns
    -------
    dict
        New mapping with the same sections, where every file entry is a
        ``{"path": ..., "size": ...}`` (plus ``"headers"`` for CSVs) dict.
    """
    result = {}
    for section_name, section_data in yaml_data.items():
        # Build a fresh section dict instead of section_data.copy():
        # a shallow copy aliases the nested "files" dict, so rebinding
        # result[...]["files"][set_name] would also rewrite the caller's
        # yaml_data in place.
        section = {k: v for k, v in section_data.items() if k != "files"}
        base_path = section_data["path"]

        if "files" in section_data:
            section["files"] = {
                set_name: [
                    _process_folder(base_path, folder_item)
                    for folder_item in folders
                ]
                for set_name, folders in section_data["files"].items()
            }

        result[section_name] = section
    return result
92+
93+
94+
def main():
    """Parse ``sample_files.yaml`` and write the enriched result back out.

    Loads the sample-file description from the working directory,
    resolves every entry to a full path with size (and CSV headers),
    then dumps the result to ``sample_files_parsed.yaml``.
    """
    # Load the raw description of the sample file layout.
    with open("sample_files.yaml", "r") as source:
        raw_layout = yaml.safe_load(source)

    enriched = build_file_paths(raw_layout)

    # Persist, preserving insertion order to keep diffs readable.
    with open("sample_files_parsed.yaml", "w") as sink:
        yaml.dump(enriched, sink, default_flow_style=False, sort_keys=False)

    print("Processed YAML file has been saved to: sample_files_parsed.yaml")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)