-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdvc.yaml.jinja
More file actions
120 lines (103 loc) · 4.44 KB
/
dvc.yaml.jinja
File metadata and controls
120 lines (103 loc) · 4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# DVC pipeline definition, rendered from this Jinja template.
# Two expansion phases happen here:
#   1. Jinja ({% ... %} / {{ ... }}) runs at template-render time and inlines
#      the dataset/model lists supplied to the renderer.
#   2. ${...} placeholders are resolved by DVC itself (from params.yaml and
#      the vars section below) at `dvc repro` time.
vars:
  - params.yaml
  - datasets:
    {%- for dataset in datasets %}
      # Quoted so numeric- or boolean-looking names stay YAML strings.
      - name: "{{ dataset.name }}"
        aws_prefix: "{{ dataset.aws_prefix }}"
    {%- endfor %}
  - models:
    {%- for model in models %}
      - name: "{{ model.name }}"
        aws_prefix: "{{ model.aws_prefix }}"
        dockerfile: "{{ model.dockerfile }}"
    {%- endfor %}

stages:
  # Create the local directories later stages write into. The marker file
  # gives downstream stages a concrete artifact to depend on.
  setup:
    cmd:
      - mkdir -p logs ${destination.output_dir} ${destination.metric_dir}
      - echo "Created local directories" > logs/setup.txt
    outs:
      - logs/setup.txt

  # Push dataset archives (*.zip) and model READMEs to the S3 training-data
  # prefix used by the SageMaker jobs.
  upload_to_s3:
    cmd:
      - aws s3 cp ${source.datasets_dir}/ s3://${aws.s3_training_data_prefix}/datasets --recursive --exclude "*" --include "*.zip"
      - aws s3 cp ${source.models_dir}/ s3://${aws.s3_training_data_prefix}/models --recursive --exclude "*" --include "README.md"
      - echo "Upload completed at $(date)" > logs/s3_upload_complete.txt
    deps:
      - logs/setup.txt
      - ${source.datasets_dir}/
      - ${source.models_dir}/
    outs:
      - logs/s3_upload_complete.txt:
          cache: true

  # Build and push one container image per model (multi-arch, pushed to ECR).
  # NOTE: outputs are suffixed with the model name because DVC rejects two
  # stages (including matrix instances) that declare the same output path —
  # a shared logs/ecr_push_complete.txt would fail with more than one model.
  deploy_to_ecr:
    matrix:
      model: ${models}
    cmd:
      # Create the ECR repository on first use; ignore "already exists".
      - aws ecr describe-repositories --repository-names ${item.model.name} --region ${aws.region_name} >/dev/null 2>&1 || aws ecr create-repository --repository-name ${item.model.name} --region ${aws.region_name} >/dev/null
      - aws ecr get-login-password --region ${aws.region_name} | docker login --username AWS --password-stdin ${aws.account_id}.dkr.ecr.${aws.region_name}.amazonaws.com
      - docker buildx build --build-arg GIT_CACHE_BUST=${git.git_cache_bust} --platform linux/amd64,linux/arm64 --secret id=git_auth,src=../git-auth.txt -f ${item.model.dockerfile} -t ${aws.account_id}.dkr.ecr.${aws.region_name}.amazonaws.com/${item.model.name}:latest ../../.. --push
      - echo "ECR push completed at $(date)" > logs/ecr_push_complete_${item.model.name}.txt
      - echo "${aws.account_id}.dkr.ecr.${aws.region_name}.amazonaws.com/${item.model.name}:latest" > logs/image_uri_${item.model.name}.txt
    deps:
      - logs/setup.txt
      - ${item.model.dockerfile}
    outs:
      - logs/ecr_push_complete_${item.model.name}.txt:
          cache: true
      - logs/image_uri_${item.model.name}.txt:
          cache: true

  # Launch a SageMaker training job for every (dataset, model) pair. The CLI
  # prints the job name, which is captured so the monitor stage can poll it.
  create_training_job:
    matrix:
      dataset: ${datasets}
      model: ${models}
    cmd: >-
      uv run pg2-benchmark sagemaker create-training-job
      --model-name ${item.model.name}
      --region-name ${aws.region_name}
      --sagemaker-role-name ${aws.sagemaker_role_name}
      --ecr-repository-uri ${aws.account_id}.dkr.ecr.${aws.region_name}.amazonaws.com/${item.model.name}
      --s3-training-data-prefix ${aws.s3_training_data_prefix}
      --s3-output-prefix ${aws.s3_output_prefix}
      --instance-type ${aws.instance_type}
      --volume-size ${aws.volume_size}
      --dataset-prefix ${item.dataset.aws_prefix}
      --model-prefix ${item.model.aws_prefix}
      > logs/create_job_${item.dataset.name}_${item.model.name}.txt
    deps:
      - ../../../src/pg2_benchmark/cli/sagemaker.py
      - logs/s3_upload_complete.txt
      - logs/ecr_push_complete_${item.model.name}.txt
      - logs/image_uri_${item.model.name}.txt
    outs:
      - logs/create_job_${item.dataset.name}_${item.model.name}.txt:
          cache: true

  # Block until the training job finishes; the job name comes from the file
  # written by create_training_job.
  monitor_training_job:
    matrix:
      dataset: ${datasets}
      model: ${models}
    cmd: >-
      uv run pg2-benchmark sagemaker monitor-training-job
      --region-name ${aws.region_name}
      --job-name $(cat logs/create_job_${item.dataset.name}_${item.model.name}.txt)
      > logs/monitor_job_${item.dataset.name}_${item.model.name}.txt
    deps:
      - ../../../src/pg2_benchmark/cli/sagemaker.py
      - logs/create_job_${item.dataset.name}_${item.model.name}.txt
    outs:
      - logs/monitor_job_${item.dataset.name}_${item.model.name}.txt:
          cache: true

  # Pull the finished job's model.tar.gz, extract it, and compute the metric
  # CSV for this (dataset, model) pair.
  # NOTE(review): every matrix instance extracts into the shared
  # ${destination.output_dir} and reuses the model.tar.gz filename — safe
  # under DVC's default sequential execution, but not parallel-safe; confirm
  # before enabling parallel repro.
  calculate_metric:
    matrix:
      dataset: ${datasets}
      model: ${models}
    cmd:
      - aws s3 cp s3://${aws.s3_output_prefix}/$(cat logs/create_job_${item.dataset.name}_${item.model.name}.txt)/output/model.tar.gz ${destination.output_dir}/
      - tar -xzf ${destination.output_dir}/model.tar.gz -C ${destination.output_dir}/
      - rm ${destination.output_dir}/model.tar.gz
      - uv run pg2-benchmark metric calc --output-path ${destination.output_dir}/${item.dataset.name}_${item.model.name}.csv --metric-path ${destination.metric_dir}/${item.dataset.name}_${item.model.name}.csv
    deps:
      - logs/monitor_job_${item.dataset.name}_${item.model.name}.txt
    outs:
      - ${destination.metric_dir}/${item.dataset.name}_${item.model.name}.csv:
          cache: true