Skip to content

Commit de0cdc3

Browse files
author
Adi
committed
Add gpu demos
cats and dogs with horovod + nvidia horovod benchmark
1 parent e683ca3 commit de0cdc3

File tree

6 files changed

+1519
-0
lines changed

6 files changed

+1519
-0
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# GPU Demos - Prerequisite"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"### In order to run the GPU demos you need to make sure that the systme has GPU and Horovod service is deployed<br>\n",
15+
"\n",
16+
"### Note that in version 2.3 the deployment of Horovod is being done by Iguazio support team and it's not part of the default deployment\n",
17+
"Please contact support for additional information"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": null,
23+
"metadata": {},
24+
"outputs": [],
25+
"source": []
26+
}
27+
],
28+
"metadata": {
29+
"kernelspec": {
30+
"display_name": "Python 3",
31+
"language": "python",
32+
"name": "python3"
33+
},
34+
"language_info": {
35+
"codemirror_mode": {
36+
"name": "ipython",
37+
"version": 3
38+
},
39+
"file_extension": ".py",
40+
"mimetype": "text/x-python",
41+
"name": "python",
42+
"nbconvert_exporter": "python",
43+
"pygments_lexer": "ipython3",
44+
"version": "3.6.8"
45+
}
46+
},
47+
"nbformat": 4,
48+
"nbformat_minor": 2
49+
}
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"!pip install git+https://github.com/v3io/v3io-gputils"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 10,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"HOROVOD_JOB_NAME = \"horovod-tf-benchmark\""
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": 11,
24+
"metadata": {},
25+
"outputs": [
26+
{
27+
"name": "stdout",
28+
"output_type": "stream",
29+
"text": [
30+
"{'apiVersion': 'kubeflow.org/v1alpha1',\n",
31+
" 'kind': 'MPIJob',\n",
32+
" 'metadata': {'creationTimestamp': '2019-07-01T18:22:26Z',\n",
33+
" 'generation': 1,\n",
34+
" 'name': 'horovod-tf-benchmark',\n",
35+
" 'namespace': 'default-tenant',\n",
36+
" 'resourceVersion': '1325438',\n",
37+
" 'selfLink': '/apis/kubeflow.org/v1alpha1/namespaces/default-tenant/mpijobs/horovod-tf-benchmark',\n",
38+
" 'uid': '2da132c2-9c2d-11e9-98d3-d8c4972b0204'},\n",
39+
" 'spec': {'replicas': 8,\n",
40+
" 'template': {'spec': {'containers': [{'command': ['mpirun',\n",
41+
" 'python',\n",
42+
" 'scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py',\n",
43+
" '--batch_size=32',\n",
44+
" '--model=resnet50',\n",
45+
" '--variable_update=horovod',\n",
46+
" '--use_fp16=true',\n",
47+
" '--xla=true'],\n",
48+
" 'image': 'iguaziodocker/horovod:0.1.1',\n",
49+
" 'name': 'horovod-tf-benchmark',\n",
50+
" 'resources': {'limits': {'nvidia.com/gpu': 1}},\n",
51+
" 'securityContext': {'capabilities': {'add': ['IPC_LOCK']}},\n",
52+
" 'volumeMounts': [{'mountPath': '/User',\n",
53+
" 'name': 'v3io'}]}],\n",
54+
" 'volumes': [{'flexVolume': {'driver': 'v3io/fuse',\n",
55+
" 'options': {'accessKey': '1e52ff93-a541-4880-abf1-d9b948af77de',\n",
56+
" 'container': 'users',\n",
57+
" 'subPath': '/iguazio'}},\n",
58+
" 'name': 'v3io'}]}}}}\n"
59+
]
60+
}
61+
],
62+
"source": [
63+
"from v3io_gputils.mpijob import MpiJob\n",
64+
"\n",
65+
"job = MpiJob(HOROVOD_JOB_NAME, 'iguaziodocker/horovod:0.1.1', ['scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py',\n",
66+
" '--batch_size=32',\n",
67+
" '--model=resnet50',\n",
68+
" '--variable_update=horovod',\n",
69+
" '--use_fp16=true',\n",
70+
" '--xla=true'])\n",
71+
"job.replicas(8).gpus(1)\n",
72+
"job.submit()\n"
73+
]
74+
},
75+
{
76+
"cell_type": "code",
77+
"execution_count": 14,
78+
"metadata": {},
79+
"outputs": [
80+
{
81+
"name": "stdout",
82+
"output_type": "stream",
83+
"text": [
84+
"horovod-tf-benchmark-worker-0 0/1 Pending 0 25s\n",
85+
"horovod-tf-benchmark-worker-1 0/1 Pending 0 25s\n",
86+
"horovod-tf-benchmark-worker-2 0/1 Pending 0 25s\n",
87+
"horovod-tf-benchmark-worker-3 0/1 Pending 0 25s\n",
88+
"horovod-tf-benchmark-worker-4 0/1 Pending 0 25s\n",
89+
"horovod-tf-benchmark-worker-5 0/1 Pending 0 25s\n",
90+
"horovod-tf-benchmark-worker-6 0/1 Pending 0 25s\n",
91+
"horovod-tf-benchmark-worker-7 0/1 Pending 0 25s\n"
92+
]
93+
}
94+
],
95+
"source": [
96+
"!kubectl get pods | grep $HOROVOD_JOB_NAME"
97+
]
98+
},
99+
{
100+
"cell_type": "code",
101+
"execution_count": 6,
102+
"metadata": {},
103+
"outputs": [
104+
{
105+
"name": "stdout",
106+
"output_type": "stream",
107+
"text": [
108+
"apiVersion: kubeflow.org/v1alpha1\n",
109+
"kind: MPIJob\n",
110+
"metadata:\n",
111+
" creationTimestamp: 2019-06-29T14:40:51Z\n",
112+
" generation: 1\n",
113+
" name: horovod-tf-benchmark\n",
114+
" namespace: default-tenant\n",
115+
" resourceVersion: \"1069905\"\n",
116+
" selfLink: /apis/kubeflow.org/v1alpha1/namespaces/default-tenant/mpijobs/horovod-tf-benchmark\n",
117+
" uid: e45f8c83-9a7b-11e9-98d3-d8c4972b0204\n",
118+
"spec:\n",
119+
" backoffLimit: 6\n",
120+
" replicas: 8\n",
121+
" template:\n",
122+
" metadata:\n",
123+
" creationTimestamp: null\n",
124+
" spec:\n",
125+
" containers:\n",
126+
" - command:\n",
127+
" - mpirun\n",
128+
" - python\n",
129+
" - scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py\n",
130+
" - --batch_size=32\n",
131+
" - --model=resnet50\n",
132+
" - --variable_update=horovod\n",
133+
" - --use_fp16=true\n",
134+
" - --xla=true\n",
135+
" image: iguaziodocker/horovod:0.1.1\n",
136+
" name: horovod-tf-benchmark\n",
137+
" resources:\n",
138+
" limits:\n",
139+
" nvidia.com/gpu: \"1\"\n",
140+
" securityContext:\n",
141+
" capabilities:\n",
142+
" add:\n",
143+
" - IPC_LOCK\n",
144+
" volumeMounts:\n",
145+
" - mountPath: /User\n",
146+
" name: v3io\n",
147+
" volumes:\n",
148+
" - flexVolume:\n",
149+
" driver: v3io/fuse\n",
150+
" options:\n",
151+
" accessKey: 1e52ff93-a541-4880-abf1-d9b948af77de\n",
152+
" container: users\n",
153+
" subPath: /iguazio\n",
154+
" name: v3io\n",
155+
"status:\n",
156+
" completionTime: 2019-06-29T14:42:13Z\n",
157+
" launcherStatus: Succeeded\n",
158+
" startTime: 2019-06-29T14:40:59Z\n",
159+
" workerReplicas: 8\n"
160+
]
161+
}
162+
],
163+
"source": [
164+
"!kubectl get mpijob $HOROVOD_JOB_NAME -o yaml"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": 7,
170+
"metadata": {},
171+
"outputs": [
172+
{
173+
"name": "stdout",
174+
"output_type": "stream",
175+
"text": [
176+
"{'apiVersion': 'v1',\n",
177+
" 'details': {'group': 'kubeflow.org',\n",
178+
" 'kind': 'mpijobs',\n",
179+
" 'name': 'horovod-tf-benchmark',\n",
180+
" 'uid': 'e45f8c83-9a7b-11e9-98d3-d8c4972b0204'},\n",
181+
" 'kind': 'Status',\n",
182+
" 'metadata': {},\n",
183+
" 'status': 'Success'}\n"
184+
]
185+
}
186+
],
187+
"source": [
188+
"job.delete()"
189+
]
190+
},
191+
{
192+
"cell_type": "code",
193+
"execution_count": null,
194+
"metadata": {},
195+
"outputs": [],
196+
"source": []
197+
}
198+
],
199+
"metadata": {
200+
"kernelspec": {
201+
"display_name": "Python 3",
202+
"language": "python",
203+
"name": "python3"
204+
},
205+
"language_info": {
206+
"codemirror_mode": {
207+
"name": "ipython",
208+
"version": 3
209+
},
210+
"file_extension": ".py",
211+
"mimetype": "text/x-python",
212+
"name": "python",
213+
"nbconvert_exporter": "python",
214+
"pygments_lexer": "ipython3",
215+
"version": "3.6.8"
216+
}
217+
},
218+
"nbformat": 4,
219+
"nbformat_minor": 2
220+
}

0 commit comments

Comments
 (0)