Skip to content

Commit ae499ec

Browse files
author
jtirana98
committed
ALL DONE
1 parent 167839b commit ae499ec

9 files changed

Lines changed: 99 additions & 105 deletions

File tree

CMakeLists.txt

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,19 +58,19 @@ set(SOURCE_FILES
5858
)
5959

6060
add_executable("${CMAKE_PROJECT_NAME}" ${SOURCE_FILES} main.cpp)
61-
#add_executable(compute_node ${SOURCE_FILES} pipeline_simulation/compute_node.cpp)
62-
#add_executable(data_owner ${SOURCE_FILES} pipeline_simulation/data_owner.cpp)
63-
#add_executable(simulated_data_owner ${SOURCE_FILES} pipeline_simulation/profiling/data_owner_simulated.cpp)
64-
#add_executable(aggregator ${SOURCE_FILES} pipeline_simulation/aggregator.cpp)
61+
add_executable(compute_node ${SOURCE_FILES} pipeline_simulation/compute_node.cpp)
62+
add_executable(data_owner ${SOURCE_FILES} pipeline_simulation/data_owner.cpp)
63+
add_executable(simulated_data_owner ${SOURCE_FILES} pipeline_simulation/profiling/data_owner_simulated.cpp)
64+
add_executable(aggregator ${SOURCE_FILES} pipeline_simulation/aggregator.cpp)
6565

6666
target_link_libraries("${CMAKE_PROJECT_NAME}" ${TORCH_LIBRARIES})
67-
#target_link_libraries(compute_node ${TORCH_LIBRARIES})
68-
#target_link_libraries(data_owner ${TORCH_LIBRARIES})
69-
#target_link_libraries(simulated_data_owner ${TORCH_LIBRARIES})
70-
#target_link_libraries(aggregator ${TORCH_LIBRARIES})
67+
target_link_libraries(compute_node ${TORCH_LIBRARIES})
68+
target_link_libraries(data_owner ${TORCH_LIBRARIES})
69+
target_link_libraries(simulated_data_owner ${TORCH_LIBRARIES})
70+
target_link_libraries(aggregator ${TORCH_LIBRARIES})
7171

7272
set_property(TARGET ${CMAKE_PROJECT_NAME} PROPERTY CXX_STANDARD 17)
73-
#set_property(TARGET compute_node PROPERTY CXX_STANDARD 17)
74-
#set_property(TARGET data_owner PROPERTY CXX_STANDARD 17)
75-
#set_property(TARGET simulated_data_owner PROPERTY CXX_STANDARD 17)
76-
#set_property(TARGET aggregator PROPERTY CXX_STANDARD 17)
73+
set_property(TARGET compute_node PROPERTY CXX_STANDARD 17)
74+
set_property(TARGET data_owner PROPERTY CXX_STANDARD 17)
75+
set_property(TARGET simulated_data_owner PROPERTY CXX_STANDARD 17)
76+
set_property(TARGET aggregator PROPERTY CXX_STANDARD 17)

README.md

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
--------------------------------------------------------------------------------
66

7-
For a more detailed discription of the documentation follow this [link](https://docs.google.com/document/d/1DaWOX27c4_4_VUT-l_UrgUV-zFa8UsIZ5zUv06pgc0s/edit?usp=sharing)
7+
For a more detailed description of the documentation, follow this [link](https://docs.google.com/document/d/1DaWOX27c4_4_VUT-l_UrgUV-zFa8UsIZ5zUv06pgc0s/edit?usp=sharing) or check the wiki.
88

99

1010
Repository structure:
@@ -65,8 +65,46 @@ How to run program and connect Libtorch:
6565

6666
Running SplitPipe in a distributed manner:
6767

68-
- configuring root-table
69-
- enable mulit-task (if applicable)
70-
- parameters for each entity.
71-
- include a figure of the structure.
72-
- emulated version.
68+
*Case 0: Model profiling*
69+
Example code is in main.cpp; you can get either the delay for each batch or the per-layer delay.
70+
71+
*Case 1: Real system*
72+
73+
In this case you will run the data owners as real devices. You can run all entities on one machine or use different devices (within the same network).
74+
75+
- If you cannot use multicast:
76+
- comment out the following parts in the code:
77+
- in data_owner.cpp: Comment the findPeers() call and the findInit()
78+
- in compute_node.cpp: Comment the findInit()
79+
- in aggregator.cpp: Comment the findInit()
80+
- in network_layer.cpp: comment out line 506 of version 1.0.0
81+
- update the routing table (`rooting_table`) in pipeline_simulation/network_layer.h
82+
83+
For each data owner device call:
84+
85+
$ ./data_owner -i id -d <number-of-data-owners> -c <number-of-compute-nodes> -s <split-rule>
86+
87+
If the node is not the init data owner, you only need to give the node's id.
88+
89+
For each compute node device call:
90+
91+
$ ./compute_node -i id
92+
93+
or use script run_cn.sh in pipeline_simulation/profiling
94+
95+
For the aggregator:
96+
97+
$ ./aggregator -i id -d <number-of-data-owners> -c <number-of-compute-nodes>
98+
99+
or use script run_aggr.sh in pipeline_simulation/profiling
100+
101+
NOTE: There is support for logging and checkpointing, but this feature is deactivated in this version. You can use utils/pipeline_logging.sh to do so.
102+
103+
*Case 2: Emulated environment*
104+
105+
In this case the data owners run in an emulated environment. Note that this version does not support multicast.
106+
You can add the device characteristics in pipeline_simulation/profiling/rpi_stat.h and use the scripts run_data_owners_init.sh and run_data_owners_worker.sh in
107+
pipeline_simulation/profiling.
108+
The results are stored in log files as indicated in the script files (change them accordingly).
109+
110+
The code for the emulated data owner is in pipeline_simulation/profiling/data_owner_simulated.cpp

main.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ int main(int argc, char **argv) {
1313
std::vector<int> batches{32, 64, 128};
1414
std::vector<int> splits{2, 4, 6, 8};
1515

16-
// train_resnet(CIFAR_10, resnet101, false, 128, std::vector<int>(), false);
16+
train_resnet(CIFAR_10, resnet101, false, 128, std::vector<int>(), false); //get the latency for one batch update for the whole model
1717

18-
//train_resnet(CIFAR_10, resnet101, true, 128);
19-
train_vgg(CIFAR_10, v19, false, 128, std::vector<int>(), false);
20-
//train_vgg(CIFAR_10, v19, true, 128);
18+
train_resnet(CIFAR_10, resnet101, true, 128); // get the per-layer latency
19+
train_vgg(CIFAR_10, v19, false, 128, std::vector<int>(), false); //get the latency for one batch update for the whole model
20+
train_vgg(CIFAR_10, v19, true, 128); // get the per-layer latency
2121
}

pipeline_simulation/profiling/data_owner_simulated.cpp

Lines changed: 9 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,8 @@ int main(int argc, char **argv) {
2828
refactoring_data client_message;
2929
// check if you are the init
3030
if (myID == 0) {
31-
3231
auto cut_layers_ = "10,19";
33-
//auto data_owners_ = argv[2]; // CHANGE
3432
int num_data_owners = atoi(argv[2]);
35-
//std::cout << data_owners_ << std::endl;
36-
//auto compute_nodes_ = atoi(argv[3]);
3733
int num_compute_nodes = 1;
3834

3935
if(argc >= 4)
@@ -43,7 +39,6 @@ int main(int argc, char **argv) {
4339
cut_layers_ = "3,13,19";
4440
if (num_compute_nodes == 3)
4541
cut_layers_ = "2,15,25,35";//"3,8,14,19";
46-
//if (num_compute_nodes == 4)
4742

4843
const char separator = ',';
4944
std::string val;
@@ -56,13 +51,6 @@ int main(int argc, char **argv) {
5651
}
5752
}
5853

59-
/*streamData = std::stringstream(data_owners_);
60-
while (std::getline(streamData, val, separator)) {
61-
if (val != "") {
62-
data_owners.push_back(stoi(val));
63-
}
64-
}*/
65-
6654
data_owners.push_back(0);
6755
for (int i = 0; i < num_data_owners-1; i++) {
6856
data_owners.push_back(i+3 +1);
@@ -75,16 +63,7 @@ int main(int argc, char **argv) {
7563
}
7664

7765
int num_parts = compute_nodes.size() + 2;
78-
79-
//std::cout << "found them" << std::endl;
80-
//sleep(2);
81-
82-
int data_onwer_end = 2;
83-
int data_owner_beg = 8;
84-
85-
int model_name = 2;
86-
int model_type = 3;
87-
66+
8867
client_message.dataset = CIFAR_10;
8968
client_message.model_name_ = model_name::resnet;//model_name::vgg;
9069
client_message.model_type_ =resnet_model::resnet101;//vgg_model::v19;
@@ -107,24 +86,6 @@ int main(int argc, char **argv) {
10786
my_port = my_port + (data_owners[i] +3);
10887
sys_.my_network_layer.rooting_table.insert({data_owners[i], std::pair<std::string, int>(my_addr.first, my_port)});
10988
}
110-
/*else if(data_owners[i] > 13 && data_owners[i] < 18) {
111-
std::pair<std::string, int> my_addr = sys_.my_network_layer.rooting_table.find(13)->second;
112-
int my_port = my_addr.second;
113-
my_port = my_port + (data_owners[i] - 13);
114-
sys_.my_network_layer.rooting_table.insert({data_owners[i], std::pair<std::string, int>(my_addr.first, my_port)});
115-
}
116-
else if (data_owners[i] > 18 && data_owners[i] < 33){
117-
std::pair<std::string, int> my_addr = sys_.my_network_layer.rooting_table.find(18)->second;
118-
int my_port = my_addr.second;
119-
my_port = my_port + (data_owners[i] - 18);
120-
sys_.my_network_layer.rooting_table.insert({data_owners[i], std::pair<std::string, int>(my_addr.first, my_port)});
121-
}
122-
else if (data_owners[i] > 33 && data_owners[i] < 43) {
123-
std::pair<std::string, int> my_addr = sys_.my_network_layer.rooting_table.find(33)->second;
124-
int my_port = my_addr.second;
125-
my_port = my_port + (data_owners[i] - 33);
126-
sys_.my_network_layer.rooting_table.insert({data_owners[i], std::pair<std::string, int>(my_addr.first, my_port)});
127-
}*/
12889
else if (data_owners[i] >= 18) {
12990
std::pair<std::string, int> my_addr = sys_.my_network_layer.rooting_table.find(18)->second;
13091
int my_port = my_addr.second;
@@ -201,20 +162,16 @@ int main(int argc, char **argv) {
201162
auto send_gradients = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
202163
int epoch_count = 0, g_epoch_count = 0; // communication round
203164
bool new_r = true;
204-
for (size_t round = 0; round != sys_.rounds; ++round) {
165+
for (size_t round = 0; round != sys_.rounds; round++) {
205166
int batch_index = 0;
206167
sys_.zero_metrics();
207168
int total_num = 0;
208169
if (new_r) {
209-
//std::cout << "New round " << std::endl;
210170
init_epoch = std::chrono::steady_clock::now();
211171
new_r = false;
212172
}
213173

214174
send_activations = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch());
215-
//long c = send_activations.count();
216-
//std::cout << c << std::endl;
217-
//std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count() << std::endl;
218175
int g_i = 0;
219176
for (int inter_batch = 0; inter_batch < 4; inter_batch++ ) {
220177
for (auto& batch : *train_dataloader) {
@@ -225,8 +182,7 @@ int main(int argc, char **argv) {
225182
task.size_ = batch.data.size(0);
226183
task.values = batch.data;
227184
task = sys_.exec(task, batch.target);
228-
//task.t_start = send_activations.count();
229-
//std::cout << task.t_start << std::endl;
185+
230186
total_num += task.size_;
231187
task.batch0 = batch_index;
232188
auto end_f1 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
@@ -249,7 +205,7 @@ int main(int argc, char **argv) {
249205
//std::cout << "f1-end: " << end_f1-send_activations.count() << std::endl;
250206
// send task to next node
251207
task.t_start = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
252-
//usleep(myID*200);
208+
253209
std::cout << "Send forward task to C1 " << end_f1-send_activations.count() << std::endl;
254210
sys_.my_network_layer.new_message(task, sys_.inference_path[0]);
255211

@@ -261,7 +217,7 @@ int main(int argc, char **argv) {
261217
task = sys_.exec(task, batch.target); // forward and backward
262218
// send task - backward
263219
auto end_m2 = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
264-
//std::cout << "m2-end: " << end_m2-send_gradients.count() << std::endl;
220+
265221

266222
real_duration = 0;
267223
real_duration = my_rpi.rpi_fbm2;
@@ -274,13 +230,13 @@ int main(int argc, char **argv) {
274230
usleep(real_duration-(end_m2-send_gradients.count()));
275231
}
276232

277-
//task.t_start = send_gradients.count();
233+
278234
task.t_start = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
279-
//usleep(myID*110);
235+
280236
std::cout << "Send backprop task to C1 " << end_f1-send_activations.count() << std::endl;
281237
sys_.my_network_layer.new_message(task, sys_.inference_path[1]);
282-
//optimize task
283238

239+
//optimize task
284240
auto task1 = sys_.my_network_layer.check_new_task();
285241

286242
task1 = sys_.exec(task1, batch.target); // optimize
@@ -297,8 +253,7 @@ int main(int argc, char **argv) {
297253

298254
//if (g_i % 50 == 0)
299255
std::cout << "One batch: global epoch " << g_epoch_count+1 << " local epoch: " << epoch_count+1 <<" b: " << batch_index+1 << " is " << _time << std::endl;
300-
301-
// end of batch
256+
302257
batch_index++;
303258
g_i++;
304259
}

pipeline_simulation/profiling/run_data_owners.sh renamed to pipeline_simulation/profiling/run_data_owners_init.sh

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,14 @@ mkdir -p /root/experiments/simulations_check/compute_nodes_$2/dataowners_$1_
1111
declare -i start=0
1212
declare -i end=0
1313
declare -i port=0
14-
start=$(( $2 + 1 ))
15-
end=$(( $1-2 + $start ))
14+
declare -i port_start=3
15+
start=$(( 18 ))
16+
end=$(( $1 + $start -1))
1617

1718
for i in $(seq $start 1 $end)
1819
do
19-
port=$(( 8081 + $2 + $i ))
20+
port=$(( 8081 + $port_start ))
2021
sudo iptables -I INPUT -p tcp -m tcp --dport $port -j ACCEPT
2122
../../build/simulated_data_owner $i > "/root/experiments/simulations_check/compute_nodes_$2/dataowners_$1_/d$i.data" &
22-
done
23-
24-
sudo iptables -I INPUT -p tcp -m tcp --dport 8081 -j ACCEPT
25-
../../build/simulated_data_owner 0 $1 $2 > "/root/experiments/simulations_check/compute_nodes_$2/dataowners_$1_/d0.data" &
23+
port_start=$(( $port_start + 1 ))
24+
done
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash -xe
2+
3+
# 1: num of data owners
4+
# 2: num of compute nodes
5+
6+
export LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64
7+
8+
mkdir -p /root/experiments/simulations_check/compute_nodes_$2
9+
mkdir -p /root/experiments/simulations_check/compute_nodes_$2/dataowners_$1_
10+
11+
declare -i start=0
12+
declare -i end=0
13+
declare -i port=0
14+
declare -i port_start=0
15+
start=$(( 18 ))
16+
end=$(( $1 + $start -1))
17+
18+
for i in $(seq $start 1 $end)
19+
do
20+
port=$(( 8081 + $port_start ))
21+
sudo iptables -I INPUT -p tcp -m tcp --dport $port -j ACCEPT
22+
../../build/simulated_data_owner $i > "/root/experiments/simulations_check/compute_nodes_$2/dataowners_$1_/d$i.data" &
23+
port_start=$(( $port_start + 1 ))
24+
done

pipeline_simulation/profiling/run_exper.sh

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)