# Makefile for Spark Sparse Product project.
# Customize these paths for your environment.
# -----------------------------------------------------------
spark.root=/Users/landonneal/jacksonn/CS6240/spark-3.1.2-bin-without-hadoop
hadoop.root=/Users/landonneal/jacksonn/CS6240/hadoop-3.3.0
app.name=Sparse Product
jar.name=spark-sparse-product.jar
maven.jar.name=spark-sparse-product-1.0.jar
job.name=product.SparseProduct
local.master=local[4]
local.input.a=input/a
local.input.b=input/b
local.output=output

# Pseudo-Cluster Execution
hdfs.user.name=landonneal
hdfs.input=input
hdfs.output=output

# AWS EMR Execution
aws.emr.release=emr-6.2.0
aws.bucket.name=spark-sparse-product-jn
aws.input.a=input/a
aws.input.b=input/b
aws.output=output
aws.log.dir=log
aws.num.nodes=5
aws.instance.type=m5.xlarge
# -----------------------------------------------------------

# Compiles code and builds jar (with dependencies).
jar:
	mvn clean package
	cp target/${maven.jar.name} ${jar.name}

# Removes local output directory.
clean-local-output:
	rm -rf ${local.output}*

# Runs standalone.
local: jar clean-local-output
	spark-submit --class ${job.name} --master ${local.master} --name "${app.name}" ${jar.name} ${local.input.a} ${local.input.b} ${local.output}
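
# Example: `make local` builds the jar and runs the job on ${local.master},
# reading input/a and input/b and writing to output/ (paths come from the
# local.* variables above).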

# Start HDFS
start-hdfs:
	${hadoop.root}/sbin/start-dfs.sh

# Stop HDFS
stop-hdfs:
	${hadoop.root}/sbin/stop-dfs.sh

# Start YARN
start-yarn: stop-yarn
	${hadoop.root}/sbin/start-yarn.sh

# Stop YARN
stop-yarn:
	${hadoop.root}/sbin/stop-yarn.sh

# Reformats & initializes HDFS.
format-hdfs: stop-hdfs
	rm -rf /tmp/hadoop*
	${hadoop.root}/bin/hdfs namenode -format

# Initializes user & input directories of HDFS.
init-hdfs: start-hdfs
	${hadoop.root}/bin/hdfs dfs -rm -r -f /user
	${hadoop.root}/bin/hdfs dfs -mkdir /user
	${hadoop.root}/bin/hdfs dfs -mkdir /user/${hdfs.user.name}
	${hadoop.root}/bin/hdfs dfs -mkdir /user/${hdfs.user.name}/${hdfs.input}
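
# NOTE: the pseudo-cluster targets below assume the job's two inputs end up in
# HDFS under ${hdfs.input}/a and ${hdfs.input}/b (mirroring the local input/a
# and input/b layout); adjust the paths if your HDFS layout differs.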

# Load data to HDFS
upload-input-hdfs: start-hdfs
	${hadoop.root}/bin/hdfs dfs -put ${local.input.a} /user/${hdfs.user.name}/${hdfs.input}
	${hadoop.root}/bin/hdfs dfs -put ${local.input.b} /user/${hdfs.user.name}/${hdfs.input}

# Removes HDFS output directory.
clean-hdfs-output:
	${hadoop.root}/bin/hdfs dfs -rm -r -f ${hdfs.output}*

# Download output from HDFS to local.
download-output-hdfs:
	mkdir ${local.output}
	${hadoop.root}/bin/hdfs dfs -get ${hdfs.output}/* ${local.output}

# Runs pseudo-clustered (ALL). ONLY RUN THIS ONCE, THEN USE: make pseudoq
# Make sure Hadoop is set up (in ${hadoop.root}/etc/hadoop) for pseudo-clustered operation (not standalone).
# https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation
pseudo: jar stop-yarn format-hdfs init-hdfs upload-input-hdfs start-yarn clean-local-output
	spark-submit --class ${job.name} --master yarn --deploy-mode cluster ${jar.name} ${hdfs.input}/a ${hdfs.input}/b ${hdfs.output}
	make download-output-hdfs

# Runs pseudo-clustered (quickie).
pseudoq: jar clean-local-output clean-hdfs-output
	spark-submit --class ${job.name} --master yarn --deploy-mode cluster ${jar.name} ${hdfs.input}/a ${hdfs.input}/b ${hdfs.output}
	make download-output-hdfs
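
# Typical pseudo-cluster sequence (a suggestion, assuming the Hadoop configs in
# config/pseudo): run `make switch-pseudo` once, then `make pseudo` for the first
# run (it formats HDFS and reloads the input), and `make pseudoq` for later runs.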

# Create S3 bucket.
make-bucket:
	aws s3 mb s3://${aws.bucket.name}

# Upload data to S3 input dir.
upload-input-aws: make-bucket
	aws s3 sync ${local.input.a} s3://${aws.bucket.name}/${aws.input.a}
	aws s3 sync ${local.input.b} s3://${aws.bucket.name}/${aws.input.b}

# Delete S3 output dir.
delete-output-aws:
	aws s3 rm s3://${aws.bucket.name}/ --recursive --exclude "*" --include "${aws.output}*"

# Upload application to S3 bucket.
upload-app-aws:
	aws s3 cp ${jar.name} s3://${aws.bucket.name}

# Main EMR launch.
aws: jar upload-app-aws delete-output-aws
	aws emr create-cluster \
		--name "Sparse Product Spark Cluster" \
		--release-label ${aws.emr.release} \
		--instance-groups '[{"InstanceCount":${aws.num.nodes},"InstanceGroupType":"CORE","InstanceType":"${aws.instance.type}"},{"InstanceCount":1,"InstanceGroupType":"MASTER","InstanceType":"${aws.instance.type}"}]' \
		--applications Name=Hadoop Name=Spark \
		--steps Type=CUSTOM_JAR,Name="${app.name}",Jar="command-runner.jar",ActionOnFailure=TERMINATE_CLUSTER,Args=["spark-submit","--deploy-mode","cluster","--class","${job.name}","s3://${aws.bucket.name}/${jar.name}","s3://${aws.bucket.name}/${aws.input.a}","s3://${aws.bucket.name}/${aws.input.b}","s3://${aws.bucket.name}/${aws.output}"] \
		--log-uri s3://${aws.bucket.name}/${aws.log.dir} \
		--use-default-roles \
		--enable-debugging \
		--auto-terminate
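
# The EMR step pulls the JAR and both inputs from S3 and writes to
# s3://${aws.bucket.name}/${aws.output}; step and cluster logs end up under
# s3://${aws.bucket.name}/${aws.log.dir} (EMR pushes them periodically, so they
# may appear a few minutes after the cluster terminates).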

# Download output from S3.
download-output-aws: clean-local-output
	mkdir ${local.output}
	aws s3 sync s3://${aws.bucket.name}/${aws.output} ${local.output}

# Change to standalone mode.
switch-standalone:
	cp config/standalone/*.xml ${hadoop.root}/etc/hadoop

# Change to pseudo-cluster mode.
switch-pseudo:
	cp config/pseudo/*.xml ${hadoop.root}/etc/hadoop
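
# Suggested end-to-end EMR workflow (not enforced by the targets above):
#   make upload-input-aws      # once, to create the bucket and upload input/a and input/b
#   make aws                   # builds the jar, uploads it, clears old output, launches the cluster
#   make download-output-aws   # after the cluster finishes, pulls results into output/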