
Commit d7e511f

feat(ci)(1/N): Introduce integration tests with uniffle + spark (#531)
1 parent 2aaa8a5 commit d7e511f

8 files changed, +368 -0 lines changed

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
name: Spark SQL Integration Test

on:
  push:
    branches:
      - master
  pull_request:
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
  spark-sql-integration-test:
    runs-on: ubuntu-22.04
    timeout-minutes: 60

    steps:
      - name: Remove unnecessary files
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Spark SQL Test
        run: docker compose -f dev/integration/docker-compose.yml up
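For local debugging, the same test can be run outside CI with Docker Compose; a minimal sketch, assuming Compose v2 and the spark-sql-test service name from dev/integration/docker-compose.yml. The --exit-code-from flag makes the command return the test container's exit status (set by endpoint.sh), so a failed run also fails the shell command:

# Build the image and run the integration test locally (not part of this commit).
docker compose -f dev/integration/docker-compose.yml up --build --exit-code-from spark-sql-test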

dev/integration/Dockerfile

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
FROM centos:7

# install common tools
RUN echo "sslverify=false" >> /etc/yum.conf
RUN sed -i "s/mirror.centos.org/vault.centos.org/g" /etc/yum.repos.d/*.repo
RUN sed -i "s/^#.*baseurl=http/baseurl=https/g" /etc/yum.repos.d/*.repo
RUN sed -i "s/^mirrorlist/#mirrorlist/g" /etc/yum.repos.d/*.repo
RUN yum update -y
RUN yum install -y centos-release-scl epel-release
RUN sed -i "s/mirror.centos.org/vault.centos.org/g" /etc/yum.repos.d/*.repo
RUN sed -i "s/^#.*baseurl=http/baseurl=https/g" /etc/yum.repos.d/*.repo
RUN sed -i "s/^mirrorlist/#mirrorlist/g" /etc/yum.repos.d/*.repo
RUN yum install -y libzip unzip wget cmake3 openssl-devel llvm clang-devel clang krb5-workstation clang-devel git gcc

# install gcc-11
RUN yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++
RUN echo '. /opt/rh/devtoolset-11/enable' >> ~/.bashrc

RUN yum install -y llvm-toolset-7
RUN echo '. /opt/rh/llvm-toolset-7/enable' >> ~/.bashrc

# install rust nightly toolchain
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly-2025-06-01
ENV PATH="/root/.cargo/bin:${PATH}"
RUN rustc --version

# install java
RUN yum install -y java-1.8.0-openjdk java-1.8.0-openjdk-devel
ENV JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk"
RUN echo "export JAVA_HOME=${JAVA_HOME}" >> ~/.bashrc

# install maven
# RUN yum install -y rh-maven35
# RUN echo 'source /opt/rh/rh-maven35/enable' >> ~/.bashrc

# install protoc
RUN wget -O /protobuf-21.7-linux-x86_64.zip https://github.com/protocolbuffers/protobuf/releases/download/v21.7/protoc-21.7-linux-x86_64.zip
RUN mkdir /protobuf-bin && (cd /protobuf-bin && unzip /protobuf-21.7-linux-x86_64.zip)
ENV PATH="/protobuf-bin/bin:${PATH}"
RUN echo 'export PATH="/protobuf-bin/bin:$PATH"' >> ~/.bashrc

# attach libjvm.so
RUN echo 'export LD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/amd64/server:${LD_LIBRARY_PATH}' >> ~/.bashrc

# setup hadoop env
RUN curl -LsSf https://dlcdn.apache.org/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz | tar zxf - -C /root
ENV HADOOP_HOME=/root/hadoop-3.3.5
RUN echo "export HADOOP_HOME=${HADOOP_HOME}" >> ~/.bashrc
RUN echo "export CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath --glob)" >> ~/.bashrc
RUN echo "export HDRS_NAMENODE=default" >> ~/.bashrc
RUN echo "export HDRS_WORKDIR=/tmp/hdrs/" >> ~/.bashrc

# install python and pip
RUN yum install -y python3 python3-pip
RUN pip3 install --upgrade pip

# install netcat for port checking
RUN yum install -y nc

# install spark
ARG SPARK_VERSION=3.5.0
RUN cd /opt && \
    wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    mv spark-${SPARK_VERSION}-bin-hadoop3 spark && \
    rm spark-${SPARK_VERSION}-bin-hadoop3.tgz
ENV SPARK_HOME=/opt/spark
ENV PATH="${SPARK_HOME}/bin:${PATH}"

# install uniffle coordinator (must be before configuring it)
ARG UNIFFLE_VERSION=0.10.0
RUN cd /opt && \
    wget -q https://archive.apache.org/dist/uniffle/${UNIFFLE_VERSION}/apache-uniffle-${UNIFFLE_VERSION}-bin.tar.gz && \
    tar zxvf apache-uniffle-${UNIFFLE_VERSION}-bin.tar.gz && mv apache-uniffle-${UNIFFLE_VERSION}-hadoop2.8 uniffle && \
    rm -f apache-uniffle-${UNIFFLE_VERSION}-bin.tar.gz
ENV UNIFFLE_HOME=/opt/uniffle
ENV PATH="${UNIFFLE_HOME}/bin:${PATH}"

# configure uniffle coordinator (after UNIFFLE_HOME is set and uniffle is installed)
RUN mkdir -p ${UNIFFLE_HOME}/conf
COPY coordinator.conf ${UNIFFLE_HOME}/conf/coordinator.conf

# download uniffle spark client
RUN mkdir -p ${SPARK_HOME}/jars && \
    wget -q https://repo1.maven.org/maven2/org/apache/uniffle/rss-client-spark3-shaded/${UNIFFLE_VERSION}/rss-client-spark3-shaded-${UNIFFLE_VERSION}.jar \
        -O ${SPARK_HOME}/jars/rss-client.jar || \
    echo "Failed to download Uniffle Spark Client"

# setup riffle home directory
ENV RIFFLE_HOME=/opt/riffle
RUN mkdir -p ${RIFFLE_HOME}/conf

# configure riffle servers (place configs in RIFFLE_HOME/conf)
COPY riffle.conf.1 ${RIFFLE_HOME}/conf/riffle.conf.1
COPY riffle.conf.2 ${RIFFLE_HOME}/conf/riffle.conf.2

# configure spark to use riffle
COPY spark-defaults.conf ${SPARK_HOME}/conf/spark-defaults.conf

# copy endpoint script
COPY endpoint.sh /usr/local/bin/endpoint.sh
RUN chmod +x /usr/local/bin/endpoint.sh

# setup environment variables
RUN echo "export SPARK_HOME=${SPARK_HOME}" >> ~/.bashrc && \
    echo "export UNIFFLE_HOME=${UNIFFLE_HOME}" >> ~/.bashrc && \
    echo "export RIFFLE_HOME=${RIFFLE_HOME}" >> ~/.bashrc && \
    echo "export PYTHONPATH=\${SPARK_HOME}/python:\${SPARK_HOME}/python/lib/py4j-*.zip:\${PYTHONPATH}" >> ~/.bashrc

RUN echo "export RUST_BACKTRACE=1" >> ~/.bashrc

# Set entrypoint (no WORKDIR needed as endpoint.sh handles paths)
ENTRYPOINT ["/usr/local/bin/endpoint.sh"]

ENV LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH
RUN echo "$HADOOP_HOME/lib/native" >> /etc/ld.so.conf.d/hadoop.conf && ldconfig
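The image can also be built on its own when iterating on this Dockerfile; a small sketch, assuming it is run from the repository root so the build context is dev/integration (where the COPY sources live). The riffle-integration tag is only an illustrative name:

# Standalone build of the integration image, overriding the SPARK_VERSION build arg.
docker build -t riffle-integration --build-arg SPARK_VERSION=3.5.0 dev/integration/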

dev/integration/coordinator.conf

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
rss.coordinator.server.heartbeat.timeout=30000
rss.coordinator.app.expired=60000
rss.coordinator.server.heartbeat.interval=10000
rss.rpc.server.port=21000
rss.jetty.http.port=19995
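rss.rpc.server.port=21000 is the coordinator gRPC port that coordinator_quorum in riffle.conf.1/2 and spark.rss.coordinator.quorum in spark-defaults.conf point at, while rss.jetty.http.port=19995 is the HTTP port endpoint.sh polls for readiness. A quick liveness sketch, assuming it runs inside the container after the coordinator has started:

# Probe the two coordinator ports configured above.
nc -z localhost 21000 && echo "coordinator gRPC port reachable"
curl -sf http://localhost:19995/api/app/total && echo "coordinator HTTP API reachable"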

dev/integration/docker-compose.yml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
version: "3"

services:
  spark-sql-test:
    build:
      context: .
    volumes:
      - ~/.cargo/git:/root/.cargo/git:rw
      - ~/.cargo/registry:/root/.cargo/registry:rw
      - ./../../:/riffle:rw
      - ./../../target-docker:/riffle/target:rw
    # endpoint.sh is set as ENTRYPOINT in Dockerfile, so no command needed
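The mounts share the host's cargo caches and the repository with the container, while mapping a separate target-docker directory onto /riffle/target so builds inside the container do not clobber the host's own target directory. For interactive debugging, the image's ENTRYPOINT can be bypassed; a sketch, assuming the service name above:

# Open a shell in the test container instead of running endpoint.sh.
docker compose -f dev/integration/docker-compose.yml run --rm --entrypoint bash spark-sql-test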

dev/integration/endpoint.sh

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
#!/bin/bash
set -e

# Source environment variables
source ~/.bashrc

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

echo_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

echo_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Create necessary directories
echo_info "Creating necessary directories..."
mkdir -p /tmp/riffle-server-1/data
mkdir -p /tmp/riffle-server-2/data
mkdir -p ${UNIFFLE_HOME}/logs

# Start Uniffle Coordinator
echo_info "Starting Uniffle Coordinator..."
cd ${UNIFFLE_HOME}

# Check if coordinator is already running
if nc -z localhost 21000 2>/dev/null; then
    echo_warn "Coordinator already running on port 21000"
else
    echo_info "Starting coordinator..."
    nohup ./bin/start-coordinator.sh > logs/coordinator.log 2>&1 &
    COORDINATOR_PID=$!
    echo $COORDINATOR_PID > /tmp/uniffle-coordinator.pid
    echo_info "Coordinator started with PID: $COORDINATOR_PID"

    # Wait for coordinator to be ready
    echo_info "Waiting for coordinator to be ready..."
    for i in {1..30}; do
        if curl -f http://localhost:19995/api/app/total 2>/dev/null || nc -z localhost 21000 2>/dev/null; then
            echo_info "Coordinator is ready!"
            break
        fi
        if [ $i -eq 30 ]; then
            echo_error "Coordinator failed to start within timeout"
            echo_error "=== Coordinator logs (last 20 lines) ==="
            tail -20 logs/coordinator.log 2>/dev/null || echo "No log file found"
            exit 1
        fi
        echo "Waiting for coordinator... ($i/30)"
        sleep 2
    done
fi

# Build Riffle Server if not exists
if [ ! -f /riffle/target/release/riffle-server ]; then
    echo_info "Building Riffle Server..."
    cd /riffle
    cargo build --release
fi

WORKER_IP=127.0.0.1

# Start Riffle Server 1
echo_info "Starting Riffle Server 1..."
cd /tmp/riffle-server-1
cp ${RIFFLE_HOME}/conf/riffle.conf.1 config.toml
RUST_LOG=info nohup /riffle/target/release/riffle-server --config config.toml > server1.log 2>&1 &
RIFFLE_SERVER_1_PID=$!
echo $RIFFLE_SERVER_1_PID > /tmp/riffle-server-1.pid
echo_info "Riffle Server 1 started with PID: $RIFFLE_SERVER_1_PID"
sleep 5

# Start Riffle Server 2
echo_info "Starting Riffle Server 2..."
cd /tmp/riffle-server-2
cp ${RIFFLE_HOME}/conf/riffle.conf.2 config.toml
RUST_LOG=info nohup /riffle/target/release/riffle-server --config config.toml > server2.log 2>&1 &
RIFFLE_SERVER_2_PID=$!
echo $RIFFLE_SERVER_2_PID > /tmp/riffle-server-2.pid
echo_info "Riffle Server 2 started with PID: $RIFFLE_SERVER_2_PID"
sleep 5

# Verify Riffle Servers are running
echo_info "Verifying Riffle Servers..."
sleep 3
if curl -f http://localhost:19998/metrics >/dev/null 2>&1; then
    echo_info "Riffle Server 1 is running"
else
    echo_warn "Riffle Server 1 metrics not ready"
    echo_warn "=== Riffle Server 1 log (last 40 lines) ==="
    tail -40 /tmp/riffle-server-1/server1.log 2>/dev/null || echo_warn "No log file found"
    exit 1
fi

if curl -f http://localhost:19999/metrics >/dev/null 2>&1; then
    echo_info "Riffle Server 2 is running"
else
    echo_warn "Riffle Server 2 metrics not ready"
fi

# Create Spark SQL test script
echo_info "Creating Spark SQL test script..."
cat > /tmp/spark_basic.scala << 'EOF'
val data = sc.parallelize(1 to 100, 4)
val pairs = data.map(x => (x % 5, x))
val grouped = pairs.groupByKey()
val result = grouped.mapValues(_.sum).collect().sortBy(_._1)
result.foreach(println)
EOF

# Run Spark SQL Integration Test
echo_info "Running Spark SQL Integration Test..."
cd ${SPARK_HOME}

# Run spark-shell with the test script
if ./bin/spark-shell \
    --master local[1] \
    -i /tmp/spark_basic.scala; then
    echo_info "Spark SQL test completed successfully!"
    TEST_RESULT=0
else
    echo_error "Spark SQL test failed!"
    TEST_RESULT=1
fi

exit $TEST_RESULT
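The Scala snippet written to /tmp/spark_basic.scala forces a shuffle (groupByKey across 4 partitions); with spark-defaults.conf pointing spark.shuffle.manager at RssShuffleManager, that shuffle is routed through the Uniffle client to the coordinator-assigned riffle server instead of Spark's built-in shuffle. A follow-up check sketch, assuming the container is still up after the test:

# Inspect the riffle servers' metrics endpoints (the same ones the script polls)
# to confirm both servers stayed reachable during the run.
curl -s http://localhost:19998/metrics | head -n 20
curl -s http://localhost:19999/metrics | head -n 20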

dev/integration/riffle.conf.1

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
store_type = "MEMORY_LOCALFILE"
grpc_port = 21100
http_port = 19998
coordinator_quorum = ["localhost:21000"]
tags = ["riffle2", "datanode", "GRPC", "ss_v5"]

[memory_store]
capacity = "2G"
dashmap_shard_amount = 128

[localfile_store]
data_paths = ["/tmp/riffle-server-1/data"]
healthy_check_min_disks = 0
disk_max_concurrency = 2000

[hybrid_store]
memory_spill_high_watermark = 0.5
memory_spill_low_watermark = 0.2
memory_spill_max_concurrency = 1000

[runtime_config]
read_thread_num = 20
write_thread_num = 50
grpc_thread_num = 50
http_thread_num = 5
default_thread_num = 10
dispatch_thread_num = 5

dev/integration/riffle.conf.2

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
store_type = "MEMORY_LOCALFILE"
grpc_port = 21101
http_port = 19999
coordinator_quorum = ["localhost:21000"]
tags = ["riffle2", "datanode", "GRPC", "ss_v5"]

[memory_store]
capacity = "2G"
dashmap_shard_amount = 128

[localfile_store]
data_paths = ["/tmp/riffle-server-2/data"]
healthy_check_min_disks = 0
disk_max_concurrency = 2000

[hybrid_store]
memory_spill_high_watermark = 0.5
memory_spill_low_watermark = 0.2
memory_spill_max_concurrency = 1000

[runtime_config]
read_thread_num = 20
write_thread_num = 50
grpc_thread_num = 50
http_thread_num = 5
default_thread_num = 10
dispatch_thread_num = 5

dev/integration/spark-defaults.conf

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
spark.shuffle.manager org.apache.spark.shuffle.RssShuffleManager
spark.rss.coordinator.quorum localhost:21000
spark.rss.storage.type MEMORY_LOCALFILE
spark.executor.instances 2
spark.executor.cores 2
spark.executor.memory 2g
spark.sql.shuffle.partitions 4

spark.serializer = org.apache.spark.serializer.KryoSerializer

spark.rss.dynamicClientConf.enabled = false
spark.rss.client.type = GRPC

spark.rss.client.assignment.shuffle.nodes.max = 1
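These defaults are picked up automatically because the Dockerfile copies this file into ${SPARK_HOME}/conf. When running Spark outside the image, the same routing can be expressed with explicit --conf flags; a sketch using only keys already defined above:

# Command-line equivalent of the shuffle-related defaults.
${SPARK_HOME}/bin/spark-shell \
  --master local[1] \
  --conf spark.shuffle.manager=org.apache.spark.shuffle.RssShuffleManager \
  --conf spark.rss.coordinator.quorum=localhost:21000 \
  --conf spark.rss.storage.type=MEMORY_LOCALFILE \
  -i /tmp/spark_basic.scala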

0 commit comments

Comments
 (0)