
Commit d7e511f

feat(ci)(1/N): Introduce integration tests with uniffle + spark (#531)
1 parent 2aaa8a5 commit d7e511f

8 files changed, +368 -0 lines changed

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
name: Spark SQL Integration Test

on:
  push:
    branches:
      - master
  pull_request:
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
  spark-sql-integration-test:
    runs-on: ubuntu-22.04
    timeout-minutes: 60

    steps:
      - name: Remove unnecessary files
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Run Spark SQL Test
        run: docker compose -f dev/integration/docker-compose.yml up
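For local debugging, the same test can be run outside CI with Docker Compose; a minimal sketch, assuming Compose v2 and the spark-sql-test service name from dev/integration/docker-compose.yml. The --exit-code-from flag makes the command return the test container's exit status (set by endpoint.sh), so a failed run also fails the shell command:

# Build the image and run the integration test locally (not part of this commit).
docker compose -f dev/integration/docker-compose.yml up --build --exit-code-from spark-sql-test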

dev/integration/Dockerfile

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
FROM centos:7

# install common tools
RUN echo "sslverify=false" >> /etc/yum.conf
RUN sed -i "s/mirror.centos.org/vault.centos.org/g" /etc/yum.repos.d/*.repo
RUN sed -i "s/^#.*baseurl=http/baseurl=https/g" /etc/yum.repos.d/*.repo
RUN sed -i "s/^mirrorlist/#mirrorlist/g" /etc/yum.repos.d/*.repo
RUN yum update -y
RUN yum install -y centos-release-scl epel-release
RUN sed -i "s/mirror.centos.org/vault.centos.org/g" /etc/yum.repos.d/*.repo
RUN sed -i "s/^#.*baseurl=http/baseurl=https/g" /etc/yum.repos.d/*.repo
RUN sed -i "s/^mirrorlist/#mirrorlist/g" /etc/yum.repos.d/*.repo
RUN yum install -y libzip unzip wget cmake3 openssl-devel llvm clang-devel clang krb5-workstation clang-devel git gcc

# install gcc-11
RUN yum install -y devtoolset-11-gcc devtoolset-11-gcc-c++
RUN echo '. /opt/rh/devtoolset-11/enable' >> ~/.bashrc

RUN yum install -y llvm-toolset-7
RUN echo '. /opt/rh/llvm-toolset-7/enable' >> ~/.bashrc

# install rust nightly toolchain
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain nightly-2025-06-01
ENV PATH="/root/.cargo/bin:${PATH}"
RUN rustc --version

# install java
RUN yum install -y java-1.8.0-openjdk java-1.8.0-openjdk-devel
ENV JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk"
RUN echo "export JAVA_HOME=${JAVA_HOME}" >> ~/.bashrc

# install maven
# RUN yum install -y rh-maven35
# RUN echo 'source /opt/rh/rh-maven35/enable' >> ~/.bashrc

# install protoc
RUN wget -O /protobuf-21.7-linux-x86_64.zip https://github.com/protocolbuffers/protobuf/releases/download/v21.7/protoc-21.7-linux-x86_64.zip
RUN mkdir /protobuf-bin && (cd /protobuf-bin && unzip /protobuf-21.7-linux-x86_64.zip)
ENV PATH="/protobuf-bin/bin:${PATH}"
RUN echo 'export PATH="/protobuf-bin/bin:$PATH"' >> ~/.bashrc

# attach libjvm.so
RUN echo 'export LD_LIBRARY_PATH=${JAVA_HOME}/jre/lib/amd64/server:${LD_LIBRARY_PATH}' >> ~/.bashrc

# setup hadoop env
RUN curl -LsSf https://dlcdn.apache.org/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz | tar zxf - -C /root
ENV HADOOP_HOME=/root/hadoop-3.3.5
RUN echo "export HADOOP_HOME=${HADOOP_HOME}" >> ~/.bashrc
RUN echo "export CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath --glob)" >> ~/.bashrc
RUN echo "export HDRS_NAMENODE=default" >> ~/.bashrc
RUN echo "export HDRS_WORKDIR=/tmp/hdrs/" >> ~/.bashrc

# install python and pip
RUN yum install -y python3 python3-pip
RUN pip3 install --upgrade pip

# install netcat for port checking
RUN yum install -y nc

# install spark
ARG SPARK_VERSION=3.5.0
RUN cd /opt && \
    wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    mv spark-${SPARK_VERSION}-bin-hadoop3 spark && \
    rm spark-${SPARK_VERSION}-bin-hadoop3.tgz
ENV SPARK_HOME=/opt/spark
ENV PATH="${SPARK_HOME}/bin:${PATH}"

# install uniffle coordinator (must be before configuring it)
ARG UNIFFLE_VERSION=0.10.0
RUN cd /opt && \
    wget -q https://archive.apache.org/dist/uniffle/${UNIFFLE_VERSION}/apache-uniffle-${UNIFFLE_VERSION}-bin.tar.gz && \
    tar zxvf apache-uniffle-${UNIFFLE_VERSION}-bin.tar.gz && mv apache-uniffle-${UNIFFLE_VERSION}-hadoop2.8 uniffle && \
    rm -f apache-uniffle-${UNIFFLE_VERSION}-bin.tar.gz
ENV UNIFFLE_HOME=/opt/uniffle
ENV PATH="${UNIFFLE_HOME}/bin:${PATH}"

# configure uniffle coordinator (after UNIFFLE_HOME is set and uniffle is installed)
RUN mkdir -p ${UNIFFLE_HOME}/conf
COPY coordinator.conf ${UNIFFLE_HOME}/conf/coordinator.conf

# download uniffle spark client
RUN mkdir -p ${SPARK_HOME}/jars && \
    wget -q https://repo1.maven.org/maven2/org/apache/uniffle/rss-client-spark3-shaded/${UNIFFLE_VERSION}/rss-client-spark3-shaded-${UNIFFLE_VERSION}.jar \
        -O ${SPARK_HOME}/jars/rss-client.jar || \
    echo "Failed to download Uniffle Spark Client"

# setup riffle home directory
ENV RIFFLE_HOME=/opt/riffle
RUN mkdir -p ${RIFFLE_HOME}/conf

# configure riffle servers (place configs in RIFFLE_HOME/conf)
COPY riffle.conf.1 ${RIFFLE_HOME}/conf/riffle.conf.1
COPY riffle.conf.2 ${RIFFLE_HOME}/conf/riffle.conf.2

# configure spark to use riffle
COPY spark-defaults.conf ${SPARK_HOME}/conf/spark-defaults.conf

# copy endpoint script
COPY endpoint.sh /usr/local/bin/endpoint.sh
RUN chmod +x /usr/local/bin/endpoint.sh

# setup environment variables
RUN echo "export SPARK_HOME=${SPARK_HOME}" >> ~/.bashrc && \
    echo "export UNIFFLE_HOME=${UNIFFLE_HOME}" >> ~/.bashrc && \
    echo "export RIFFLE_HOME=${RIFFLE_HOME}" >> ~/.bashrc && \
    echo "export PYTHONPATH=\${SPARK_HOME}/python:\${SPARK_HOME}/python/lib/py4j-*.zip:\${PYTHONPATH}" >> ~/.bashrc

RUN echo "export RUST_BACKTRACE=1" >> ~/.bashrc

# Set entrypoint (no WORKDIR needed as endpoint.sh handles paths)
ENTRYPOINT ["/usr/local/bin/endpoint.sh"]

ENV LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH
RUN echo "$HADOOP_HOME/lib/native" >> /etc/ld.so.conf.d/hadoop.conf && ldconfig
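The image can also be built on its own when iterating on this Dockerfile; a small sketch, assuming it is run from the repository root so the build context is dev/integration (where the COPY sources live). The riffle-integration tag is only an illustrative name:

# Standalone build of the integration image, overriding the SPARK_VERSION build arg.
docker build -t riffle-integration --build-arg SPARK_VERSION=3.5.0 dev/integration/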

dev/integration/coordinator.conf

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
rss.coordinator.server.heartbeat.timeout=30000
rss.coordinator.app.expired=60000
rss.coordinator.server.heartbeat.interval=10000
rss.rpc.server.port=21000
rss.jetty.http.port=19995
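rss.rpc.server.port=21000 is the coordinator gRPC port that coordinator_quorum in riffle.conf.1/2 and spark.rss.coordinator.quorum in spark-defaults.conf point at, while rss.jetty.http.port=19995 is the HTTP port endpoint.sh polls for readiness. A quick liveness sketch, assuming it runs inside the container after the coordinator has started:

# Probe the two coordinator ports configured above.
nc -z localhost 21000 && echo "coordinator gRPC port reachable"
curl -sf http://localhost:19995/api/app/total && echo "coordinator HTTP API reachable"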

dev/integration/docker-compose.yml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
version: "3"

services:
  spark-sql-test:
    build:
      context: .
    volumes:
      - ~/.cargo/git:/root/.cargo/git:rw
      - ~/.cargo/registry:/root/.cargo/registry:rw
      - ./../../:/riffle:rw
      - ./../../target-docker:/riffle/target:rw
    # endpoint.sh is set as ENTRYPOINT in Dockerfile, so no command needed
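The mounts share the host's cargo caches and the repository with the container, while mapping a separate target-docker directory onto /riffle/target so builds inside the container do not clobber the host's own target directory. For interactive debugging, the image's ENTRYPOINT can be bypassed; a sketch, assuming the service name above:

# Open a shell in the test container instead of running endpoint.sh.
docker compose -f dev/integration/docker-compose.yml run --rm --entrypoint bash spark-sql-test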

dev/integration/endpoint.sh

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
#!/bin/bash
set -e

# Source environment variables
source ~/.bashrc

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo_info() {
    echo -e "${GREEN}[INFO]${NC} $1"
}

echo_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}

echo_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}

# Create necessary directories
echo_info "Creating necessary directories..."
mkdir -p /tmp/riffle-server-1/data
mkdir -p /tmp/riffle-server-2/data
mkdir -p ${UNIFFLE_HOME}/logs

# Start Uniffle Coordinator
echo_info "Starting Uniffle Coordinator..."
cd ${UNIFFLE_HOME}

# Check if coordinator is already running
if nc -z localhost 21000 2>/dev/null; then
    echo_warn "Coordinator already running on port 21000"
else
    echo_info "Starting coordinator..."
    nohup ./bin/start-coordinator.sh > logs/coordinator.log 2>&1 &
    COORDINATOR_PID=$!
    echo $COORDINATOR_PID > /tmp/uniffle-coordinator.pid
    echo_info "Coordinator started with PID: $COORDINATOR_PID"

    # Wait for coordinator to be ready
    echo_info "Waiting for coordinator to be ready..."
    for i in {1..30}; do
        if curl -f http://localhost:19995/api/app/total 2>/dev/null || nc -z localhost 21000 2>/dev/null; then
            echo_info "Coordinator is ready!"
            break
        fi
        if [ $i -eq 30 ]; then
            echo_error "Coordinator failed to start within timeout"
            echo_error "=== Coordinator logs (last 20 lines) ==="
            tail -20 logs/coordinator.log 2>/dev/null || echo "No log file found"
            exit 1
        fi
        echo "Waiting for coordinator... ($i/30)"
        sleep 2
    done
fi

# Build Riffle Server if not exists
if [ ! -f /riffle/target/release/riffle-server ]; then
    echo_info "Building Riffle Server..."
    cd /riffle
    cargo build --release
fi

WORKER_IP=127.0.0.1

# Start Riffle Server 1
echo_info "Starting Riffle Server 1..."
cd /tmp/riffle-server-1
cp ${RIFFLE_HOME}/conf/riffle.conf.1 config.toml
RUST_LOG=info nohup /riffle/target/release/riffle-server --config config.toml > server1.log 2>&1 &
RIFFLE_SERVER_1_PID=$!
echo $RIFFLE_SERVER_1_PID > /tmp/riffle-server-1.pid
echo_info "Riffle Server 1 started with PID: $RIFFLE_SERVER_1_PID"
sleep 5

# Start Riffle Server 2
echo_info "Starting Riffle Server 2..."
cd /tmp/riffle-server-2
cp ${RIFFLE_HOME}/conf/riffle.conf.2 config.toml
RUST_LOG=info nohup /riffle/target/release/riffle-server --config config.toml > server2.log 2>&1 &
RIFFLE_SERVER_2_PID=$!
echo $RIFFLE_SERVER_2_PID > /tmp/riffle-server-2.pid
echo_info "Riffle Server 2 started with PID: $RIFFLE_SERVER_2_PID"
sleep 5

# Verify Riffle Servers are running
echo_info "Verifying Riffle Servers..."
sleep 3
if curl -f http://localhost:19998/metrics >/dev/null 2>&1; then
    echo_info "Riffle Server 1 is running"
else
    echo_warn "Riffle Server 1 metrics not ready"
    echo_warn "=== Riffle Server 1 log (last 40 lines) ==="
    tail -40 /tmp/riffle-server-1/server1.log 2>/dev/null || echo_warn "No log file found"
    exit 1
fi

if curl -f http://localhost:19999/metrics >/dev/null 2>&1; then
    echo_info "Riffle Server 2 is running"
else
    echo_warn "Riffle Server 2 metrics not ready"
fi

# Create Spark SQL test script
echo_info "Creating Spark SQL test script..."
cat > /tmp/spark_basic.scala << 'EOF'
val data = sc.parallelize(1 to 100, 4)
val pairs = data.map(x => (x % 5, x))
val grouped = pairs.groupByKey()
val result = grouped.mapValues(_.sum).collect().sortBy(_._1)
result.foreach(println)
EOF

# Run Spark SQL Integration Test
echo_info "Running Spark SQL Integration Test..."
cd ${SPARK_HOME}

# Run spark-shell with the test script
if ./bin/spark-shell \
    --master local[1] \
    -i /tmp/spark_basic.scala; then
    echo_info "Spark SQL test completed successfully!"
    TEST_RESULT=0
else
    echo_error "Spark SQL test failed!"
    TEST_RESULT=1
fi

exit $TEST_RESULT
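The Scala snippet written to /tmp/spark_basic.scala forces a shuffle (groupByKey across 4 partitions); with spark-defaults.conf pointing spark.shuffle.manager at RssShuffleManager, that shuffle is routed through the Uniffle client to the coordinator-assigned riffle server instead of Spark's built-in shuffle. A follow-up check sketch, assuming the container is still up after the test:

# Inspect the riffle servers' metrics endpoints (the same ones the script polls)
# to confirm both servers stayed reachable during the run.
curl -s http://localhost:19998/metrics | head -n 20
curl -s http://localhost:19999/metrics | head -n 20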

dev/integration/riffle.conf.1

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
store_type = "MEMORY_LOCALFILE"
grpc_port = 21100
http_port = 19998
coordinator_quorum = ["localhost:21000"]
tags = ["riffle2", "datanode", "GRPC", "ss_v5"]

[memory_store]
capacity = "2G"
dashmap_shard_amount = 128

[localfile_store]
data_paths = ["/tmp/riffle-server-1/data"]
healthy_check_min_disks = 0
disk_max_concurrency = 2000

[hybrid_store]
memory_spill_high_watermark = 0.5
memory_spill_low_watermark = 0.2
memory_spill_max_concurrency = 1000

[runtime_config]
read_thread_num = 20
write_thread_num = 50
grpc_thread_num = 50
http_thread_num = 5
default_thread_num = 10
dispatch_thread_num = 5

dev/integration/riffle.conf.2

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
store_type = "MEMORY_LOCALFILE"
grpc_port = 21101
http_port = 19999
coordinator_quorum = ["localhost:21000"]
tags = ["riffle2", "datanode", "GRPC", "ss_v5"]

[memory_store]
capacity = "2G"
dashmap_shard_amount = 128

[localfile_store]
data_paths = ["/tmp/riffle-server-2/data"]
healthy_check_min_disks = 0
disk_max_concurrency = 2000

[hybrid_store]
memory_spill_high_watermark = 0.5
memory_spill_low_watermark = 0.2
memory_spill_max_concurrency = 1000

[runtime_config]
read_thread_num = 20
write_thread_num = 50
grpc_thread_num = 50
http_thread_num = 5
default_thread_num = 10
dispatch_thread_num = 5

dev/integration/spark-defaults.conf

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
spark.shuffle.manager org.apache.spark.shuffle.RssShuffleManager
spark.rss.coordinator.quorum localhost:21000
spark.rss.storage.type MEMORY_LOCALFILE
spark.executor.instances 2
spark.executor.cores 2
spark.executor.memory 2g
spark.sql.shuffle.partitions 4

spark.serializer = org.apache.spark.serializer.KryoSerializer

spark.rss.dynamicClientConf.enabled = false
spark.rss.client.type = GRPC

spark.rss.client.assignment.shuffle.nodes.max = 1
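These defaults are picked up automatically because the Dockerfile copies this file into ${SPARK_HOME}/conf. When running Spark outside the image, the same routing can be expressed with explicit --conf flags; a sketch using only keys already defined above:

# Command-line equivalent of the shuffle-related defaults.
${SPARK_HOME}/bin/spark-shell \
  --master local[1] \
  --conf spark.shuffle.manager=org.apache.spark.shuffle.RssShuffleManager \
  --conf spark.rss.coordinator.quorum=localhost:21000 \
  --conf spark.rss.storage.type=MEMORY_LOCALFILE \
  -i /tmp/spark_basic.scala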

0 commit comments

Comments
 (0)