emqx
diff --git a/‎spark/.pyiceberg.yaml‎
Lines changed: 24 additions & 0 deletions b/‎spark/.pyiceberg.yaml‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎spark/Dockerfile‎
Lines changed: 131 additions & 0 deletions b/‎spark/Dockerfile‎
Lines changed: 131 additions & 0 deletions
diff --git a/‎spark/entrypoint.sh‎
Lines changed: 28 additions & 0 deletions b/‎spark/entrypoint.sh‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎spark/ipython/startup/00-prettytables.py‎
Lines changed: 81 additions & 0 deletions b/‎spark/ipython/startup/00-prettytables.py‎
Lines changed: 81 additions & 0 deletions
diff --git a/‎spark/ipython/startup/README‎
Lines changed: 11 additions & 0 deletions b/‎spark/ipython/startup/README‎
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,24 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Catalog definitions; this file declares a single catalog named "default".
+catalog:
+    default:
+        # NOTE(review): "rest" and "minio" look like service hostnames that are
+        # resolved on the container network - confirm against the compose setup.
+        uri: http://rest:8181
+        s3.endpoint: http://minio:9000
+        # NOTE(review): plain-text, placeholder-looking credentials; presumably
+        # intended for a local demo environment only - confirm before reuse.
+        s3.access-key-id: admin
+        s3.secret-access-key: password
@@ -0,0 +1,131 @@
+# syntax=docker/dockerfile:1
+# NOTE: the parser directive above must stay on the very first line; once any
+# other comment or instruction has been read it is treated as a plain comment.
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Base image: Python 3.10 on Debian bullseye.
+FROM python:3.10-bullseye
+
+# OS-level packages (JDK 17, build tools, ssh, ...). The apt cache and the
+# package lists are removed within the same RUN instruction.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      sudo \
+      curl \
+      vim \
+      unzip \
+      openjdk-17-jdk \
+      build-essential \
+      software-properties-common \
+      ssh && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Jupyter and other python deps
+COPY requirements.txt .
+RUN pip3 install -r requirements.txt
+
+# Add scala kernel via spylon-kernel
+RUN python3 -m spylon_kernel install
+
+# Download and install IJava jupyter kernel
+# The release zip is unpacked in the current directory and its install.py is
+# run with --sys-prefix; only the zip itself is deleted afterwards.
+RUN curl https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3.0.zip -Lo ijava-1.3.0.zip \
+  && unzip ijava-1.3.0.zip \
+  && python3 install.py --sys-prefix \
+  && rm ijava-1.3.0.zip
+
+# Optional env variables
+# SPARK_HOME falls back to /opt/spark when it is not already set.
+# NOTE(review): later steps hard-code /opt/spark (tar --directory, jar targets,
+# PATH), so a different SPARK_HOME would only be partially honoured - confirm.
+ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
+# NOTE(review): the py4j zip name is version-pinned; presumably it has to match
+# the py4j shipped inside the Spark tarball - verify when bumping SPARK_VERSION.
+ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH
+
+# Relative paths in the following instructions resolve against SPARK_HOME.
+WORKDIR ${SPARK_HOME}
+
+# Versions interpolated into the download URLs and jar file names below.
+ENV SPARK_VERSION=3.5.5
+ENV SPARK_MAJOR_VERSION=3.5
+ENV ICEBERG_VERSION=1.8.1
+
+# Download spark
+# The tarball is unpacked into /opt/spark without its top-level directory
+# (--strip-components 1) and then deleted.
+RUN mkdir -p ${SPARK_HOME} \
+ && curl https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
+ && tar xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
+ && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz
+
+# Download iceberg spark runtime
+# The Scala 2.12 build of the runtime jar is placed in /opt/spark/jars.
+RUN curl https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR_VERSION}_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-${SPARK_MAJOR_VERSION}_2.12-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-spark-runtime-${SPARK_MAJOR_VERSION}_2.12-${ICEBERG_VERSION}.jar
+
+# Download AWS bundle
+RUN curl -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-aws-bundle-${ICEBERG_VERSION}.jar
+
+# Download GCP bundle
+RUN curl -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-gcp-bundle/${ICEBERG_VERSION}/iceberg-gcp-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-gcp-bundle-${ICEBERG_VERSION}.jar
+
+# Download Azure bundle
+RUN curl -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-azure-bundle/${ICEBERG_VERSION}/iceberg-azure-bundle-${ICEBERG_VERSION}.jar -Lo /opt/spark/jars/iceberg-azure-bundle-${ICEBERG_VERSION}.jar
+
+# Install AWS CLI
+# The installer bundle is selected from the build architecture: `uname -m`
+# yields x86_64 or aarch64, which are the two Linux bundles AWS publishes.
+# A hard-coded x86_64 bundle does not run on arm64 images.
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
+ && unzip awscliv2.zip \
+ && sudo ./aws/install \
+ && rm awscliv2.zip \
+ && rm -rf aws/
+
+# Add iceberg spark runtime jar to IJava classpath
+# (the wildcard covers every jar placed in /opt/spark/jars, not just Iceberg's)
+ENV IJAVA_CLASSPATH=/opt/spark/jars/*
+
+# Download sample datasets into /home/iceberg/data: the NYC film permits JSON
+# and the monthly yellow-taxi trip parquet files from 2021-04 through 2022-04.
+RUN mkdir -p /home/iceberg/data \
+ && curl https://data.cityofnewyork.us/resource/tg4x-b46p.json > /home/iceberg/data/nyc_film_permits.json \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-04.parquet -o /home/iceberg/data/yellow_tripdata_2022-04.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-03.parquet -o /home/iceberg/data/yellow_tripdata_2022-03.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet -o /home/iceberg/data/yellow_tripdata_2022-02.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -o /home/iceberg/data/yellow_tripdata_2022-01.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-12.parquet -o /home/iceberg/data/yellow_tripdata_2021-12.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-11.parquet -o /home/iceberg/data/yellow_tripdata_2021-11.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-10.parquet -o /home/iceberg/data/yellow_tripdata_2021-10.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-09.parquet -o /home/iceberg/data/yellow_tripdata_2021-09.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-08.parquet -o /home/iceberg/data/yellow_tripdata_2021-08.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-07.parquet -o /home/iceberg/data/yellow_tripdata_2021-07.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-06.parquet -o /home/iceberg/data/yellow_tripdata_2021-06.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-05.parquet -o /home/iceberg/data/yellow_tripdata_2021-05.parquet \
+ && curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-04.parquet -o /home/iceberg/data/yellow_tripdata_2021-04.parquet
+
+# Working directories under /home/iceberg, followed by the bundled notebooks.
+RUN mkdir -p /home/iceberg/localwarehouse /home/iceberg/notebooks /home/iceberg/warehouse /home/iceberg/spark-events /home/iceberg
+COPY notebooks/ /home/iceberg/notebooks
+
+# Add a notebook command
+# The generated /bin/notebook script runs pyspark with jupyter-notebook as the
+# driver Python, serving /home/iceberg/notebooks on port 8888 with the token
+# and password both set to empty strings.
+RUN echo '#! /bin/sh' >> /bin/notebook \
+ && echo 'export PYSPARK_DRIVER_PYTHON=jupyter-notebook' >> /bin/notebook \
+ && echo "export PYSPARK_DRIVER_PYTHON_OPTS=\"--notebook-dir=/home/iceberg/notebooks --ip='*' --NotebookApp.token='' --NotebookApp.password='' --port=8888 --no-browser --allow-root\"" >> /bin/notebook \
+ && echo "pyspark" >> /bin/notebook \
+ && chmod u+x /bin/notebook
+
+# Add a pyspark-notebook command (alias for notebook command for backwards-compatibility)
+# The script content is the same as /bin/notebook, written to a second file.
+RUN echo '#! /bin/sh' >> /bin/pyspark-notebook \
+ && echo 'export PYSPARK_DRIVER_PYTHON=jupyter-notebook' >> /bin/pyspark-notebook \
+ && echo "export PYSPARK_DRIVER_PYTHON_OPTS=\"--notebook-dir=/home/iceberg/notebooks --ip='*' --NotebookApp.token='' --NotebookApp.password='' --port=8888 --no-browser --allow-root\"" >> /bin/pyspark-notebook \
+ && echo "pyspark" >> /bin/pyspark-notebook \
+ && chmod u+x /bin/pyspark-notebook
+
+# IPython startup files for root's default profile (provides the sql magic).
+RUN mkdir -p /root/.ipython/profile_default/startup
+COPY ipython/startup/00-prettytables.py /root/.ipython/profile_default/startup
+COPY ipython/startup/README /root/.ipython/profile_default/startup
+
+# Spark configuration, and Spark's bin/sbin directories on PATH so the
+# start-*.sh scripts and pyspark can be invoked by name.
+COPY spark-defaults.conf /opt/spark/conf
+ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
+
+RUN chmod u+x /opt/spark/sbin/* && \
+    chmod u+x /opt/spark/bin/*
+
+COPY .pyiceberg.yaml /root/.pyiceberg.yaml
+
+# entrypoint.sh is copied into the current WORKDIR, which is why the
+# ENTRYPOINT below can reference it with a relative path.
+COPY entrypoint.sh .
+
+# "notebook" is the default argument handed to the entrypoint script.
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["notebook"]
@@ -0,0 +1,28 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Launch the Spark daemons in order: master on port 7077, a worker that
+# registers with that master, the history server, then the thrift server.
+# NOTE(review): the worker URL assumes this container is reachable under the
+# hostname "spark-iceberg" - confirm against the compose/service definition.
+start-master.sh -p 7077
+start-worker.sh spark://spark-iceberg:7077
+start-history-server.sh
+# The thrift server's driver JVM is started with derby.system.home=/tmp/derby.
+start-thriftserver.sh  --driver-java-options "-Dderby.system.home=/tmp/derby"
+
+# Entrypoint, for example notebook, pyspark or spark-sql
+# NOTE(review): only the first argument is evaluated (as a shell command
+# string); any further arguments passed to the container are ignored.
+if [[ $# -gt 0 ]] ; then
+    eval "$1"
+fi
@@ -0,0 +1,81 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from prettytable import PrettyTable
+from IPython.core.magic import register_line_cell_magic
+
+class DFTable(PrettyTable):
+    def __repr__(self):
+        return self.get_string()
+
+    def _repr_html_(self):
+        return self.get_html_string()
+
+def _row_as_table(df):
+    cols = df.columns
+
+    t = DFTable()
+    t.field_names = ["Column", "Value"]
+    t.align = "r"
+    row = df.limit(1).collect()[0].asDict()
+    for col in cols:
+        t.add_row([ col, row[col] ])
+
+    return t
+
+def _to_table(df, num_rows=100):
+    cols = df.columns
+
+    t = DFTable()
+    t.field_names = cols
+    t.align = "r"
+    for row in df.limit(num_rows).collect():
+        d = row.asDict()
+        t.add_row([ d[col] for col in cols ])
+
+    return t
+
+import re
+import sys
+from argparse import ArgumentParser
+parser = ArgumentParser()
+parser.add_argument("--limit", help="Number of lines to return", type=int, default=100)
+parser.add_argument("--var", help="Variable name to hold the dataframe", type=str)
+
+@register_line_cell_magic
+def sql(line, cell=None):
+    """Spark SQL magic
+    """
+    from pyspark.sql import SparkSession
+    spark = SparkSession.builder.appName("Jupyter").getOrCreate()
+    if cell is None:
+        return _to_table(spark.sql(line))
+    elif line:
+        df = spark.sql(cell)
+
+        (args, others) = parser.parse_known_args([ arg for arg in re.split("\s+", line) if arg ])
+
+        if args.var:
+            setattr(sys.modules[__name__], args.var, df)
+
+        if args.limit == 1:
+            return _row_as_table(df)
+        else:
+            return _to_table(df, num_rows=args.limit)
+    else:
+        return _to_table(spark.sql(cell))
@@ -0,0 +1,11 @@
+This is the IPython startup directory
+
+.py and .ipy files in this directory will be run *prior* to any code or files specified
+via the exec_lines or exec_files configurables whenever you load this profile.
+
+Files will be run in lexicographical order, so you can control the execution order of files
+with a prefix, e.g.::
+
+    00-first.py
+    50-middle.py
+    99-last.ipy