Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions hudi-notebooks/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@

set -eux

export HUDI_VERSION=${HUDI_VERSION:-1.0.2}
export HUDI_VERSION=1.0.2
export HUDI_VERSION_TAG=${HUDI_VERSION}
export SPARK_VERSION=${SPARK_VERSION:-3.5.7}
export HIVE_VERSION=${HIVE_VERSION:-3.1.3}
export HIVE_VERSION_TAG=${HIVE_VERSION_TAG:-3.1.3}
export SPARK_VERSION=3.5.7
export HIVE_VERSION=3.1.3
export HIVE_VERSION_TAG=${HIVE_VERSION}

SCRIPT_DIR=$(cd $(dirname $0); pwd)

Expand All @@ -40,4 +40,4 @@ docker build \
--build-arg HIVE_VERSION="$HIVE_VERSION" \
-t apachehudi/hive:latest \
-t apachehudi/hive:"$HIVE_VERSION_TAG" \
-f "$SCRIPT_DIR"/Dockerfile.hive .
-f "$SCRIPT_DIR"/Dockerfile.hive .
6 changes: 3 additions & 3 deletions hudi-notebooks/notebooks/02-query-types.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@
"outputs": [],
"source": [
"table_name_cow = \"trips_table_cow\"\n",
"base_path = f\"s3a://warehouse/hudi-query-types/{table_name_cow}\"\n",
"base_path = f\"s3a://warehouse/hudi-query-types\"\n",
"\n",
"cow_hudi_conf = {\n",
" \"hoodie.table.name\": table_name_cow, # The name of our Hudi table.\n",
Expand Down Expand Up @@ -395,7 +395,7 @@
"source": [
"spark.read.format(\"hudi\") \\\n",
" .option(\"as.of.instant\", beginTime) \\\n",
" .load(base_path).createOrReplaceTempView(\"trips_time_travel\")"
" .load(f\"{base_path}/{table_name_cow}\").createOrReplaceTempView(\"trips_time_travel\")"
]
},
{
Expand Down Expand Up @@ -565,7 +565,7 @@
"outputs": [],
"source": [
"table_name_mor = \"trips_table_mor\"\n",
"base_path = f\"s3a://warehouse/hudi-query-types/{table_name_mor}\"\n",
"base_path = f\"s3a://warehouse/hudi-query-types\"\n",
"\n",
"mor_hudi_conf = {\n",
" \"hoodie.table.name\": table_name_mor,\n",
Expand Down
25 changes: 17 additions & 8 deletions hudi-notebooks/notebooks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def get_spark_session(app_name="Hudi-Notebooks"):

spark_session = SparkSession.builder \
.appName(app_name) \
.config("spark.hadoop.fs.defaultFS", "s3a://warehouse") \
.enableHiveSupport() \
.getOrCreate()

Expand All @@ -40,6 +41,9 @@ def get_spark_session(app_name="Hudi-Notebooks"):

return spark_session

# Initialize Spark globally so other functions can use it
spark = get_spark_session()

# S3 Utility Function
def ls(base_path):
"""
Expand All @@ -50,13 +54,18 @@ def ls(base_path):
if not base_path.startswith("s3a://"):
raise ValueError("Path must start with 's3a://'")
try:
parsed = urlparse(base_path)
bucket_name = parsed.netloc
prefix = parsed.path.lstrip("/")
s3_client = boto3.client('s3')
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
for obj in response['Contents']:
print(f"s3a://{bucket_name}/{obj['Key']}")
hadoop_conf = spark._jsc.hadoopConfiguration()
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(hadoop_conf)
p = spark._jvm.org.apache.hadoop.fs.Path(base_path)

if not fs.exists(p):
print(f"Path does not exist: {base_path}")
return []

status = fs.listStatus(p)
files = [str(file.getPath()) for file in status]
for f in files:
print(f)
except Exception as e:
print(f"Exception occurred while listing files from path {base_path}", e)

Expand Down Expand Up @@ -120,4 +129,4 @@ def display(df, num_rows=100):
"""

# Display the final HTML
display_html(HTML(custom_css + html_table))
display_html(HTML(custom_css + html_table))
Loading