Skip to content

Commit 15cea15

Browse files
committed
[docs] Support HDFS as Remote Storage and Lake Warehouse for Quickstart
1 parent f0a13fe commit 15cea15

File tree

1 file changed

+110
-30
lines changed

1 file changed

+110
-30
lines changed

website/docs/quickstart/flink-iceberg.md

Lines changed: 110 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -61,23 +61,55 @@ services:
6161
restart: always
6262
image: zookeeper:3.9.2
6363

64+
namenode:
65+
image: apache/hadoop:3.3.6
66+
hostname: namenode
67+
user: root
68+
command: [ "hdfs", "namenode" ]
69+
ports:
70+
- 9870:9870
71+
- 8020:8020
72+
environment:
73+
ENSURE_NAMENODE_DIR: "/tmp/hadoop/dfs/name"
74+
CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020
75+
CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp
76+
HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020
77+
HDFS-SITE.XML_dfs.replication: 1
78+
HDFS-SITE.XML_dfs.permissions.enabled: false
79+
HDFS-SITE.XML_dfs.datanode.address: datanode:9866
80+
81+
datanode:
82+
image: apache/hadoop:3.3.6
83+
user: root
84+
command: [ "hdfs", "datanode" ]
85+
environment:
86+
CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020
87+
CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp
88+
HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020
89+
HDFS-SITE.XML_dfs.replication: 1
90+
HDFS-SITE.XML_dfs.permissions.enabled: false
91+
HDFS-SITE.XML_dfs.datanode.address: datanode:9866
92+
depends_on:
93+
- namenode
94+
6495
coordinator-server:
6596
image: fluss/fluss:$FLUSS_DOCKER_VERSION$
6697
depends_on:
6798
- zookeeper
99+
- namenode
100+
- datanode
68101
environment:
69102
- |
70103
FLUSS_PROPERTIES=
71104
zookeeper.address: zookeeper:2181
72105
bind.listeners: FLUSS://coordinator-server:9123
73-
remote.data.dir: /tmp/fluss/remote-data
106+
remote.data.dir: hdfs://namenode:8020/fluss-data
74107
datalake.format: iceberg
75108
datalake.iceberg.type: hadoop
76-
datalake.iceberg.warehouse: /tmp/iceberg
109+
datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake
77110
volumes:
78-
- shared-tmpfs:/tmp/iceberg
79111
- ./lib:/tmp/lib
80-
entrypoint: ["sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer"]
112+
entrypoint: [ "sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer" ]
81113

82114
tablet-server:
83115
image: fluss/fluss:$FLUSS_DOCKER_VERSION$
@@ -90,13 +122,11 @@ services:
90122
zookeeper.address: zookeeper:2181
91123
bind.listeners: FLUSS://tablet-server:9123
92124
data.dir: /tmp/fluss/data
93-
remote.data.dir: /tmp/fluss/remote-data
94-
kv.snapshot.interval: 0s
125+
remote.data.dir: hdfs://namenode:8020/fluss-data
126+
kv.snapshot.interval: 10s
95127
datalake.format: iceberg
96128
datalake.iceberg.type: hadoop
97-
datalake.iceberg.warehouse: /tmp/iceberg
98-
volumes:
99-
- shared-tmpfs:/tmp/iceberg
129+
datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake
100130
101131
jobmanager:
102132
image: fluss/quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$
@@ -107,8 +137,6 @@ services:
107137
- |
108138
FLINK_PROPERTIES=
109139
jobmanager.rpc.address: jobmanager
110-
volumes:
111-
- shared-tmpfs:/tmp/iceberg
112140
113141
taskmanager:
114142
image: fluss/quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$
@@ -122,15 +150,6 @@ services:
122150
taskmanager.numberOfTaskSlots: 10
123151
taskmanager.memory.process.size: 2048m
124152
taskmanager.memory.framework.off-heap.size: 256m
125-
volumes:
126-
- shared-tmpfs:/tmp/iceberg
127-
128-
volumes:
129-
shared-tmpfs:
130-
driver: local
131-
driver_opts:
132-
type: "tmpfs"
133-
device: "tmpfs"
134153
```
135154
136155
The Docker Compose environment consists of the following containers:
@@ -367,6 +386,60 @@ The following SQL query should return an empty result.
367386
SELECT * FROM fluss_customer WHERE `cust_key` = 1;
368387
```
369388

389+
### Storage
390+
391+
Finally, you can use the following command to view the Fluss KV snapshots stored in Fluss remote storage:
392+
```shell
393+
docker compose exec namenode hdfs dfs -ls -R /fluss-data/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
394+
```
395+
396+
**Sample Output:**
397+
```shell
398+
hdfs://namenode:8020/fluss-data
399+
└── kv
400+
└── fluss
401+
├── enriched_orders-3
402+
│ └── 0
403+
│ ├── shared
404+
│ │ ├── 0836f202-bdcd-498b-a94a-0520beb3d7ea
405+
│ │ ├── afefc29f-d8d3-4cdb-a496-a6c271ddfac0
406+
│ │ └── b67bd402-2ad4-4305-bd36-4fadf08a5200
407+
│ └── snap-2
408+
│ ├── _METADATA
409+
│ ├── 02f02528-af03-4c88-980c-ec9f878d5476
410+
│ ├── 7b21a889-ab06-4b74-98a5-36b542a67d0d
411+
│ └── d7b699d9-6547-49fc-b579-de84cc37a167
412+
├── fluss_customer-1
413+
│ └── 0
414+
│ ├── shared
415+
│ │ └── e537da08-ad8c-478d-9b80-505616e481b9
416+
│ └── snap-0
417+
│ ├── _METADATA
418+
│ ├── 34a2dea2-8079-483f-b6d8-003a6e01bd3f
419+
│ ├── b196e58d-7df2-4c95-b4fb-b1c3ebb4c622
420+
│ └── f6ff8a01-d7dc-451d-a0e7-716392808405
421+
├── fluss_nation-2
422+
│ └── 0
423+
│ ├── shared
424+
│ │ └── fd7c1369-fa5d-4396-9610-216f0affb213
425+
│ └── snap-0
426+
│ ├── _METADATA
427+
│ ├── 0808acec-65d4-451c-b8fd-2225f045ad6d
428+
│ ├── 181f67c8-0620-4e9a-9367-2a1c774b4abd
429+
│ └── 5690e21f-3322-486c-93f5-5669d595cf34
430+
└── fluss_order-0
431+
└── 0
432+
├── shared
433+
│ ├── 29affd1e-ca99-4cc5-a855-61d3c1b2c9a5
434+
│ ├── 546a6113-0a63-4a17-964c-d8e37c32acc0
435+
│ └── 6d9b6ddb-a2c1-4746-aedd-3217a9f51686
436+
└── snap-2
437+
├── _METADATA
438+
├── 3fb7b4e9-9d66-44b6-b846-dba77514a1c7
439+
├── 450997ac-8323-4708-a4ac-6bb2e71834a7
440+
└── f2b4b230-41e9-41dd-9227-337e324460a6
441+
```
442+
370443
## Integrate with Iceberg
371444
### Start the Lakehouse Tiering Service
372445
To integrate with [Apache Iceberg](https://iceberg.apache.org/), you need to start the `Lakehouse Tiering Service`.
@@ -378,7 +451,7 @@ docker compose exec jobmanager \
378451
--fluss.bootstrap.servers coordinator-server:9123 \
379452
--datalake.format iceberg \
380453
--datalake.iceberg.type hadoop \
381-
--datalake.iceberg.warehouse /tmp/iceberg
454+
--datalake.iceberg.warehouse hdfs://namenode:8020/fluss-lake
382455
```
383456
You should see a Flink Job to tier data from Fluss to Iceberg running in the [Flink Web UI](http://localhost:8083/).
384457

@@ -501,20 +574,27 @@ SELECT sum(total_price) as sum_price FROM datalake_enriched_orders;
501574

502575
You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time.
503576

504-
Finally, you can use the following command to view the files stored in Iceberg:
577+
### Storage
578+
579+
Finally, you can use the following command to view the files stored in the Iceberg Hadoop warehouse:
505580
```shell
506-
docker compose exec taskmanager tree /tmp/iceberg/fluss
581+
docker compose exec namenode hdfs dfs -ls -R /fluss-lake/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
507582
```
508583

509584
**Sample Output:**
510585
```shell
511-
/tmp/iceberg/fluss
512-
└── datalake_enriched_orders
513-
├── data
514-
│ └── 00000-0-abc123.parquet
515-
└── metadata
516-
├── snap-1234567890123456789-1-abc123.avro
517-
└── v1.metadata.json
586+
hdfs://namenode:8020/fluss-lake
587+
└── fluss
588+
└── datalake_enriched_orders
589+
├── data
590+
│ └── __bucket=0
591+
│ └── 00000-0-3ff95845-47af-456f-83e0-8411576cfffe-00001.parquet
592+
└── metadata
593+
├── 528ae521-d683-4c5e-8dd7-779a83dd9c6f-m0.avro
594+
├── snap-3496049107217731071-1-528ae521-d683-4c5e-8dd7-779a83dd9c6f.avro
595+
├── v1.metadata.json
596+
├── v2.metadata.json
597+
└── version-hint.text
518598
```
519599
The files adhere to Iceberg's standard format, enabling seamless querying with other engines such as [Spark](https://iceberg.apache.org/docs/latest/spark-queries/) and [Trino](https://trino.io/docs/current/connector/iceberg.html).
520600

0 commit comments

Comments (0)