
Commit 1a157af

[docs] Support HDFS as Remote Storage and Lake Warehouse for Quickstart

1 parent 453d64b commit 1a157af
File tree

1 file changed: +169 -80 lines changed

website/docs/quickstart/lakehouse.md

Lines changed: 169 additions & 80 deletions
@@ -37,23 +37,61 @@ cd fluss-quickstart-paimon
 
 ```yaml
 services:
+  #begin Hadoop cluster
+  namenode:
+    image: apache/hadoop:3.3.6
+    hostname: namenode
+    user: root
+    command: [ "hdfs", "namenode" ]
+    ports:
+      - 9870:9870
+      - 8020:8020
+    environment:
+      ENSURE_NAMENODE_DIR: "/tmp/hadoop/dfs/name"
+      CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020
+      CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp
+      HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020
+      HDFS-SITE.XML_dfs.replication: 1
+      HDFS-SITE.XML_dfs.permissions.enabled: false
+      HDFS-SITE.XML_dfs.datanode.address: datanode:9866
+    healthcheck:
+      test: ["CMD-SHELL", "hdfs dfs -test -d / && exit 0 || exit 1"]
+      interval: 15s
+      timeout: 10s
+      retries: 20
+
+  datanode:
+    image: apache/hadoop:3.3.6
+    user: root
+    command: [ "hdfs", "datanode" ]
+    environment:
+      CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020
+      CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp
+      HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020
+      HDFS-SITE.XML_dfs.replication: 1
+      HDFS-SITE.XML_dfs.permissions.enabled: false
+      HDFS-SITE.XML_dfs.datanode.address: datanode:9866
+    depends_on:
+      - namenode
+  #end
   #begin Fluss cluster
   coordinator-server:
     image: apache/fluss:$FLUSS_DOCKER_VERSION$
     command: coordinatorServer
     depends_on:
-      - zookeeper
+      namenode:
+        condition: service_healthy
+      zookeeper:
+        condition: service_started
     environment:
       - |
         FLUSS_PROPERTIES=
         zookeeper.address: zookeeper:2181
         bind.listeners: FLUSS://coordinator-server:9123
-        remote.data.dir: /tmp/fluss/remote-data
+        remote.data.dir: hdfs://namenode:8020/fluss-data
         datalake.format: paimon
         datalake.paimon.metastore: filesystem
-        datalake.paimon.warehouse: /tmp/paimon
-    volumes:
-      - shared-tmpfs:/tmp/paimon
+        datalake.paimon.warehouse: hdfs://namenode:8020/fluss-lake
   tablet-server:
     image: apache/fluss:$FLUSS_DOCKER_VERSION$
     command: tabletServer
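
The `depends_on` block above switches from the short list form to Compose's long form, so the coordinator now waits for the namenode's healthcheck rather than mere container startup. A minimal sketch of the resulting ordering (service names taken from the compose file above):

```shell
# 'service_healthy' blocks coordinator-server until the namenode healthcheck
# passes; 'service_started' only waits for zookeeper's container to start.
# Starting one service pulls in and orders its dependencies automatically:
docker compose up -d coordinator-server
```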
@@ -65,13 +103,11 @@ services:
         zookeeper.address: zookeeper:2181
         bind.listeners: FLUSS://tablet-server:9123
         data.dir: /tmp/fluss/data
-        remote.data.dir: /tmp/fluss/remote-data
-        kv.snapshot.interval: 0s
+        remote.data.dir: hdfs://namenode:8020/fluss-data
+        kv.snapshot.interval: 30s
         datalake.format: paimon
         datalake.paimon.metastore: filesystem
-        datalake.paimon.warehouse: /tmp/paimon
-    volumes:
-      - shared-tmpfs:/tmp/paimon
+        datalake.paimon.warehouse: hdfs://namenode:8020/fluss-lake
   zookeeper:
     restart: always
     image: zookeeper:3.9.2
@@ -86,8 +122,6 @@ services:
       - |
         FLINK_PROPERTIES=
         jobmanager.rpc.address: jobmanager
-    volumes:
-      - shared-tmpfs:/tmp/paimon
   taskmanager:
     image: apache/fluss-quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$
     depends_on:
@@ -100,16 +134,7 @@ services:
       taskmanager.numberOfTaskSlots: 10
       taskmanager.memory.process.size: 2048m
       taskmanager.memory.framework.off-heap.size: 256m
-    volumes:
-      - shared-tmpfs:/tmp/paimon
 #end
-
-volumes:
-  shared-tmpfs:
-    driver: local
-    driver_opts:
-      type: "tmpfs"
-      device: "tmpfs"
 ```
 
 The Docker Compose environment consists of the following containers:
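
One detail worth noting in the compose file above: the `CORE-SITE.XML_*` and `HDFS-SITE.XML_*` environment variables are not read by Hadoop directly; the apache/hadoop image's entrypoint renders variables of this form into the corresponding XML config files. A quick sanity check, assuming the image's usual `/opt/hadoop` layout (the config path is an assumption, not from this commit):

```shell
# The entrypoint turns CORE-SITE.XML_fs.defaultFS into a <property> entry in
# core-site.xml, so fs.defaultFS should read hdfs://namenode:8020 here.
# Path assumed from the apache/hadoop image layout.
docker compose exec namenode cat /opt/hadoop/etc/hadoop/core-site.xml
```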
@@ -183,22 +208,60 @@ services:
   zookeeper:
     restart: always
     image: zookeeper:3.9.2
+
+  namenode:
+    image: apache/hadoop:3.3.6
+    hostname: namenode
+    user: root
+    command: [ "hdfs", "namenode" ]
+    ports:
+      - 9870:9870
+      - 8020:8020
+    environment:
+      ENSURE_NAMENODE_DIR: "/tmp/hadoop/dfs/name"
+      CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020
+      CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp
+      HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020
+      HDFS-SITE.XML_dfs.replication: 1
+      HDFS-SITE.XML_dfs.permissions.enabled: false
+      HDFS-SITE.XML_dfs.datanode.address: datanode:9866
+    healthcheck:
+      test: ["CMD-SHELL", "hdfs dfs -test -d / && exit 0 || exit 1"]
+      interval: 15s
+      timeout: 10s
+      retries: 20
+
+  datanode:
+    image: apache/hadoop:3.3.6
+    user: root
+    command: [ "hdfs", "datanode" ]
+    environment:
+      CORE-SITE.XML_fs.defaultFS: hdfs://namenode:8020
+      CORE-SITE.XML_hadoop.tmp.dir: /hadoop/tmp
+      HDFS-SITE.XML_dfs.namenode.rpc-address: namenode:8020
+      HDFS-SITE.XML_dfs.replication: 1
+      HDFS-SITE.XML_dfs.permissions.enabled: false
+      HDFS-SITE.XML_dfs.datanode.address: datanode:9866
+    depends_on:
+      - namenode
 
   coordinator-server:
     image: apache/fluss:$FLUSS_DOCKER_VERSION$
     depends_on:
-      - zookeeper
+      namenode:
+        condition: service_healthy
+      zookeeper:
+        condition: service_started
     environment:
       - |
         FLUSS_PROPERTIES=
         zookeeper.address: zookeeper:2181
         bind.listeners: FLUSS://coordinator-server:9123
-        remote.data.dir: /tmp/fluss/remote-data
+        remote.data.dir: hdfs://namenode:8020/fluss-data
         datalake.format: iceberg
         datalake.iceberg.type: hadoop
-        datalake.iceberg.warehouse: /tmp/iceberg
+        datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake
     volumes:
-      - shared-tmpfs:/tmp/iceberg
       - ./lib:/tmp/lib
     entrypoint: ["sh", "-c", "cp -v /tmp/lib/*.jar /opt/fluss/plugins/iceberg/ && exec /docker-entrypoint.sh coordinatorServer"]
 
@@ -213,13 +276,11 @@ services:
         zookeeper.address: zookeeper:2181
         bind.listeners: FLUSS://tablet-server:9123
         data.dir: /tmp/fluss/data
-        remote.data.dir: /tmp/fluss/remote-data
-        kv.snapshot.interval: 0s
+        remote.data.dir: hdfs://namenode:8020/fluss-data
+        kv.snapshot.interval: 30s
         datalake.format: iceberg
         datalake.iceberg.type: hadoop
-        datalake.iceberg.warehouse: /tmp/iceberg
-    volumes:
-      - shared-tmpfs:/tmp/iceberg
+        datalake.iceberg.warehouse: hdfs://namenode:8020/fluss-lake
 
   jobmanager:
     image: apache/fluss-quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$
@@ -230,8 +291,6 @@ services:
       - |
         FLINK_PROPERTIES=
         jobmanager.rpc.address: jobmanager
-    volumes:
-      - shared-tmpfs:/tmp/iceberg
 
   taskmanager:
     image: apache/fluss-quickstart-flink:1.20-$FLUSS_DOCKER_VERSION$
@@ -245,15 +304,6 @@ services:
       taskmanager.numberOfTaskSlots: 10
       taskmanager.memory.process.size: 2048m
       taskmanager.memory.framework.off-heap.size: 256m
-    volumes:
-      - shared-tmpfs:/tmp/iceberg
-
-volumes:
-  shared-tmpfs:
-    driver: local
-    driver_opts:
-      type: "tmpfs"
-      device: "tmpfs"
 ```
 
 The Docker Compose environment consists of the following containers:
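
With either stack, the namenode healthcheck gates Fluss startup, so it is worth confirming HDFS comes up healthy before continuing. A minimal sketch, assuming the service names and healthcheck settings from the compose files above:

```shell
# Start everything in the background; compose orders services via depends_on.
docker compose up -d

# The namenode should flip from 'starting' to 'healthy' within a few
# healthcheck intervals (15s each, up to 20 retries).
docker compose ps namenode

# Run the same probe the healthcheck uses, by hand:
docker compose exec namenode hdfs dfs -test -d / && echo "HDFS is up"
```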
@@ -636,34 +686,6 @@ The result looks like:
 ```
 You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time.
 
-Finally, you can use the following command to view the files stored in Paimon:
-```shell
-docker compose exec taskmanager tree /tmp/paimon/fluss.db
-```
-
-**Sample Output:**
-```shell
-/tmp/paimon/fluss.db
-└── datalake_enriched_orders
-    ├── bucket-0
-    │   ├── changelog-aef1810f-85b2-4eba-8eb8-9b136dec5bdb-0.orc
-    │   └── data-aef1810f-85b2-4eba-8eb8-9b136dec5bdb-1.orc
-    ├── manifest
-    │   ├── manifest-aaa007e1-81a2-40b3-ba1f-9df4528bc402-0
-    │   ├── manifest-aaa007e1-81a2-40b3-ba1f-9df4528bc402-1
-    │   ├── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-0
-    │   ├── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-1
-    │   └── manifest-list-ceb77e1f-7d17-4160-9e1f-f334918c6e0d-2
-    ├── schema
-    │   └── schema-0
-    └── snapshot
-        ├── EARLIEST
-        ├── LATEST
-        └── snapshot-1
-```
-
-The files adhere to Paimon's standard format, enabling seamless querying with other engines such as [Spark](https://paimon.apache.org/docs/1.3/spark/quick-start/) and [Trino](https://paimon.apache.org/docs/1.3/ecosystem/trino/).
-
 </TabItem>
 
 <TabItem value="iceberg" label="Iceberg">
@@ -730,22 +752,89 @@ SELECT sum(total_price) as sum_price FROM datalake_enriched_orders;
 
 You can execute the real-time analytics query multiple times, and the results will vary with each run as new data is continuously written to Fluss in real-time.
 
-Finally, you can use the following command to view the files stored in Iceberg:
+</TabItem>
+</Tabs>
+
+### Remote Storage
+
+Finally, you can use the following command to view the Fluss KV snapshots stored in Fluss remote storage:
+```shell
+docker compose exec namenode hdfs dfs -ls -R /fluss-data/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
+```
+
+**Sample Output:**
+```shell
+hdfs://namenode:8020/fluss-data
+└── kv
+    └── fluss
+        ├── enriched_orders-3
+        │   └── 0
+        │       ├── shared
+        │       │   ├── 71fca534-ecca-489b-a19a-bd0538c9f9e9
+        │       │   ├── b06ef3a3-2873-470e-961f-da25582136a1
+        │       │   └── b93bad5c-00fb-4e62-8217-71b010621479
+        │       └── snap-2
+        │           ├── _METADATA
+        │           ├── 08d39726-f847-4401-8f31-4e905f2ba3f6
+        │           ├── b6a7bc2c-b5c3-4eeb-a523-b2b6fff159f3
+        │           └── e6278555-d71f-431f-954e-71bf066dd29f
+        ├── fluss_customer-1
+        ... # Remaining entries omitted for brevity
+```
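
Because the tablet servers set `kv.snapshot.interval: 30s`, snapshots only appear under `/fluss-data` some time after writes begin. A simple way to poll for them (path taken from the sample output above):

```shell
# List the per-table KV directories; snapshot uploads run roughly every
# 30 seconds, so this may be empty right after startup. Re-run until a
# snap-N directory appears.
docker compose exec namenode hdfs dfs -ls /fluss-data/kv/fluss
```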
+
+### Lake Storage
+
+<Tabs groupId="lake-tabs">
+<TabItem value="paimon" label="Paimon" default>
+
+Finally, you can use the following command to view the files stored in the Paimon warehouse on HDFS:
 ```shell
-docker compose exec taskmanager tree /tmp/iceberg/fluss
+docker compose exec namenode hdfs dfs -ls -R /fluss-lake/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
 ```
 
 **Sample Output:**
 ```shell
-/tmp/iceberg/fluss
-└── datalake_enriched_orders
-    ├── data
-    │   └── 00000-0-abc123.parquet
-    └── metadata
-        ├── snap-1234567890123456789-1-abc123.avro
-        └── v1.metadata.json
-```
-The files adhere to Iceberg's standard format, enabling seamless querying with other engines such as [Spark](https://iceberg.apache.org/docs/latest/spark-queries/) and [Trino](https://trino.io/docs/current/connector/iceberg.html).
+hdfs://namenode:8020/fluss-lake
+├── default.db
+└── fluss.db
+    └── datalake_enriched_orders
+        ├── bucket-0
+        │   └── data-02acf76d-c4cc-4bc1-9292-e64a77dfcc72-0.parquet
+        ├── manifest
+        │   ├── manifest-df5b6833-7e92-4ec9-a196-51d6fd60b1d1-0
+        │   ├── manifest-list-b683c5a2-4072-4c7a-8586-2c853de8d964-0
+        │   └── manifest-list-b683c5a2-4072-4c7a-8586-2c853de8d964-1
+        ├── schema
+        │   └── schema-0
+        └── snapshot
+            ├── LATEST
+            └── snapshot-1
+```
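
The `snapshot/LATEST` hint file in the listing above should hold the id of the newest Paimon snapshot. A quick sketch for inspecting it (path assembled from the sample output, so adjust for your table):

```shell
# Print the snapshot hint; it should contain the id of snapshot-1 here.
docker compose exec namenode hdfs dfs -cat /fluss-lake/fluss.db/datalake_enriched_orders/snapshot/LATEST
```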
+
+</TabItem>
+
+<TabItem value="iceberg" label="Iceberg">
+
+Finally, you can use the following command to view the files stored in the Iceberg warehouse on HDFS:
+```shell
+docker compose exec namenode hdfs dfs -ls -R /fluss-lake/ | awk '{print $8}' | grep -v '^$' | tree --fromfile .
+```
+
+**Sample Output:**
+```shell
+hdfs://namenode:8020/fluss-lake
+└── fluss
+    └── datalake_enriched_orders
+        ├── data
+        │   └── __bucket=0
+        │       └── 00000-0-3ff95845-47af-456f-83e0-8411576cfffe-00001.parquet
+        └── metadata
+            ├── 528ae521-d683-4c5e-8dd7-779a83dd9c6f-m0.avro
+            ├── snap-3496049107217731071-1-528ae521-d683-4c5e-8dd7-779a83dd9c6f.avro
+            ├── v1.metadata.json
+            ├── v2.metadata.json
+            └── version-hint.text
+```
 
 </TabItem>
 </Tabs>
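
For the Iceberg Hadoop catalog, `version-hint.text` serves a similar purpose: it records the number of the current metadata file. A sketch for checking it (path taken from the sample output above):

```shell
# Should print 2 here, matching v2.metadata.json as the current metadata.
docker compose exec namenode hdfs dfs -cat /fluss-lake/fluss/datalake_enriched_orders/metadata/version-hint.text
```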
