-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstarrocks-tpch-data-load-job.yaml
More file actions
217 lines (180 loc) · 7.79 KB
/
starrocks-tpch-data-load-job.yaml
File metadata and controls
217 lines (180 loc) · 7.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
apiVersion: batch/v1
kind: Job
metadata:
name: tpch-data-load
namespace: starrocks
labels:
app: tpch-data-load
spec:
template:
spec:
containers:
- name: data-loader
image: public.ecr.aws/amazonlinux/amazonlinux:2
command: ["/bin/bash"]
args:
- "-c"
- |
set -e
echo "Installing required packages..."
yum install -y mysql python3 python3-pip
echo "Installing Python dependencies..."
pip3 install pymysql pandas
echo "Testing StarRocks connection..."
FE_HOST="starrocks-karpenter-shared-nothing-fe-service.starrocks.svc.cluster.local"
mysql -h $FE_HOST -P 9030 -u root -e "SHOW DATABASES;" || exit 1
echo "Checking existing TPC-H data in PVC..."
cd /data
ls -la
if [ ! -d "tpch-poc-1.0" ]; then
echo "ERROR: tpch-poc-1.0 directory not found in PVC!"
echo "Please run the data generation job first."
exit 1
fi
cd tpch-poc-1.0
echo "Verifying TPC-H data files..."
if [ ! -d "data_1000" ]; then
echo "ERROR: data_1000 directory not found!"
echo "Please run the data generation job first."
exit 1
fi
echo "TPC-H data files found:"
ls -la data_1000/
echo "File sizes:"
du -h data_1000/* | head -20
echo "Creating TPC-H database and tables using correct DDL for SF1000..."
# Use the DDL for 1000 scale factor which matches the data format
mysql -h $FE_HOST -P 9030 -u root < sql/tpch/ddl_1000/tpch_create.sql
echo "Verifying database and tables creation..."
mysql -h $FE_HOST -P 9030 -u root -e "USE tpch; SHOW TABLES;"
echo "Configuring StarRocks for large data loading..."
mysql -h $FE_HOST -P 9030 -u root << 'EOSQL'
-- Show current settings (no configuration changes needed)
SHOW VARIABLES LIKE '%timeout%';
SHOW VARIABLES LIKE '%mem_limit%';
SELECT 'StarRocks configuration check completed' as status;
EOSQL
echo "Configuring StarRocks connection for stream load..."
# Create a clean configuration file
cat > conf/starrocks.conf << EOF
[starrocks]
# for mysql cmd
mysql_host: ${FE_HOST}
mysql_port: 9030
mysql_user: root
mysql_password:
database: tpch
# cluster ports
http_port: 8030
be_heartbeat_port: 9050
broker_port: 8000
parallel_num: 8
concurrency_num: 1
num_of_queries: 1
sleep_ms: 500
[broker_load]
broker: starrocks
broker_username: hdfs_broker
broker_password:
hadoop_home: /tools/hadoop-2.7.7
# data size per broker load job: 500M
max_bytes_per_job: 524288000
# format and column separator
file_format: orc
column_separator:
# job properties
# The maximum tolerance rate of job. 0 ~ 1 (increased for large datasets)
max_filter_ratio: 0.5
timeout: 14400
EOF
echo "Configuration file created:"
cat conf/starrocks.conf
echo "Setting up environment for stream_load.sh..."
# Make sure the scripts are executable
chmod +x bin/stream_load.sh
chmod +x bin/common_info.sh
echo "Patching stream_load to add strict_mode=false and max_filter_ratio..."
# Backup original file
cp src/lib/starrocks_lib.py src/lib/starrocks_lib.py.backup
# Patch the get_stream_load_cmd function to include strict_mode=false and max_filter_ratio
sed -i 's/-H "column_separator:|"/-H "column_separator:|" -H "strict_mode:false" -H "max_filter_ratio:0.5"/g' src/lib/starrocks_lib.py
echo "Patched stream_load command to include strict_mode=false and max_filter_ratio=0.5"
echo "Loading data using tpch-poc stream_load.sh..."
# Ensure we're in the correct directory
cd /data/tpch-poc-1.0
pwd
# Verify the script exists and is executable
ls -la bin/stream_load.sh
# Verify the data directory and files
echo "Data directory contents:"
ls -la data_1000/ | head -10
echo "Checking data format (first few lines of lineitem file):"
head -5 data_1000/lineitem.tbl.1 || echo "Could not read lineitem.tbl.1"
echo "Checking lineitem table schema in StarRocks:"
mysql -h $FE_HOST -P 9030 -u root -e "USE tpch; DESC lineitem;"
echo "Analyzing data format issue..."
echo "The error shows NULL values in l_orderkey column, indicating column mapping issue"
# Use patched stream_load with strict_mode=false and max_filter_ratio
echo "Executing: bash bin/stream_load.sh data_1000"
bash bin/stream_load.sh data_1000
echo "Data loading completed. Verifying results..."
mysql -h $FE_HOST -P 9030 -u root << 'EOSQL'
USE tpch;
SELECT 'TPC-H Data Loading Summary' as status;
SELECT '=============================' as separator;
SELECT 'region' as table_name, COUNT(*) as row_count FROM region
UNION ALL
SELECT 'nation' as table_name, COUNT(*) as row_count FROM nation
UNION ALL
SELECT 'supplier' as table_name, COUNT(*) as row_count FROM supplier
UNION ALL
SELECT 'customer' as table_name, COUNT(*) as row_count FROM customer
UNION ALL
SELECT 'part' as table_name, COUNT(*) as row_count FROM part
UNION ALL
SELECT 'partsupp' as table_name, COUNT(*) as row_count FROM partsupp
UNION ALL
SELECT 'orders' as table_name, COUNT(*) as row_count FROM orders
UNION ALL
SELECT 'lineitem' as table_name, COUNT(*) as row_count FROM lineitem
ORDER BY
CASE table_name
WHEN 'region' THEN 1
WHEN 'nation' THEN 2
WHEN 'supplier' THEN 3
WHEN 'customer' THEN 4
WHEN 'part' THEN 5
WHEN 'partsupp' THEN 6
WHEN 'orders' THEN 7
WHEN 'lineitem' THEN 8
END;
SELECT 'Expected row counts for TPC-H SF10:' as info;
SELECT 'region: 5, nation: 25, supplier: 100000' as expected_small;
SELECT 'customer: 1500000, part: 2000000' as expected_medium;
SELECT 'partsupp: 8000000, orders: 15000000, lineitem: ~60000000' as expected_large;
SELECT 'Data loading verification completed' as status;
EOSQL
echo "TPC-H data loading completed successfully!"
echo "Database: tpch"
echo "Scale Factor: 1000 (1TB)"
echo "Data source: PVC generated data (data_1000/)"
echo "Loading method: tpch-poc-1.0 stream_load.sh"
echo "Ready for benchmark execution!"
resources:
requests:
cpu: "4"
memory: "16Gi"
limits:
cpu: "4"
memory: "16Gi"
volumeMounts:
- name: data-storage
mountPath: /data
volumes:
- name: data-storage
persistentVolumeClaim:
claimName: tpch-data-1tb-pvc
restartPolicy: Never
nodeSelector:
kubernetes.io/arch: amd64
backoffLimit: 3