-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtpch-data-gen-job.yaml
More file actions
81 lines (72 loc) · 2.6 KB
/
tpch-data-gen-job.yaml
File metadata and controls
81 lines (72 loc) · 2.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
---
# One-shot Job that downloads the tpch-poc tool package onto the mounted
# data volume and generates a TPC-H data set there.
# The scale factor defaults to 1000 and is set through the SCALE_FACTOR
# environment variable on the container.
apiVersion: batch/v1
kind: Job
metadata:
  name: tpch-data-generator
  namespace: starrocks
  labels:
    app: tpch-data-gen
spec:
  template:
    metadata:
      labels:
        app: tpch-data-gen
    spec:
      # serviceAccountName: tpch-s3-service-account
      containers:
        - name: data-generator
          image: public.ecr.aws/amazonlinux/amazonlinux:2
          command: ["/bin/bash"]
          env:
            # Passed to gen-tpch.sh; also used to name the output directory
            # (data_<SCALE_FACTOR>). Quoted so it stays a string.
            - name: SCALE_FACTOR
              value: "1000"
          args:
            - "-c"
            - |
              # Abort on any failing command, unset variable, or failed pipeline stage.
              set -euo pipefail

              # Fall back to 1000 if the variable is not provided by the pod spec.
              SCALE_FACTOR="${SCALE_FACTOR:-1000}"
              OUTPUT_DIR="data_${SCALE_FACTOR}"

              echo "Installing required packages..."
              yum install -y wget unzip

              echo "Downloading tpch-poc tool package..."
              cd /data
              # Clean up everything in /data directory to ensure fresh start.
              # "--" and the "./" prefix stop file names that begin with "-"
              # from being interpreted as rm options.
              rm -rf -- ./*
              wget https://starrocks-public.oss-cn-zhangjiakou.aliyuncs.com/tpch-poc-1.0.zip

              echo "Extracting tpch-poc package..."
              unzip tpch-poc-1.0.zip
              cd tpch-poc-1.0

              # Diagnostic listing only; a missing path must not fail the job.
              echo "Checking directory structure..."
              ls -la
              ls -la bin/gen_data/ || echo "gen_data directory not found"
              ls -la ../common_info.sh || echo "common_info.sh not found in parent"
              ls -la common_info.sh || echo "common_info.sh not found in current"
              pwd

              echo "Starting TPC-H data generation with scale factor ${SCALE_FACTOR}..."
              # Fix the path issue - common_info.sh is in bin/ directory, not parent
              # Create a symlink to fix the path reference
              ln -sf bin/common_info.sh common_info.sh

              # Modify gen-tpch.sh to add -f flag to dbgen calls to force overwrite
              sed -i 's/dbgen -s/dbgen -f -s/g' bin/gen_data/gen-tpch.sh

              # Verify the modification worked; fail loudly instead of exiting
              # silently when no dbgen call was patched.
              echo "Modified dbgen calls:"
              if ! grep "dbgen -f" bin/gen_data/gen-tpch.sh; then
                echo "ERROR: no dbgen call was patched in bin/gen_data/gen-tpch.sh" >&2
                exit 1
              fi

              # Make sure we're in the right directory and use bash explicitly
              chmod +x bin/gen_data/gen-tpch.sh
              bash bin/gen_data/gen-tpch.sh "${SCALE_FACTOR}" "${OUTPUT_DIR}"

              echo "TPC-H data generation completed!"
              echo "Generated data is available in: /data/tpch-poc-1.0/${OUTPUT_DIR}"
              ls -la "${OUTPUT_DIR}/"
              echo "Job completed successfully!"
          resources:
            requests:
              cpu: "4"
              memory: "8Gi"
            limits:
              cpu: "8"
              memory: "16Gi"
          volumeMounts:
            - name: data-storage
              mountPath: /data
      volumes:
        - name: data-storage
          persistentVolumeClaim:
            claimName: tpch-data-1tb-pvc
      # Never restart the container in place; a failed run gets a new pod,
      # up to backoffLimit retries.
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/arch: amd64
  backoffLimit: 3