Commit c2dd634

upload data to kaggle and hugging face

1 parent 0df9e32 · commit c2dd634

File tree: 8 files changed, +64 -7 lines changed

  airflow-setup-test.sh
  airflow.Dockerfile
  airflow.Dockerfile.dockerignore
  compose.yml
  dags/dataset.py
  export/huggingface.py
  export/kaggle.py
  generate/dataset.py

airflow-setup-test.sh
Lines changed: 4 additions & 0 deletions

@@ -19,6 +19,10 @@ export DB_PASSWORD="airflowpass"
 
 export KAGGLE_USERNAME=$(clear_quotation_marks $(cat $KAGGLE_KEY_FILE | jq .username))
 export KAGGLE_KEY=$(clear_quotation_marks $(cat $KAGGLE_KEY_FILE | jq .key))
+export KAGGLE_DATASET="dpbmanalysis/quantum-circuit-images"
+
+# HF_TOKEN comes from system
+export HF_DATASET="Dpbm/quantum-circuits"
 
 # fix permission for volume
 sudo rm -rf ./data
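
Both upload targets are configured purely through the environment: kagglehub reads KAGGLE_USERNAME and KAGGLE_KEY implicitly, while the Hugging Face client receives HF_TOKEN explicitly (see export/huggingface.py below). A minimal sanity check, not part of this commit, to confirm the variables are visible from Python before the DAG runs:

import os

# Variables exported by airflow-setup-test.sh and forwarded via compose.yml.
REQUIRED = ("KAGGLE_USERNAME", "KAGGLE_KEY", "KAGGLE_DATASET", "HF_TOKEN", "HF_DATASET")

for var in REQUIRED:
    print(f"{var}: {'set' if os.environ.get(var) else 'MISSING'}")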

airflow.Dockerfile
Lines changed: 0 additions & 3 deletions

@@ -8,9 +8,7 @@ RUN ${PIPENV} install -r requirements.txt
 
 
 FROM debian:bookworm-slim AS entry
-
 RUN apt update && apt install zip make -y
-
 WORKDIR /
 COPY airflow-entrypoint.sh entrypoint.sh
 RUN chmod +x entrypoint.sh
@@ -24,7 +22,6 @@ COPY . .
 
 WORKDIR /home/airflow/.local/bin
 COPY --from=entry --chown=airflow:root /usr/bin/zip zip
-COPY --from=entry --chown=airflow:root /usr/bin/make make
 
 WORKDIR /
 COPY --from=entry /entrypoint.sh .

airflow.Dockerfile.dockerignore
Lines changed: 0 additions & 1 deletion

@@ -4,6 +4,5 @@
 !generate/
 !utils/
 !args/
-!Makefile
 !requirements.txt
 !airflow-entrypoint.sh

compose.yml
Lines changed: 5 additions & 0 deletions

@@ -38,6 +38,11 @@ services:
       - USER=${AIRFLOW_USERNAME}
       - PASSWORD=${AIRFLOW_PASSWORD}
       - EMAIL=${AIRFLOW_EMAIL}
+      - KAGGLE_USERNAME=${KAGGLE_USERNAME}
+      - KAGGLE_KEY=${KAGGLE_KEY}
+      - KAGGLE_DATASET=${KAGGLE_DATASET}
+      - HF_TOKEN=${HF_TOKEN}
+      - HF_DATASET=${HF_DATASET}
       - PYTHONPATH=/home/airflow/project
       - TARGET_FOLDER=/home/airflow/data
       - TZ=America/Sao_Paulo

dags/dataset.py
Lines changed: 24 additions & 1 deletion

@@ -24,6 +24,8 @@
     DEFAULT_THREADS,
 )
 from generate.ghz import gen_circuit
+from export.kaggle import upload_dataset as upload_dataset_kaggle
+from export.huggingface import upload_dataset as upload_dataset_huggingface
 
 default_args = {
     "depends_on_past": True,
@@ -92,7 +94,7 @@
     with resized and normalized images.
     """
 
-    command = f"zip -r {folder}/dataset-images.zip {folder}/dataset/"
+    command = f"cd {folder} && zip -r dataset-images.zip dataset/"
     pack_img = BashOperator(task_id="pack_images", bash_command=command)
 
     pack_img.doc_md = """
@@ -118,10 +120,31 @@
     Run training after finishing all processes.
     """
 
+    kaggle_dataset = os.environ.get("KAGGLE_DATASET")
+    send_kaggle = PythonOperator(
+        task_id="send_kaggle",
+        python_callable=upload_dataset_kaggle,
+        op_args=[kaggle_dataset, folder]
+    )
+
+    hf_dataset = os.environ.get("HF_DATASET")
+    send_hf = PythonOperator(
+        task_id="send_huggingface",
+        python_callable=upload_dataset_huggingface,
+        op_args=[hf_dataset, folder]
+    )
+
+    send_hf.doc_md = """
+    Send dataset files to huggingface
+    """
+
     create_folder >> [gen_ghz, gen_df]
     gen_df >> gen_images
     gen_images >> remove_duplicates
     remove_duplicates >> transform_img
     transform_img >> pack_img
 
     [gen_ghz, pack_img] >> trigger_dag_train
+    [gen_ghz, pack_img] >> send_kaggle
+    [gen_ghz, pack_img] >> send_hf
+
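The new send_kaggle and send_huggingface tasks fan in from gen_ghz and pack_img, the same gating used for trigger_dag_train, so uploads only start once the zip archive exists. For a quick check outside Airflow, the callables can be invoked directly; a rough smoke test, assuming the project root is on PYTHONPATH and the same environment variables the DAG uses are exported:

import os

from export.kaggle import upload_dataset as upload_dataset_kaggle
from export.huggingface import upload_dataset as upload_dataset_huggingface

# TARGET_FOLDER mirrors the compose.yml setting; point it at a local copy for testing.
folder = os.environ.get("TARGET_FOLDER", "/home/airflow/data")

upload_dataset_kaggle(os.environ["KAGGLE_DATASET"], folder)
upload_dataset_huggingface(os.environ["HF_DATASET"], folder)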

export/huggingface.py
Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+"""Export dataset and model to huggingface"""
+
+import os
+
+from huggingface_hub import HfApi
+
+def upload_dataset(dataset_name: str, folder: str):
+    api = HfApi(token=os.getenv("HF_TOKEN"))
+    api.upload_folder(
+        folder_path=folder,
+        repo_id=dataset_name,
+        repo_type="dataset",
+        ignore_patterns=["dataset/"]
+    )
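
upload_folder pushes everything under the target folder to the dataset repo, with ignore_patterns excluding the unpacked dataset/ directory so only the packed zip and metadata files go up. One caveat: upload_folder assumes the repo already exists on the Hub. A defensive variant, a sketch rather than part of the commit, that creates the repo on first run:

import os

from huggingface_hub import HfApi


def upload_dataset(dataset_name: str, folder: str):
    api = HfApi(token=os.getenv("HF_TOKEN"))
    # exist_ok=True makes this a no-op when the dataset repo already exists
    api.create_repo(dataset_name, repo_type="dataset", exist_ok=True)
    api.upload_folder(
        folder_path=folder,
        repo_id=dataset_name,
        repo_type="dataset",
        ignore_patterns=["dataset/"],
    )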

export/kaggle.py
Lines changed: 16 additions & 1 deletion

@@ -1,3 +1,18 @@
 """Export dataset and model to kaggle"""
 
-# import kagglehub as kh
+from time import ctime
+
+import kagglehub as kh
+
+
+def upload_dataset(dataset_name: str, folder: str):
+    """
+    Upload dataset files to kaggle
+    """
+
+    version = ctime()
+    kh.dataset_upload(
+        dataset_name,
+        folder,
+        version_notes=version,
+        ignore_patterns=["dataset/"])
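
kagglehub.dataset_upload publishes a new version of an existing dataset handle, and ctime() simply stamps the version notes with the upload time (e.g. "Mon Jun  2 10:30:00 2025"). A minimal invocation, assuming the credentials from airflow-setup-test.sh are exported and the dpbmanalysis/quantum-circuit-images dataset already exists on Kaggle:

from export.kaggle import upload_dataset

# handle and folder match the setup script and compose.yml above
upload_dataset("dpbmanalysis/quantum-circuit-images", "/home/airflow/data")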

generate/dataset.py
Lines changed: 1 addition & 1 deletion

(An indentation-only change, apparently moving save_df out of the batch loop so the dataset file is written once after all threads finish rather than on every iteration.)

@@ -175,7 +175,7 @@ def generate_images(
             df.vstack(tmp_df, in_place=True)
 
         progress.update(total_threads)
-        save_df(df, dataset_file_path)
+    save_df(df, dataset_file_path)
 
 
 def remove_duplicated_files(target_folder: FilePath):
