Skip to content

Commit d8c366a

Browse files
committed
upload data to kaggle and hugging face
1 parent c2dd634 commit d8c366a

File tree

9 files changed

+134
-11
lines changed

9 files changed

+134
-11
lines changed

airflow-setup-test.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,11 @@ export DB_PASSWORD="airflowpass"
2020
export KAGGLE_USERNAME=$(clear_quotation_marks $(cat $KAGGLE_KEY_FILE | jq .username))
2121
export KAGGLE_KEY=$(clear_quotation_marks $(cat $KAGGLE_KEY_FILE | jq .key))
2222
export KAGGLE_DATASET="dpbmanalysis/quantum-circuit-images"
23+
export KAGGLE_MODEL="dpbmanalysis/qcop/pyTorch/standard"
2324

2425
# HF_TOKEN comes from system
2526
export HF_DATASET="Dpbm/quantum-circuits"
27+
export HF_MODEL_REPO="Dpbm/qcop"
2628

2729
# fix permission for volume
2830
sudo rm -rf ./data

compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,10 @@ services:
4141
- KAGGLE_USERNAME=${KAGGLE_USERNAME}
4242
- KAGGLE_KEY=${KAGGLE_KEY}
4343
- KAGGLE_DATASET=${KAGGLE_DATASET}
44+
- KAGGLE_MODEL=${KAGGLE_MODEL}
4445
- HF_TOKEN=${HF_TOKEN}
4546
- HF_DATASET=${HF_DATASET}
47+
- HF_MODEL_REPO=${HF_MODEL_REPO}
4648
- PYTHONPATH=/home/airflow/project
4749
- TARGET_FOLDER=/home/airflow/data
4850
- TZ=America/Sao_Paulo

dags/dataset.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,18 +120,16 @@
120120
Run training after finishing all processes.
121121
"""
122122

123-
kaggle_dataset = os.environ.get("KAGGLE_DATASET")
124123
send_kaggle = PythonOperator(
125124
task_id="send_kaggle",
126125
python_callable=upload_dataset_kaggle,
127-
op_args=[kaggle_dataset,folder]
126+
op_args=[folder]
128127
)
129128

130-
hf_dataset = os.environ.get("HF_DATASET")
131129
send_hf = PythonOperator(
132130
task_id="send_huggingface",
133131
python_callable=upload_dataset_huggingface,
134-
op_args=[hf_dataset,folder]
132+
op_args=[folder]
135133
)
136134

137135
send_hf.doc_md = """

dags/train.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from train import setup_and_run_training
99
from args.parser import Arguments
1010
from utils.constants import DEFAULT_TARGET_FOLDER
11+
from export.kaggle import upload_model as upload_model_kaggle
12+
from export.huggingface import upload_model as upload_model_hf
1113

1214
with DAG(dag_id="train_model", description="train vision model") as dag:
1315
# the env variable is meant to ease the docker image usage
@@ -16,9 +18,34 @@
1618
args.target_folder = folder
1719

1820
train = PythonOperator(
19-
task_id="train_model", python_callable=setup_and_run_training, op_args=[args]
21+
task_id="train_model",
22+
python_callable=setup_and_run_training,
23+
op_args=[args]
2024
)
2125

2226
train.doc_md = """
2327
Run the training cycle
2428
"""
29+
30+
upload_kaggle = PythonOperator(
31+
task_id="upload_kaggle",
32+
python_callable=upload_model_kaggle,
33+
op_args=[folder]
34+
)
35+
36+
upload_kaggle.doc_md = """
37+
Send model file to kaggle
38+
"""
39+
40+
upload_hf = PythonOperator(
41+
task_id="upload_hugginface",
42+
python_callable=upload_model_hf,
43+
op_args=[folder]
44+
)
45+
46+
upload_hf.doc_md = """
47+
Send model file to huggingface
48+
"""
49+
50+
train >> upload_kaggle
51+
train >> upload_hf

export/helpers.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Helpers for exporting data"""
2+
3+
import os
4+
5+
from utils.constants import MODEL_FILE_PREFIX
6+
from utils.datatypes import FilePath
7+
8+
def get_latest_model(folder: FilePath) -> FilePath:
    """
    Return the file name of the most recently modified model file in *folder*.

    A "model file" is any entry whose name starts with ``MODEL_FILE_PREFIX``.

    :param folder: directory to scan for model files
    :returns: bare file name (not joined with *folder*) of the newest model
    :raises FileNotFoundError: when no model file exists in *folder*
        (the previous version raised an opaque ``IndexError`` here)
    """

    model_files = [
        file for file in os.listdir(folder) if file.startswith(MODEL_FILE_PREFIX)
    ]

    if not model_files:
        raise FileNotFoundError(
            f"no model file with prefix {MODEL_FILE_PREFIX!r} found in {folder}"
        )

    # max() on mtime is O(n); no need to sort the whole list to pick the newest.
    return max(
        model_files,
        key=lambda file: os.path.getmtime(os.path.join(folder, file)),
    )
22+
23+
24+
25+

export/huggingface.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,51 @@
11
"""Export dataset and model to huggingface"""
22

33
import os
4+
import argparse
5+
import sys
46

57
from huggingface_hub import HfApi
68

7-
def upload_dataset(dataset_name:str, folder:str):
9+
from utils.datatypes import FilePath
10+
from export.helpers import get_latest_model
11+
12+
def upload_dataset(folder: FilePath):
    """
    Upload dataset to huggingface

    The target repo comes from the ``HF_DATASET`` env var and the
    auth token from ``HF_TOKEN``.
    """

    client = HfApi(token=os.getenv("HF_TOKEN"))
    repo = str(os.getenv("HF_DATASET"))

    # push the whole folder, skipping the intermediate dataset/ subdir
    client.upload_folder(
        folder_path=folder,
        repo_id=repo,
        repo_type="dataset",
        ignore_patterns=["dataset/"],
    )
26+
27+
def upload_model(folder: str):
    """
    Get model file and upload it to huggingface

    Picks the newest model file in *folder* (via ``get_latest_model``) and
    uploads it to the repo named by the ``HF_MODEL_REPO`` env var.
    """

    model_file = get_latest_model(folder)
    client = HfApi(token=os.getenv("HF_TOKEN"))
    repo = str(os.getenv("HF_MODEL_REPO"))

    client.upload_file(
        path_or_fileobj=os.path.join(folder, model_file),
        path_in_repo=model_file,
        repo_id=repo,
        repo_type="model",
    )
41+
42+
43+
44+
if __name__ == "__main__":
    # CLI entry point: upload both dataset and model from the given path
    cli = argparse.ArgumentParser()
    cli.add_argument("path", type=str)
    parsed = cli.parse_args(sys.argv[1:])

    upload_dataset(parsed.path)
    upload_model(parsed.path)
51+

export/kaggle.py

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,47 @@
11
"""Export dataset and model to kaggle"""
22

3+
import os
4+
import sys
35
from time import ctime
6+
import argparse
47

58
import kagglehub as kh
69

10+
from utils.datatypes import FilePath
711

8-
def upload_dataset(dataset_name:str, folder:str):
12+
def upload_dataset(folder: FilePath):
    """
    Upload dataset files to kaggle

    The dataset handle comes from the ``KAGGLE_DATASET`` env var; the
    current timestamp is used as the version note.
    """

    notes = ctime()
    handle = str(os.getenv("KAGGLE_DATASET"))

    # skip the intermediate dataset/ subdir when uploading
    kh.dataset_upload(
        handle,
        folder,
        version_notes=notes,
        ignore_patterns=["dataset/"],
    )
25+
26+
def upload_model(folder: str):
    """
    Get model file and upload it to kaggle

    Uploads *folder* as a model version to the handle named by the
    ``KAGGLE_MODEL`` env var, filtering out dataset artifacts.
    """

    notes = ctime()
    handle = str(os.getenv("KAGGLE_MODEL"))

    # keep only model files: drop dataset dir, circuit dumps and tabular exports
    kh.model_upload(
        handle=handle,
        local_model_dir=folder,
        version_notes=notes,
        ignore_patterns=["dataset/", "ghz*", "*.zip", "*.h5", "*.csv"],
    )
40+
41+
if __name__ == "__main__":
    # CLI entry point: push both dataset and model from the given path
    cli = argparse.ArgumentParser()
    cli.add_argument("path", type=str)
    parsed = cli.parse_args(sys.argv[1:])

    upload_dataset(parsed.path)
    upload_model(parsed.path)

train.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from utils.constants import (
2020
DEBUG,
21+
MODEL_FILE_PREFIX,
2122
dataset_file,
2223
images_h5_file,
2324
ghz_file,
@@ -208,7 +209,7 @@ def forward(self, image: torch.Tensor) -> torch.Tensor:
208209

209210
def save(self):
    """Persist the model weights under a timestamped, prefixed file name."""
    # NOTE(review): file is written relative to the CWD — presumably the same
    # directory get_latest_model later scans; confirm against the DAG setup.
    filename = f"{MODEL_FILE_PREFIX}{time.ctime()}"
    torch.save(self.state_dict(), filename)
213214

214215

utils/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525

2626
DEFAULT_CHECKPOINT = None
2727

28+
MODEL_FILE_PREFIX = "model_"
29+
2830
# ruff: noqa: E731
2931
dataset_path = lambda target_folder: os.path.join(target_folder, "dataset")
3032
dataset_file = lambda target_folder: os.path.join(target_folder, "dataset.csv")

0 commit comments

Comments
 (0)