Skip to content

Commit a57111a

Browse files
committed
fixed dataset load
1 parent 6d0ed79 commit a57111a

File tree

12 files changed

+101
-32
lines changed

(duplicate of the file-tree summary above: 12 files changed, +101 −32 lines changed)

airflow-setup-test.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,25 @@
11
#!/bin/bash
22

3+
KAGGLE_KEY_FILE="$HOME/.kaggle/kaggle.json"
4+
5+
clear_quotation_marks(){
6+
STRING=$1
7+
echo $STRING | sed 's/"//g'
8+
}
9+
310
export AIRFLOW_USERNAME="test"
411
export AIRFLOW_PASSWORD="test"
512
export AIRFLOW_EMAIL="[email protected]"
13+
614
export ROOT_DB="rootuser"
715
export ROOT_PASS="rootpass"
816
export DB_NAME="airflow"
917
export DB_USERNAME="airflowuser"
1018
export DB_PASSWORD="airflowpass"
1119

20+
export KAGGLE_USERNAME=$(clear_quotation_marks $(cat $KAGGLE_KEY_FILE | jq .username))
21+
export KAGGLE_KEY=$(clear_quotation_marks $(cat $KAGGLE_KEY_FILE | jq .key))
22+
1223
# fix permission for volume
1324
sudo rm -rf ./data
1425
sudo mkdir -p ./data

airflow.Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ ENV PIPENV="/proj-venv/bin/pip"
77

88
RUN ${PIPENV} install -r requirements.txt
99

10-
11-
1210
FROM apache/airflow:slim-2.11.0-python3.12 AS serve
1311

1412
COPY --from=setup /proj-venv/lib/python3.12/site-packages/ /home/airflow/.local/lib/python3.12/site-packages
@@ -28,5 +26,7 @@ RUN airflow users create --username $USER \
2826
--email ${EMAIL} \
2927
--password ${PASSWORD}
3028

29+
RUN airflow scheduler &
30+
3131
EXPOSE 8080
32-
ENTRYPOINT [ "airflow", "standalone" ]
32+
ENTRYPOINT [ "airflow", "webserver", "--port", "8080" ]
File renamed without changes.

compose.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,5 @@ services:
4040
- AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS=False
4141
- AIRFLOW__CORE__SQL_ALCHEMY_CONN=postgres+psycopg2://${DB_USERNAME}:${DB_PASSWORD}@db:5432/${DB_NAME}
4242
- AIRFLOW_CONN_METADATA_DB=postgres+psycopg2://${DB_USERNAME}:${DB_PASSWORD}@db:5432/${DB_NAME}
43-
restart: unless-stopped
43+
restart: always
44+

constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,6 @@
3030

3131
NEW_DIM = (500, 500)
3232

33+
TRAIN_PERCENTAGE = 0.7
34+
TEST_PERCENTAGE = 0.2
35+
EVAL_PERCENTAGE = 1 - (TRAIN_PERCENTAGE + TEST_PERCENTAGE)

dags/dataset.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from airflow import DAG
44
from airflow.providers.standard.operators.bash import BashOperator
55
from airflow.providers.standard.operators.python import PythonOperator
6+
from airflow.providers.standard.operators.trigger_dagrun import TriggerDagRunOperator
67

78
from dataset import generate_images, crate_dataset_folder, remove_duplicated_files, transform_images, start_df
89
from constants import DATASET_PATH, DATASET_FILE
@@ -86,9 +87,21 @@
8687
Generate a GHZ experiment and saves the experiments results.
8788
"""
8889

90+
trigger_dag_train = TriggerDagRunOperator(
91+
task_id="run_training",
92+
trigger_dag_id="train_model",
93+
wait_for_completion=False
94+
)
95+
96+
trigger_dag_train.doc_md = """
97+
Run training after finishing all processes.
98+
"""
99+
89100
create_folder >> [gen_ghz, gen_df]
90101
gen_df >> gen_images
91102
gen_images >> remove_duplicates
92103
remove_duplicates >> transform_img
93104
transform_img >> pack_img
94105

106+
[gen_ghz, pack_img] >> trigger_dag_train
107+

dags/train.py

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
"""ETL pipeline using Airflow for training"""
22

3-
from datetime import timedelta
4-
53
from airflow import DAG
6-
from airflow.sensors.external_task import ExternalTaskSensor
74
from airflow.providers.standard.operators.python import PythonOperator
85

96
from train import setup_and_run_training
@@ -13,14 +10,6 @@
1310
description="train vision model"
1411
) as dag:
1512

16-
wait_for_dataset_creation = ExternalTaskSensor(
17-
task_id="wait_dataset_creation",
18-
external_dag_id="build_dataset",
19-
external_task_id=None, # Set to None to wait for the whole DAG
20-
mode="poke",
21-
)
22-
23-
2413
train = PythonOperator(
2514
task_id="train_model",
2615
python_callable=setup_and_run_training,
@@ -29,5 +18,3 @@
2918
train.doc_md = """
3019
Run the training cycle
3120
"""
32-
33-
wait_for_dataset_creation >> train

export/__init__.py

Whitespace-only changes.

export/huggingface.py

Whitespace-only changes.

export/kaggle.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
"""Export dataset and model to kaggle"""
2+
import kagglehub as kh

0 commit comments

Comments (0)