Skip to content

Commit b561cf7

Browse files
committed
Refactor CI: separate data branches, remove tracked data files
1 parent 647a4f0 commit b561cf7

File tree

149 files changed

+573
-93504
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

149 files changed

+573
-93504
lines changed
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
# Calculate TOTAL2 index and store in orphan 'processed-data' branch
# This workflow is triggered manually from the GitHub Actions UI
#
# Data flow (all paths relative to the runner workspace):
#   raw-data-branch/raw              -> main/data/raw        (input)
#   raw-data-branch/cache            -> main/data/cache      (input)
#   processed-data-branch/processed  -> main/data/processed  (previous output)
#   main/data/processed              -> processed-data-branch/processed
#                                       (amended into one commit, force-pushed)
name: Calculate TOTAL2

on:
  # Manual trigger
  workflow_dispatch:
    inputs:
      index_type:
        description: 'Index type (total2 or total2b)'
        required: false
        default: 'total2b'
        type: choice
        options:
          - total2b
          - total2

jobs:
  calculate:
    runs-on: ubuntu-latest
    permissions:
      contents: write  # Required to push to processed-data branch
    env:
      # The workflow input is handed to the scripts through the environment
      # instead of being interpolated into the script text, so its value is
      # never parsed as shell code.
      INDEX_TYPE: ${{ github.event.inputs.index_type }}

    steps:
      - name: Checkout main branch
        uses: actions/checkout@v4
        with:
          path: main

      - name: Checkout raw-data branch
        id: checkout-raw-data
        uses: actions/checkout@v4
        with:
          ref: raw-data
          path: raw-data-branch
        # A failed checkout is reported by the next step with a clearer message
        continue-on-error: true

      - name: Check raw-data branch exists
        if: steps.checkout-raw-data.outcome == 'failure'
        run: |
          echo "❌ Error: raw-data branch does not exist."
          echo "Please run the 'Fetch Raw Data' workflow first."
          exit 1

      - name: Checkout processed-data branch
        id: checkout-processed-data
        uses: actions/checkout@v4
        with:
          ref: processed-data
          path: processed-data-branch
        continue-on-error: true  # Branch may not exist yet

      - name: Initialize processed-data branch if missing
        if: steps.checkout-processed-data.outcome == 'failure'
        run: |
          # Create a fresh local repository holding a single README commit on
          # an orphan branch; the remote is attached later, in the update step.
          mkdir -p processed-data-branch
          cd processed-data-branch
          git init
          git checkout --orphan processed-data
          echo "# Halvix Processed Data" > README.md
          echo "" >> README.md
          echo "This branch contains processed TOTAL2 index data." >> README.md
          echo "It is automatically updated by CI and has no history (orphan branch)." >> README.md
          echo "" >> README.md
          echo "**Do not manually edit files in this branch.**" >> README.md
          git add README.md
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git commit -m "Initialize processed-data branch"
          echo "Initialized new orphan processed-data branch"

      - name: Restore data to main workspace
        run: |
          mkdir -p main/data/raw
          mkdir -p main/data/processed
          mkdir -p main/data/cache

          # Copy raw data (prices, cache) from raw-data branch
          # (best effort: an empty source directory is not an error)
          if [ -d "raw-data-branch/raw" ]; then
            cp -r raw-data-branch/raw/* main/data/raw/ 2>/dev/null || true
            echo "Restored raw data from raw-data branch"
          fi
          if [ -d "raw-data-branch/cache" ]; then
            cp -r raw-data-branch/cache/* main/data/cache/ 2>/dev/null || true
            echo "Restored cache from raw-data branch"
          fi

          # Copy existing processed data (for future incremental calculations)
          if [ -d "processed-data-branch/processed" ]; then
            cp -r processed-data-branch/processed/* main/data/processed/ 2>/dev/null || true
            echo "Restored existing processed data from processed-data branch"
          fi

          echo "Price files found:"
          # Test for the directory explicitly: in `ls ... | head || echo ...`
          # the pipeline status is head's, so the fallback message never ran.
          if [ -d "main/data/raw/prices" ]; then
            ls -la main/data/raw/prices/ | head -20
          else
            echo "No price files"
          fi

      - name: Set up Python 3.13
        uses: actions/setup-python@v5
        with:
          python-version: "3.13"

      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          version: latest
          virtualenvs-create: true
          virtualenvs-in-project: true

      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v4
        with:
          path: main/.venv
          key: venv-total2-${{ runner.os }}-3.13-${{ hashFiles('main/**/pyproject.toml') }}

      - name: Install dependencies
        working-directory: main
        run: poetry install --no-interaction

      - name: Calculate TOTAL2 index
        working-directory: main
        run: |
          poetry run python -m main calculate-total2 --index-type "$INDEX_TYPE"

      - name: Show data status
        working-directory: main
        run: poetry run python -m main status

      - name: Update processed-data branch
        env:
          # Token used to authenticate the push; kept out of the script text
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cd processed-data-branch

          # Configure git
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          src_dir="../main/data/processed"
          outputs="total2_index.parquet total2_daily_composition.parquet total2_max_weight_change.json"

          # Abort before touching the branch when none of the expected output
          # files exist: continuing would delete the previously published data
          # and force-push an empty data set.
          found=0
          for f in $outputs; do
            if [ -f "$src_dir/$f" ]; then
              found=$((found + 1))
            fi
          done
          if [ "$found" -eq 0 ]; then
            echo "❌ Error: no processed output files found in main/data/processed."
            exit 1
          fi

          # Remove old data files (but keep README and .git)
          find . -maxdepth 1 -type d ! -name '.' ! -name '.git' -exec rm -rf {} + 2>/dev/null || true

          # Copy processed data from main workspace (each file is optional)
          mkdir -p processed
          for f in $outputs; do
            if [ -f "$src_dir/$f" ]; then
              cp "$src_dir/$f" processed/
              echo "Copied $f"
            fi
          done

          # One timestamp shared by the README and the commit message
          timestamp="$(date -u '+%Y-%m-%d %H:%M:%S UTC')"

          # Update README with timestamp and index type
          echo "# Halvix Processed Data" > README.md
          echo "" >> README.md
          echo "This branch contains processed TOTAL2 index data." >> README.md
          echo "It is automatically updated by CI and has no history (orphan branch)." >> README.md
          echo "" >> README.md
          echo "**Last updated:** ${timestamp}" >> README.md
          echo "**Index type:** ${INDEX_TYPE}" >> README.md
          echo "" >> README.md
          echo "**Do not manually edit files in this branch.**" >> README.md
          echo "" >> README.md
          echo "## Contents" >> README.md
          echo "" >> README.md
          echo "- \`processed/total2_index.parquet\`: TOTAL2 index time series" >> README.md
          echo "- \`processed/total2_daily_composition.parquet\`: Daily coin composition" >> README.md
          echo "- \`processed/total2_max_weight_change.json\`: Statistics and metadata" >> README.md

          # Show what we're about to commit
          echo "=== Files to commit ==="
          ls -la
          ls -la processed/ 2>/dev/null || true
          echo ""

          # Stage all changes
          git add -A

          # Check if there are changes to commit
          # (README.md carries a fresh timestamp, so this is mostly a safety net)
          if git diff --staged --quiet; then
            echo "No changes to commit"
            exit 0
          fi

          # Show staged changes summary
          echo "=== Staged changes ==="
          git diff --staged --stat
          echo ""

          # Create a single commit (squash history)
          commit_message="Update TOTAL2 index (${INDEX_TYPE}) - ${timestamp}"
          if git rev-parse HEAD >/dev/null 2>&1; then
            echo "Amending existing commit to squash history..."
            git commit --amend -m "$commit_message"
          else
            echo "Creating initial commit..."
            git commit -m "$commit_message"
          fi

          # Point 'origin' at the authenticated URL. A freshly initialized
          # branch has no remote yet, so add it in that case; any failure here
          # stops the step instead of being swallowed.
          remote_url="https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git"
          if git remote get-url origin >/dev/null 2>&1; then
            git remote set-url origin "$remote_url"
          else
            git remote add origin "$remote_url"
          fi

          # Force push to keep orphan branch with single commit
          echo "Pushing to processed-data branch..."
          git push --force origin processed-data

          echo "=== Processed data branch updated successfully ==="
Lines changed: 48 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# Fetch cryptocurrency data and store in orphan 'data' branch
1+
# Fetch cryptocurrency price data and store in orphan 'raw-data' branch
22
# This workflow is triggered manually from the GitHub Actions UI
3-
name: Fetch Data
3+
name: Fetch Raw Data
44

55
on:
66
# Manual trigger only - this is a heavy workflow
@@ -15,54 +15,57 @@ jobs:
1515
fetch-data:
1616
runs-on: ubuntu-latest
1717
permissions:
18-
contents: write # Required to push to data branch
18+
contents: write # Required to push to raw-data branch
1919

2020
steps:
2121
- name: Checkout main branch
2222
uses: actions/checkout@v4
2323
with:
2424
path: main
2525

26-
- name: Checkout data branch
27-
id: checkout-data
26+
- name: Checkout raw-data branch
27+
id: checkout-raw-data
2828
uses: actions/checkout@v4
2929
with:
30-
ref: data
31-
path: data-branch
30+
ref: raw-data
31+
path: raw-data-branch
3232
continue-on-error: true # Branch may not exist yet
3333

34-
- name: Initialize data branch if missing
35-
if: steps.checkout-data.outcome == 'failure'
34+
- name: Initialize raw-data branch if missing
35+
if: steps.checkout-raw-data.outcome == 'failure'
3636
run: |
37-
mkdir -p data-branch
38-
cd data-branch
37+
mkdir -p raw-data-branch
38+
cd raw-data-branch
3939
git init
40-
git checkout --orphan data
41-
echo "# Halvix Data Cache" > README.md
40+
git checkout --orphan raw-data
41+
echo "# Halvix Raw Data" > README.md
4242
echo "" >> README.md
43-
echo "This branch contains cached price data (parquet files)." >> README.md
43+
echo "This branch contains raw price data (parquet files)." >> README.md
4444
echo "It is automatically updated by CI and has no history (orphan branch)." >> README.md
4545
echo "" >> README.md
4646
echo "**Do not manually edit files in this branch.**" >> README.md
4747
git add README.md
4848
git config user.name "github-actions[bot]"
4949
git config user.email "github-actions[bot]@users.noreply.github.com"
50-
git commit -m "Initialize data branch"
51-
echo "Initialized new orphan data branch"
50+
git commit -m "Initialize raw-data branch"
51+
echo "Initialized new orphan raw-data branch"
5252
5353
- name: Restore cached data to main workspace
5454
run: |
55-
# Copy existing data from data branch to main workspace
56-
if [ -d "data-branch/raw" ]; then
55+
# Copy existing data from raw-data branch to main workspace
56+
if [ -d "raw-data-branch/raw" ]; then
5757
mkdir -p main/data
58-
cp -r data-branch/raw main/data/ 2>/dev/null || true
59-
cp -r data-branch/processed main/data/ 2>/dev/null || true
60-
cp -r data-branch/cache main/data/ 2>/dev/null || true
61-
echo "Restored cached data from data branch"
58+
cp -r raw-data-branch/raw main/data/ 2>/dev/null || true
59+
cp -r raw-data-branch/cache main/data/ 2>/dev/null || true
60+
# Copy coins_to_download.json and download_skipped.csv
61+
mkdir -p main/data/processed
62+
cp raw-data-branch/processed/coins_to_download.json main/data/processed/ 2>/dev/null || true
63+
cp raw-data-branch/processed/download_skipped.csv main/data/processed/ 2>/dev/null || true
64+
echo "Restored cached data from raw-data branch"
6265
echo "Price files found:"
6366
ls -la main/data/raw/prices/ 2>/dev/null | head -20 || echo "No price files yet"
6467
else
65-
echo "No existing data in data branch (first run)"
68+
echo "No existing data in raw-data branch (first run)"
6669
mkdir -p main/data/raw/prices
6770
mkdir -p main/data/processed
6871
mkdir -p main/data/cache
@@ -108,9 +111,9 @@ jobs:
108111
working-directory: main
109112
run: poetry run python -m main status
110113

111-
- name: Update data branch with new data
114+
- name: Update raw-data branch with new data
112115
run: |
113-
cd data-branch
116+
cd raw-data-branch
114117
115118
# Configure git
116119
git config user.name "github-actions[bot]"
@@ -119,24 +122,28 @@ jobs:
119122
# Remove old data files (but keep README and .git)
120123
find . -maxdepth 1 -type d ! -name '.' ! -name '.git' -exec rm -rf {} + 2>/dev/null || true
121124
122-
# Copy new data from main workspace
125+
# Copy raw data from main workspace
123126
if [ -d "../main/data/raw" ]; then
124127
cp -r ../main/data/raw . 2>/dev/null || true
125128
echo "Copied raw data"
126129
fi
127-
if [ -d "../main/data/processed" ]; then
128-
cp -r ../main/data/processed . 2>/dev/null || true
129-
echo "Copied processed data"
130-
fi
131130
if [ -d "../main/data/cache" ]; then
132131
cp -r ../main/data/cache . 2>/dev/null || true
133132
echo "Copied cache data"
134133
fi
134+
# Copy only specific processed files (coins_to_download.json, download_skipped.csv)
135+
mkdir -p processed
136+
if [ -f "../main/data/processed/coins_to_download.json" ]; then
137+
cp ../main/data/processed/coins_to_download.json processed/ 2>/dev/null || true
138+
fi
139+
if [ -f "../main/data/processed/download_skipped.csv" ]; then
140+
cp ../main/data/processed/download_skipped.csv processed/ 2>/dev/null || true
141+
fi
135142
136143
# Update README with timestamp
137-
echo "# Halvix Data Cache" > README.md
144+
echo "# Halvix Raw Data" > README.md
138145
echo "" >> README.md
139-
echo "This branch contains cached price data (parquet files)." >> README.md
146+
echo "This branch contains raw price data (parquet files)." >> README.md
140147
echo "It is automatically updated by CI and has no history (orphan branch)." >> README.md
141148
echo "" >> README.md
142149
echo "**Last updated:** $(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> README.md
@@ -150,8 +157,11 @@ jobs:
150157
count=$(ls raw/prices/*.parquet 2>/dev/null | wc -l)
151158
echo "- \`raw/prices/\`: $count coin price files" >> README.md
152159
fi
160+
if [ -d "cache" ]; then
161+
echo "- \`cache/\`: Coin list cache" >> README.md
162+
fi
153163
if [ -d "processed" ]; then
154-
echo "- \`processed/\`: Processed data files (accepted coins, rejected coins, TOTAL2 index)" >> README.md
164+
echo "- \`processed/\`: Download metadata (coins_to_download.json, download_skipped.csv)" >> README.md
155165
fi
156166
157167
# Show what we're about to commit
@@ -174,20 +184,19 @@ jobs:
174184
echo ""
175185
176186
# Create a single commit (squash history by amending or resetting)
177-
# Using --amend to keep only one commit in the orphan branch
178187
if git rev-parse HEAD >/dev/null 2>&1; then
179188
echo "Amending existing commit to squash history..."
180-
git commit --amend -m "Update data cache - $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
189+
git commit --amend -m "Update raw data - $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
181190
else
182191
echo "Creating initial commit..."
183-
git commit -m "Update data cache - $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
192+
git commit -m "Update raw data - $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
184193
fi
185194
186195
# Force push to keep orphan branch with single commit
187196
git remote set-url origin "https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git" 2>/dev/null || \
188197
git remote add origin "https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git" 2>/dev/null || true
189198
190-
echo "Pushing to data branch..."
191-
git push --force origin data
199+
echo "Pushing to raw-data branch..."
200+
git push --force origin raw-data
192201
193-
echo "=== Data branch updated successfully ==="
202+
echo "=== Raw data branch updated successfully ==="

0 commit comments

Comments
 (0)