-
Notifications
You must be signed in to change notification settings - Fork 14
159 lines (133 loc) · 5.09 KB
/
Copy pathopendata.yml
File metadata and controls
159 lines (133 loc) · 5.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Workflow: restore the latest Cofacts Elasticsearch snapshot from GCS into a
# throwaway Elasticsearch service container, generate the open-data CSV zips
# from it, then (in parallel) stamp the snapshot name into README.md and
# upload the zips to the Hugging Face dataset repository.
name: Generate Open Data

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  schedule:
    # Sundays at 16:24 (GitHub evaluates cron schedules in UTC).
    - cron: '24 16 * * 0'

jobs:
  generate-opendata:
    runs-on: ubuntu-latest
    outputs:
      # Name of the restored snapshot; consumed by both downstream jobs.
      snapshot: ${{ steps.find-snapshot.outputs.snapshot }}
    services:
      elasticsearch:
        # NOTE: this image tag is repeated in the "Configure GCS Keystore"
        # step's `docker ps` filter; keep the two in sync.
        image: elasticsearch:9.3.2
        ports:
          # host:container. Every step below talks to localhost:62223.
          - '62223:9200'
        env:
          discovery.type: single-node
          xpack.security.enabled: 'false'
        options: >-
          --health-cmd "curl http://localhost:9200/_cat/health"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 10
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '24'
          cache: 'npm'

      - name: Prepare directory and secrets
        env:
          GCS_CREDENTIALS: ${{ secrets.GCS_CREDENTIALS }}
        run: |
          mkdir -p esdata
          echo "$GCS_CREDENTIALS" > esdata/gcs.json
          # NOTE(review): presumably world-readable so the in-container
          # Elasticsearch user can read the file after `docker cp`; confirm.
          chmod 644 esdata/gcs.json

      - name: Configure GCS Keystore
        run: |
          # Find the service container by its image.
          CONTAINER_ID=$(docker ps --filter "ancestor=elasticsearch:9.3.2" --format "{{.ID}}")
          echo "Elasticsearch container ID: $CONTAINER_ID"
          if [ -z "$CONTAINER_ID" ]; then
            echo "Error: Elasticsearch container not found"
            exit 1
          fi
          # Copy the credentials file into the container.
          docker cp esdata/gcs.json "$CONTAINER_ID":/usr/share/elasticsearch/config/gcs.json
          # Add it to the keystore as the default GCS client credentials.
          docker exec "$CONTAINER_ID" bin/elasticsearch-keystore add-file gcs.client.default.credentials_file /usr/share/elasticsearch/config/gcs.json
          # Remove the plaintext copy inside the container.
          docker exec "$CONTAINER_ID" rm /usr/share/elasticsearch/config/gcs.json
          # Restart the container so Elasticsearch reloads the keystore.
          docker restart "$CONTAINER_ID"

      - name: Wait for Elasticsearch
        run: |
          # Poll every 5s, for at most 60s, until the restarted node answers.
          timeout 60 bash -c 'until curl -s http://localhost:62223/_cat/health > /dev/null; do sleep 5; done'

      - name: Register GCS Repository
        run: |
          # --fail-with-body: exit non-zero on an HTTP error status (and still
          # print the error body), so a failed registration stops the job here
          # instead of surfacing later as "no snapshot found".
          curl -sS --fail-with-body -X PUT "http://localhost:62223/_snapshot/cofacts" -H 'Content-Type: application/json' -d '{
            "type": "gcs",
            "settings": {
              "bucket": "cofacts-db-snapshots",
              "base_path": "v9",
              "readonly": true
            }
          }'

      - name: Find Latest Snapshot
        id: find-snapshot
        run: |
          SNAPSHOTS=$(curl -sS --fail-with-body "http://localhost:62223/_snapshot/cofacts/_all?verbose=true")
          # Sort by start_time_in_millis and take the last (most recent) one.
          LATEST_SNAPSHOT=$(echo "$SNAPSHOTS" | jq -r '.snapshots | sort_by(.start_time_in_millis) | last | .snapshot')
          echo "Latest snapshot: $LATEST_SNAPSHOT"
          echo "snapshot=$LATEST_SNAPSHOT" >> "$GITHUB_OUTPUT"

      - name: Restore Snapshot
        env:
          # Passed through the environment rather than interpolated into the
          # script, so the value is never parsed as shell syntax.
          SNAPSHOT_NAME: ${{ steps.find-snapshot.outputs.snapshot }}
        run: |
          # jq prints the literal string "null" when the list was empty.
          if [ -z "$SNAPSHOT_NAME" ] || [ "$SNAPSHOT_NAME" == "null" ]; then
            echo "Error: No snapshot found or failed to parse"
            exit 1
          fi
          echo "Restoring $SNAPSHOT_NAME..."
          # Delete all indices first. Best-effort: the response is not checked.
          # NOTE(review): recent Elasticsearch versions may reject `_all`
          # deletes unless action.destructive_requires_name is disabled;
          # confirm this call still has an effect.
          curl -s -X DELETE "http://localhost:62223/_all"
          # Restore everything except indices matching urls*, and block until
          # the restore completes. An HTTP error status fails the step so that
          # CSVs are never generated from a missing or partial restore.
          curl -sS --fail-with-body -X POST "http://localhost:62223/_snapshot/cofacts/$SNAPSHOT_NAME/_restore?wait_for_completion=true" -H 'Content-Type: application/json' -d '{
            "indices": "*,-urls*"
          }'

      - name: Install dependencies
        run: npm ci

      - name: Generate CSVs
        run: npm start

      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        with:
          name: opendata-csvs
          path: data/*.zip

  update-readme:
    needs: generate-opendata
    runs-on: ubuntu-latest
    permissions:
      # Required to push the README commit.
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Update README.md
        env:
          SNAPSHOT: ${{ needs.generate-opendata.outputs.snapshot }}
        run: |
          # Replace whatever sits between the SNAPSHOT_DATE marker comments.
          sed -i "s/<!-- SNAPSHOT_DATE -->.*<!-- \/SNAPSHOT_DATE -->/<!-- SNAPSHOT_DATE -->${SNAPSHOT}<!-- \/SNAPSHOT_DATE -->/g" README.md

      - name: Commit and push changes
        env:
          SNAPSHOT: ${{ needs.generate-opendata.outputs.snapshot }}
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # Commit only when the README actually changed.
          git diff --quiet || git commit -am "chore: update latest snapshot date to ${SNAPSHOT}"
          git push

  upload-dataset:
    needs: generate-opendata
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: opendata-csvs
          path: data

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: false

      - name: Upload to Hugging Face
        env:
          # Kept out of the script text; expanded by the shell at run time.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          SNAPSHOT: ${{ needs.generate-opendata.outputs.snapshot }}
        run: >-
          uvx --from huggingface_hub hf upload Cofacts/line-msg-fact-check-tw ./data .
          --repo-type dataset
          --commit-message "${SNAPSHOT} update"
          --token "$HF_TOKEN"