# Generate Open Data #17
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Restores the most recent snapshot from the "cofacts" GCS snapshot repository
# into a throwaway Elasticsearch service container, runs `npm start` to
# generate the data archives, then (in two follow-up jobs) writes the snapshot
# name into README.md and uploads the archives to Hugging Face.
name: Generate Open Data

on:
  workflow_dispatch:
  schedule:
    # Minute 24, hour 16, every Sunday (GitHub evaluates cron in UTC).
    - cron: '24 16 * * 0'

# Least-privilege default for GITHUB_TOKEN; jobs that need more declare it.
permissions:
  contents: read

jobs:
  # Restores the snapshot, generates the archives, and exposes the snapshot
  # name to downstream jobs through the `snapshot` output.
  generate-opendata:
    runs-on: ubuntu-latest
    outputs:
      snapshot: ${{ steps.find-snapshot.outputs.snapshot }}
    services:
      elasticsearch:
        image: elasticsearch:9.3.2
        ports:
          # Quoted so the digits-and-colon value is always read as a string.
          - '62223:9200'
        env:
          discovery.type: single-node
          xpack.security.enabled: 'false'
        options: >-
          --health-cmd "curl http://localhost:9200/_cat/health"
          --health-interval 10s
          --health-timeout 5s
          --health-retries 10
    steps:
      - uses: actions/checkout@v4
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '24'
          cache: 'npm'
      - name: Prepare directory and secrets
        env:
          GCS_CREDENTIALS: ${{ secrets.GCS_CREDENTIALS }}
        run: |
          mkdir -p esdata
          # printf writes the secret verbatim regardless of shell echo options.
          printf '%s\n' "$GCS_CREDENTIALS" > esdata/gcs.json
          # Readable by others so the user inside the container can read the
          # copy made by `docker cp` in the next step.
          chmod 644 esdata/gcs.json
      - name: Configure GCS Keystore
        env:
          # ID of the service container started for this job; avoids looking
          # it up by image tag, which would have to be kept in sync by hand.
          CONTAINER_ID: ${{ job.services.elasticsearch.id }}
        run: |
          if [ -z "$CONTAINER_ID" ]; then
            echo "Error: Elasticsearch container not found"
            exit 1
          fi
          echo "Elasticsearch container ID: $CONTAINER_ID"
          # Copy credentials file into container
          docker cp esdata/gcs.json "$CONTAINER_ID:/usr/share/elasticsearch/config/gcs.json"
          # The host copy is no longer needed; do not leave the secret on disk
          # for the remaining steps.
          rm -f esdata/gcs.json
          # Add to keystore
          docker exec "$CONTAINER_ID" bin/elasticsearch-keystore add-file gcs.client.default.credentials_file /usr/share/elasticsearch/config/gcs.json
          # Clean up credentials file inside container
          docker exec "$CONTAINER_ID" rm /usr/share/elasticsearch/config/gcs.json
          # Restart container to reload keystore
          docker restart "$CONTAINER_ID"
      - name: Wait for Elasticsearch
        run: |
          # -f makes an HTTP error response count as "not ready yet" instead of
          # ending the wait early.
          timeout 60 bash -c 'until curl -sf http://localhost:62223/_cat/health > /dev/null; do sleep 5; done'
      - name: Register GCS Repository
        run: |
          # --fail-with-body fails the step on an HTTP error while still
          # printing the Elasticsearch error response.
          curl -sS --fail-with-body -X PUT "http://localhost:62223/_snapshot/cofacts" -H 'Content-Type: application/json' -d '{
            "type": "gcs",
            "settings": {
              "bucket": "cofacts-db-snapshots",
              "base_path": "v9",
              "readonly": true
            }
          }'
      - name: Find Latest Snapshot
        id: find-snapshot
        run: |
          SNAPSHOTS=$(curl -fsS "http://localhost:62223/_snapshot/cofacts/_all?verbose=true")
          # Sort by start_time_in_millis and get the last one
          LATEST_SNAPSHOT=$(echo "$SNAPSHOTS" | jq -r '.snapshots | sort_by(.start_time_in_millis) | last | .snapshot')
          echo "Latest snapshot: $LATEST_SNAPSHOT"
          echo "snapshot=$LATEST_SNAPSHOT" >> "$GITHUB_OUTPUT"
      - name: Restore Snapshot
        env:
          # Passed through the environment rather than interpolated into the
          # script so the value is never parsed as shell code.
          SNAPSHOT_NAME: ${{ steps.find-snapshot.outputs.snapshot }}
        run: |
          # jq prints the literal string "null" when no snapshot was found.
          if [ -z "$SNAPSHOT_NAME" ] || [ "$SNAPSHOT_NAME" == "null" ]; then
            echo "Error: No snapshot found or failed to parse"
            exit 1
          fi
          echo "Restoring $SNAPSHOT_NAME..."
          # Delete all indices first. Best-effort: the HTTP status is not
          # checked, so a rejected request does not fail the step.
          # NOTE(review): recent Elasticsearch versions may reject wildcard
          # deletes unless action.destructive_requires_name is disabled --
          # confirm this call still has an effect.
          curl -s -X DELETE "http://localhost:62223/_all"
          # Restore with wait_for_completion=true
          curl -sS --fail-with-body -X POST "http://localhost:62223/_snapshot/cofacts/$SNAPSHOT_NAME/_restore?wait_for_completion=true" -H 'Content-Type: application/json' -d '{
            "indices": "*,-urls*"
          }'
      - name: Install dependencies
        run: npm ci
      - name: Generate CSVs
        run: npm start
      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        with:
          name: opendata-csvs
          path: data/*.zip

  # Writes the restored snapshot's name between the SNAPSHOT_DATE markers in
  # README.md and pushes the change if there is one.
  update-readme:
    needs: generate-opendata
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4
      - name: Update README.md
        env:
          SNAPSHOT: ${{ needs.generate-opendata.outputs.snapshot }}
        run: |
          sed -i "s/<!-- SNAPSHOT_DATE -->.*<!-- \/SNAPSHOT_DATE -->/<!-- SNAPSHOT_DATE -->${SNAPSHOT}<!-- \/SNAPSHOT_DATE -->/g" README.md
      - name: Commit and push changes
        env:
          SNAPSHOT: ${{ needs.generate-opendata.outputs.snapshot }}
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # Commit only when the sed above actually changed README.md.
          git diff --quiet || git commit -am "chore: update latest snapshot date to ${SNAPSHOT}"
          git push

  # Downloads the archives produced by generate-opendata and uploads them to
  # the Cofacts/line-msg-fact-check-tw dataset repository on Hugging Face.
  upload-dataset:
    needs: generate-opendata
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: opendata-csvs
          path: data
      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: false
      - name: Upload to Hugging Face
        env:
          # huggingface_hub reads the token from HF_TOKEN, which keeps the
          # secret off the command line.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          SNAPSHOT: ${{ needs.generate-opendata.outputs.snapshot }}
        run: >-
          uvx --from huggingface_hub hf upload Cofacts/line-msg-fact-check-tw ./data .
          --repo-type dataset
          --commit-message "$SNAPSHOT update"