-
Notifications
You must be signed in to change notification settings - Fork 468
164 lines (143 loc) · 5.57 KB
/
detect-new-data.yml
File metadata and controls
164 lines (143 loc) · 5.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Detects newly published data items (feeds, smart data, and streams).
# Triggers: a weekly schedule (Sundays, 21:00 UTC) or a manual dispatch.
name: Detect New Data (Feeds/SmartData/Streams)

on:
  schedule:
    - cron: "0 21 * * 0"  # every Sunday at 21:00 UTC
  workflow_dispatch:  # manual triggering

# At most one run per workflow + ref: starting a new run cancels any run of
# the same group that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Permissions requested for the workflow.
permissions:
  contents: write  # commit changes
  pull-requests: write  # open the pull request
  issues: write  # create issues

# Environment variables shared across jobs.
env:
  NODE_VERSION: "20"
jobs:
  # First job: sets up Node.js and installs dependencies so that node_modules
  # is stored in a cache entry keyed on the package-lock.json hash.
  setup:
    name: Setup Dependencies
    runs-on: ubuntu-latest
    # Least privilege: this job only checks out code and populates a cache,
    # so it overrides the workflow-level write scopes with read-only access.
    permissions:
      contents: read
    steps:
      # Step 1: Check out the repository code.
      # Default (shallow) checkout: no step in this job performs git
      # operations that need history.
      - name: Checkout Repo
        uses: actions/checkout@v6

      # Step 2: Set up Node.js environment (also enables the npm cache)
      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: 'npm'
          cache-dependency-path: '**/package-lock.json'

      # Step 3: Cache node_modules for faster future runs
      - name: Cache node_modules
        uses: actions/cache@v4
        id: cache-node-modules
        with:
          path: |
            node_modules
          key: ${{ runner.os }}-node-modules-${{ hashFiles('**/package-lock.json') }}

      # Step 4: Install dependencies only when the exact cache key was not hit
      - name: Install Dependencies
        if: steps.cache-node-modules.outputs.cache-hit != 'true'
        run: |
          echo "Cache miss, installing dependencies..."
          npm ci --prefer-offline --no-audit
# --- Second job: detect new data items and validate URLs ---
  # Runs after `setup`. Opens an issue when URL validation reports failures
  # and a pull request when new data items are found.
  detect-new-data:
    needs: setup
    name: Check for New Data items
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out repository code
      - name: Checkout Repo
        uses: actions/checkout@v6
        with:
          fetch-depth: 0  # Fetch all history for git operations

      # Step 2: Set up Node.js environment
      - name: Setup Node.js
        uses: actions/setup-node@v6
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: 'npm'
          cache-dependency-path: '**/package-lock.json'

      # Step 3: Restore cached node_modules from the setup job.
      # The restore-keys prefix allows a fallback to an older cache entry
      # whose lockfile hash differs from the current one.
      - name: Restore node_modules
        id: restore-node-modules
        uses: actions/cache@v4
        with:
          path: |
            node_modules
          key: ${{ runner.os }}-node-modules-${{ hashFiles('**/package-lock.json') }}
          restore-keys: |
            ${{ runner.os }}-node-modules-

      # Step 4: Install dependencies unless the exact cache key was restored.
      # cache-hit is 'true' only for an exact key match, so this also runs
      # after a restore-keys fallback (possibly stale node_modules) and after
      # a complete miss.
      # NOTE(review): assumes the scripts below need node_modules - confirm.
      - name: Install Dependencies
        if: steps.restore-node-modules.outputs.cache-hit != 'true'
        run: |
          echo "Cache miss, installing dependencies..."
          npm ci --prefer-offline --no-audit

      # Step 5: Make the detection script executable
      - name: Ensure detect-data.sh is executable
        run: chmod +x .github/scripts/data/detect-data.sh

      # Step 6: Make the validate-urls.sh script executable
      - name: Ensure validate-urls.sh is executable
        run: chmod +x .github/scripts/data/validate-urls.sh

      # Step 7: Run the data detection script to check for new data items.
      # continue-on-error keeps the job going if this step fails; in that
      # case new_data_found may be left unset and the PR step is skipped.
      - name: Run data detection
        id: detect_data
        run: |
          .github/scripts/data/detect-data.sh check-data
          # If we see temp/NEW_DATA_FOUND.json afterwards, we know new items were found:
          if [ -f temp/NEW_DATA_FOUND.json ]; then
            echo "new_data_found=true" >> "$GITHUB_OUTPUT"
          else
            echo "new_data_found=false" >> "$GITHUB_OUTPUT"
          fi
        continue-on-error: true

      # Step 8: Validate all URLs in the data.
      # NOTE(review): the issue step below reads the url_validation_failed
      # output and only runs if this step succeeds - confirm the script
      # exits 0 and reports failures through that output.
      - name: Validate URLs
        id: validate_urls
        run: .github/scripts/data/validate-urls.sh

      # Step 9: Clean up temporary files
      - name: Cleanup NEW_DATA_FOUND
        run: rm -f temp/NEW_DATA_FOUND.json

      # Step 10: Create an issue if any invalid URLs are detected
      - name: Create Issue for Broken URLs
        if: steps.validate_urls.outputs.url_validation_failed == 'true'
        uses: peter-evans/create-issue-from-file@fca9117c27cdc29c6c4db3b86c48e4115a786710 # v6.0.0
        with:
          title: "GHA-data-validate-urls: Invalid URLs Detected"
          content-filepath: "temp/url_validation_report.md"
          labels: |
            bug
            gh-action-data
          assignees: khadni

      # Step 11: Create a pull request if new data items were found.
      # Only the paths listed under add-paths are committed.
      - name: Commit & Create Pull Request
        if: steps.detect_data.outputs.new_data_found == 'true'
        uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "Data: Update changelog for new data items"
          branch: "data-update-${{ github.run_id }}"
          title: "Changelog Update: New data found"
          body: "This PR was automatically created."
          add-paths: |
            .github/scripts/data/baseline.json
            public/changelog.json
          labels: |
            automation
          assignees: khadni

      # Step 12: Upload artifacts for debugging and record-keeping.
      # always() makes this run even when an earlier step failed.
      - name: Capture output artifacts
        if: always()
        uses: actions/upload-artifact@v5
        with:
          name: data-detection-${{ github.run_id }}
          path: |
            temp/*.json
            temp/*.md
            public/changelog.json
          retention-days: 14
          if-no-files-found: ignore