-
Notifications
You must be signed in to change notification settings - Fork 1
148 lines (131 loc) · 5.12 KB
/
validate-datasets.yml
File metadata and controls
148 lines (131 loc) · 5.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
name: Validate Datasets

# Triggers:
#   * pull requests that touch dataset content, the validator script, or this
#     workflow file itself;
#   * pushes to master that touch dataset content or the validator script.
on:
  pull_request:
    paths:
      - 'datasets/**'
      - 'scripts/validate.py'
      - '.github/workflows/validate-datasets.yml'
  push:
    branches: [master]
    paths:
      - 'datasets/**'
      - 'scripts/validate.py'
jobs:
  # Single job: validates the dataset directories touched by the change and,
  # on pushes to master, every directory under datasets/. On pull requests it
  # also posts a pass/fail comment.
  validate:
    runs-on: ubuntu-latest
    # contents: read is enough for checkout; the write scopes are requested so
    # the comment steps at the end can post on the pull request.
    permissions:
      contents: read
      issues: write
      pull-requests: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Fetch all history for proper diff

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Writes the dataset directories touched by this change to
      # changed_datasets.txt (one "datasets/<name>" per line) and sets the
      # has_changes output to 'true' or 'false'.
      - name: Get changed datasets
        id: changed-datasets
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the branch name is never parsed as shell code.
          BASE_REF: ${{ github.base_ref }}
        run: |
          changed_files="$RUNNER_TEMP/changed_files.txt"
          candidates="$RUNNER_TEMP/candidate_datasets.txt"

          if [ "${{ github.event_name }}" = "pull_request" ]; then
            # For PRs, compare against the base branch. The checkout already
            # fetched full history, so no --depth limit here: the three-dot
            # diff needs the merge base to be available. A failing diff aborts
            # the step instead of being reported as "no changes".
            git fetch origin "$BASE_REF"
            git diff --name-only "origin/${BASE_REF}...HEAD" > "$changed_files"
          else
            # For pushes to master, check the last commit. Best-effort only:
            # the full validation step below covers every dataset on push.
            git diff --name-only HEAD~1 HEAD > "$changed_files" || : > "$changed_files"
          fi

          # Reduce changed paths to their "datasets/<name>" prefix. grep exits
          # non-zero when nothing matches, which is not an error here.
          { grep "^datasets/" "$changed_files" || true; } | cut -d'/' -f1-2 | sort -u > "$candidates"

          # Keep only entries that are directories in this checkout. Deleted
          # datasets and loose files directly under datasets/ are skipped,
          # matching the directory check used by the full validation step.
          : > changed_datasets.txt
          while IFS= read -r dataset_path; do
            if [ -d "$dataset_path" ]; then
              echo "$dataset_path" >> changed_datasets.txt
            else
              echo "Skipping $dataset_path (not a dataset directory in this checkout)"
            fi
          done < "$candidates"

          if [ -s changed_datasets.txt ]; then
            echo "Changed datasets:"
            cat changed_datasets.txt
            echo "has_changes=true" >> "$GITHUB_OUTPUT"
          else
            echo "No dataset changes detected"
            echo "has_changes=false" >> "$GITHUB_OUTPUT"
          fi

      # Runs the validator on every changed dataset; keeps going after a
      # failure so all problems are reported, then fails the step at the end.
      - name: Validate changed datasets
        if: steps.changed-datasets.outputs.has_changes == 'true'
        run: |
          validation_failed=false
          while IFS= read -r dataset_path; do
            echo "----------------------------------------"
            echo "Validating $dataset_path"
            echo "----------------------------------------"
            if python scripts/validate.py "$dataset_path"; then
              echo "✅ $dataset_path validation passed"
            else
              echo "❌ $dataset_path validation failed"
              validation_failed=true
            fi
            echo ""
          done < changed_datasets.txt
          if [ "$validation_failed" = true ]; then
            echo "❌ One or more datasets failed validation"
            exit 1
          else
            echo "✅ All datasets validated successfully"
          fi

      # On master, re-validate every directory under datasets/, not only the
      # ones touched by the push.
      - name: Validate all datasets (on push to master)
        if: github.event_name == 'push' && github.ref == 'refs/heads/master'
        run: |
          echo "Running full validation on master branch..."
          validation_failed=false
          for dataset_dir in datasets/*/; do
            # Guards against the unexpanded glob when datasets/ has no subdirectories.
            if [ -d "$dataset_dir" ]; then
              echo "----------------------------------------"
              echo "Validating $dataset_dir"
              echo "----------------------------------------"
              if python scripts/validate.py "$dataset_dir"; then
                echo "✅ $dataset_dir validation passed"
              else
                echo "❌ $dataset_dir validation failed"
                validation_failed=true
              fi
              echo ""
            fi
          done
          if [ "$validation_failed" = true ]; then
            echo "❌ One or more datasets failed validation"
            exit 1
          else
            echo "✅ All datasets validated successfully"
          fi

      # Posts a failure notice on the PR when any earlier step failed. The
      # API call is wrapped in try/catch so a rejected comment only logs a
      # message; the job has already failed at this point.
      - name: Comment on PR
        if: failure() && github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            try {
              await github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: '❌ Dataset validation failed. Please check the workflow logs for details and ensure:\n\n1. Your JSON is properly formatted (one entry per line)\n2. All entries conform to the schema\n3. Required files are present\n\nRun validation locally with:\n```bash\npython scripts/validate.py datasets/your-dataset-name\n```'
              })
            } catch (error) {
              console.log('Could not create comment:', error.message)
              console.log('This is likely due to permissions. The validation still failed - check the Actions tab for details.')
            }

      # Posts a success notice on the PR, but only when at least one dataset
      # was actually validated. A rejected comment is logged, not fatal.
      - name: Comment on PR success
        if: success() && github.event_name == 'pull_request' && steps.changed-datasets.outputs.has_changes == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            try {
              await github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: '✅ All dataset validations passed! Your changes are ready for review.'
              })
            } catch (error) {
              console.log('Could not create comment:', error.message)
              console.log('This is likely due to permissions. The validation passed successfully!')
            }