# Nightly Commit Mining #128
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review it, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
name: Nightly Commit Mining

on:
  schedule:
    # GitHub evaluates cron schedules in UTC.
    - cron: '0 2 * * *'  # Every night at 2 AM
  # Manual trigger. The input defaults below apply to manual runs only:
  # scheduled runs carry no inputs, so the mining step falls back to its own
  # batch size (200) and to provider auto-detection.
  workflow_dispatch:
    inputs:
      batch_size:
        description: 'Number of commits per run (default: 500)'
        default: '500'
      llm_provider:
        description: 'LLM provider (gemini, openai, deepseek, qwen, llama, mistral; auto-detected if not set)'
        default: ''
jobs:
  # Builds the mining CLI, runs it over a batch of commits, persists the
  # resulting state through an auto-merged pull request, files issues for
  # selected findings, and publishes the report directory to gh-pages.
  mine:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    # All runs share one group and are never cancelled, so two runs cannot
    # update the mining state at the same time.
    concurrency:
      group: mining-state
      cancel-in-progress: false
    # Naming any scope sets every unlisted scope to `none`, so each scope the
    # steps below rely on has to be listed explicitly.
    permissions:
      contents: write  # Required: commit state.json and deploy report to gh-pages
      pull-requests: write  # Required: create and auto-merge the state-update PR
      issues: write  # Required: list and create issues in the two issue steps
    steps:
      - uses: actions/checkout@v6

      - uses: actions/setup-java@v5
        with:
          java-version: '21'
          distribution: 'temurin'
          cache: maven

      - name: Build Mining CLI
        run: |
          # Parent POM only (-N), then the shared core module, then the CLI jar.
          mvn install -N -q
          mvn install -pl sandbox_common_core -DskipTests -q
          mvn package -pl sandbox_mining_core -DskipTests -q

      - name: Run Commit Mining
        # Best effort: a failed mining run must not prevent the diagnostics,
        # state and report steps below from running.
        continue-on-error: true
        env:
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          GEMINI_MODEL: gemini-2.5-flash
          GEMINI_DEBUG: 'true'
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OPENAI_MODEL: ${{ secrets.OPENAI_MODEL }}
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
          DEEPSEEK_MODEL: ${{ secrets.DEEPSEEK_MODEL }}
          DASHSCOPE_API_KEY: ${{ secrets.DASHSCOPE_API_KEY }}
          QWEN_MODEL: ${{ secrets.QWEN_MODEL }}
          LLAMA_API_KEY: ${{ secrets.LLAMA_API_KEY }}
          LLAMA_MODEL: ${{ secrets.LLAMA_MODEL }}
          MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
          MISTRAL_MODEL: ${{ secrets.MISTRAL_MODEL }}
          # Dispatch inputs reach the script as environment variables rather
          # than being expanded into the script text, so their content can
          # never be interpreted as shell syntax.
          INPUT_BATCH_SIZE: ${{ github.event.inputs.batch_size || '200' }}
          INPUT_LLM_PROVIDER: ${{ github.event.inputs.llm_provider }}
        run: |
          MINING_ARGS=(
            --config .github/refactoring-mining/repos.yml
            --state .github/refactoring-mining/state.json
            --sandbox-root .
            --batch-size "$INPUT_BATCH_SIZE"
            --commits-per-request 4
            --output docs/mining-report/
          )
          # Pass --llm-provider only when a provider was requested explicitly.
          if [ -n "$INPUT_LLM_PROVIDER" ]; then
            MINING_ARGS+=(--llm-provider "$INPUT_LLM_PROVIDER")
          fi
          java -jar sandbox_mining_core/target/sandbox-mining-core.jar "${MINING_ARGS[@]}"

      - name: Post-run diagnostics
        if: always()
        run: |
          # Summarize statistics.json in the job summary; "?" marks a value
          # that could not be read from the file.
          STATS_FILE="docs/mining-report/statistics.json"
          if [ -f "$STATS_FILE" ]; then
            API_CALLS=$(python3 -c "import json; d=json.load(open('$STATS_FILE')); print(d.get('runMetadata',{}).get('apiCallsMade',0))" 2>/dev/null || echo "?")
            DEFERRED=$(python3 -c "import json; d=json.load(open('$STATS_FILE')); print(d.get('runMetadata',{}).get('deferredCommits',0))" 2>/dev/null || echo "?")
            PROCESSED=$(python3 -c "import json; d=json.load(open('$STATS_FILE')); print(d.get('totalProcessed',0))" 2>/dev/null || echo "?")
            {
              echo "### ⛏️ Mining Run Diagnostics"
              echo ""
              echo "| Metric | Value |"
              echo "|--------|-------|"
              echo "| API Calls | $API_CALLS |"
              echo "| Commits Processed | $PROCESSED |"
              echo "| Deferred Commits | $DEFERRED |"
              if [ "$API_CALLS" = "0" ] && [ "$DEFERRED" != "0" ]; then
                echo ""
                echo "⚠️ **Warning:** No API calls were made but there are deferred commits."
                echo "Possible causes: API key expired/missing, all diffs filtered out, or API rate-limited."
              fi
            } >> "$GITHUB_STEP_SUMMARY"
          else
            {
              echo "### ⚠️ Mining Run Diagnostics"
              echo "statistics.json not found — mining may have failed before producing output."
            } >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: Commit State
        id: commit_state
        run: |
          # Stages the mining state (plus known rules and generated hint files),
          # commits it on the mining/state-update branch and force-pushes that
          # branch. Sets the step output state_updated to true or false.
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add .github/refactoring-mining/state.json
          # Include known-rules.json if it was created/updated
          if [ -f docs/mining-report/known-rules.json ]; then
            git add docs/mining-report/known-rules.json
          fi
          # Include auto-generated .sandbox-hint files from HintFileUpdater
          HINT_DIR="sandbox_common_core/src/main/resources/org/sandbox/jdt/triggerpattern/internal"
          if [ -d "$HINT_DIR" ]; then
            NEW_HINTS=$(git ls-files --others --exclude-standard "$HINT_DIR"/*.sandbox-hint 2>/dev/null || true)
            if [ -n "$NEW_HINTS" ]; then
              echo "Auto-generated hint files found:"
              echo "$NEW_HINTS"
              git add "$HINT_DIR"/*.sandbox-hint
            fi
          fi
          # Nothing staged means nothing to persist; skip the branch and PR.
          if git diff --cached --quiet; then
            echo "No state changes to commit."
            echo "state_updated=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          BRANCH="mining/state-update"
          # Carry the changes over to the update branch via the stash.
          git stash --include-untracked
          git checkout -b "$BRANCH"
          if ! git stash pop; then
            echo "Failed to apply stashed changes; aborting state update." >&2
            exit 1
          fi
          if [ ! -f .github/refactoring-mining/state.json ]; then
            echo "state.json not found after applying stash; aborting." >&2
            exit 1
          fi
          # Stage everything again on the new branch after the stash pop.
          git add .github/refactoring-mining/state.json
          if [ -f docs/mining-report/known-rules.json ]; then
            git add docs/mining-report/known-rules.json
          fi
          # Re-add hint files after stash pop
          if [ -d "$HINT_DIR" ]; then
            git add "$HINT_DIR"/*.sandbox-hint 2>/dev/null || true
          fi
          git commit -m "mining: Update state + known rules + hints $(date +%Y-%m-%d)"
          git push origin "$BRANCH" --force
          echo "state_updated=true" >> "$GITHUB_OUTPUT"

      - name: Create Issues for new GREEN findings
        if: always()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          EVALS_FILE="docs/mining-report/evaluations.json"
          KNOWN_FILE="docs/mining-report/known-rules.json"
          if [ ! -f "$EVALS_FILE" ]; then
            echo "No evaluations file found; skipping issue creation."
            exit 0
          fi
          # Print field $2 of the one-line JSON object given as $1.
          json_field() {
            printf '%s\n' "$1" | python3 -c "import json,sys; print(json.load(sys.stdin)['$2'])"
          }
          # Extract GREEN+VALID evaluations not yet in known-rules.json
          # (one JSON object per output line; parse errors show up in the log).
          python3 -c "
          import json, sys
          evals = json.load(open('$EVALS_FILE'))
          known_commits = set()
          if __import__('os').path.exists('$KNOWN_FILE'):
              known = json.load(open('$KNOWN_FILE'))
              known_commits = {r.get('sourceCommit','') for r in known.get('rules',[])}
          new_green = [e for e in evals
                       if e.get('trafficLight') == 'GREEN'
                       and e.get('dslValidationResult') == 'VALID'
                       and e.get('relevant', False)
                       and e.get('commitHash','') not in known_commits
                       and e.get('dslRule')]
          # Limit to 5 issues per run to avoid issue spam during catch-up phases
          for e in new_green[:5]:
              print(json.dumps({
                  'summary': e.get('summary','Unknown'),
                  'category': e.get('category','Unknown'),
                  'dslRule': e.get('dslRule',''),
                  'targetHintFile': e.get('targetHintFile',''),
                  'commitHash': e.get('commitHash','')
              }))
          " | while IFS= read -r line; do
            SUMMARY=$(json_field "$line" summary)
            CATEGORY=$(json_field "$line" category)
            DSL_RULE=$(json_field "$line" dslRule)
            TARGET=$(json_field "$line" targetHintFile)
            COMMIT=$(json_field "$line" commitHash)
            # Check if issue already exists
            EXISTING=$(gh issue list --search "🟢 $CATEGORY — $SUMMARY" --json number -q '.[0].number' 2>/dev/null || true)
            if [ -n "$EXISTING" ]; then
              echo "Issue already exists for: $SUMMARY (#$EXISTING)"
              continue
            fi
            # printf keeps the Markdown body independent of script indentation.
            ISSUE_BODY=$(
              printf '## DSL Rule\n```\n%s\n```\n' "$DSL_RULE"
              printf '## Target File\n%s\n' "$TARGET"
              printf '## Source\nCommit: %s\nMining Run: #%s\n' "$COMMIT" "$GITHUB_RUN_NUMBER"
            )
            # Best effort, but stderr stays visible so the failure reason is logged.
            gh issue create \
              --title "🟢 New DSL Rule: $CATEGORY — $SUMMARY" \
              --body "$ISSUE_BODY" \
              --label "dsl-rule,green,automated" || echo "Could not create issue for: $SUMMARY"
          done

      - name: Create Issues for DSL Enhancement needs
        if: always()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          KNOWN_FILE="docs/mining-report/known-rules.json"
          if [ ! -f "$KNOWN_FILE" ]; then
            echo "No known-rules.json found; skipping DSL enhancement issues."
            exit 0
          fi
          # Print field $2 of the one-line JSON object given as $1.
          json_field() {
            printf '%s\n' "$1" | python3 -c "import json,sys; print(json.load(sys.stdin)['$2'])"
          }
          # Extract rules with NEEDS_DSL_EXTENSION status, group by limitation
          # (one JSON object per output line, largest group first).
          python3 -c "
          import json, sys
          known = json.load(open('$KNOWN_FILE'))
          needs_ext = [r for r in known.get('rules', []) if r.get('status') == 'NEEDS_DSL_EXTENSION']
          if not needs_ext:
              sys.exit(0)
          # Group by simple heuristic on summary
          groups = {}
          for r in needs_ext:
              summary = (r.get('summary') or '').lower()
              if 'bitwise' in summary or ' | ' in (r.get('dslRule') or ''):
                  key = 'Bitwise operators'
              elif 'try-with' in summary or 'autocloseable' in summary:
                  key = 'Statement wrapping (try-with-resources)'
              elif 'generic' in summary or 'type parameter' in summary:
                  key = 'Generics / type-parameterized matching'
              elif 'arity' in summary or 'vararg' in summary:
                  key = 'Arity changes / varargs'
              else:
                  key = 'Other DSL limitations'
              groups.setdefault(key, []).append(r)
          for key, rules in sorted(groups.items(), key=lambda x: -len(x[1])):
              print(json.dumps({'limitation': key, 'count': len(rules)}))
          " | while IFS= read -r line; do
            LIMITATION=$(json_field "$line" limitation)
            COUNT=$(json_field "$line" count)
            # Check if issue already exists
            EXISTING=$(gh issue list --search "🔧 DSL Enhancement: $LIMITATION" --json number -q '.[0].number' 2>/dev/null || true)
            if [ -n "$EXISTING" ]; then
              echo "DSL enhancement issue already exists for: $LIMITATION (#$EXISTING)"
              continue
            fi
            # printf keeps the Markdown body independent of script indentation.
            ISSUE_BODY=$(
              printf '## DSL Limitation\n**%s**\n' "$LIMITATION"
              printf '## Impact\n**%s** discovered transformation rules cannot be implemented because of this DSL limitation.\n' "$COUNT"
              printf '## Action Needed\nExtend the TriggerPattern DSL to support this pattern category.\n'
              printf '_Auto-generated by mining workflow run #%s._\n' "$GITHUB_RUN_NUMBER"
            )
            # Best effort, but stderr stays visible so the failure reason is logged.
            gh issue create \
              --title "🔧 DSL Enhancement: $LIMITATION ($COUNT rules blocked)" \
              --body "$ISSUE_BODY" \
              --label "dsl-enhancement,automated" || echo "Could not create issue for: $LIMITATION"
          done

      - name: Auto-merge state update
        # Runs only when the Commit State step pushed a new commit.
        if: steps.commit_state.outputs.state_updated == 'true'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Close any existing PR from this branch first (keep the branch so we can reuse it)
          EXISTING_PR=$(gh pr list --head "mining/state-update" --json number -q '.[0].number' 2>/dev/null || true)
          if [ -n "$EXISTING_PR" ]; then
            gh pr close "$EXISTING_PR" 2>/dev/null || true
          fi
          # Create new PR and merge it
          gh pr create \
            --base main \
            --head "mining/state-update" \
            --title "mining: Update state $(date +%Y-%m-%d)" \
            --body "Automated state update from mining workflow run #${{ github.run_number }}. This PR is auto-merged to persist the mining progress so the next run continues where this one left off."
          # Try auto-merge first; if that command fails, merge directly.
          gh pr merge "mining/state-update" --squash --auto --delete-branch \
            || gh pr merge "mining/state-update" --squash --delete-branch

      - name: Deploy Mining Report to gh-pages
        uses: peaceiris/actions-gh-pages@v4
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: docs/mining-report
          publish_branch: gh-pages
          destination_dir: mining-report
          keep_files: true
          user_name: 'github-actions[bot]'
          user_email: 'github-actions[bot]@users.noreply.github.com'
          commit_message: 'mining: evaluation run ${{ github.run_number }}'

      - name: Report Deployed URL
        if: always()
        run: |
          {
            echo "### 🔗 Deployed Report"
            echo ""
            echo "📊 [View Mining Report](https://carstenartur.github.io/sandbox/mining-report/)"
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload Artifacts
        uses: actions/upload-artifact@v7
        if: always()
        with:
          name: mining-report-${{ github.run_number }}
          path: docs/mining-report/