Skip to content

Nightly Commit Mining #127

Nightly Commit Mining

Nightly Commit Mining #127

Workflow file for this run

# Mines commits on a nightly schedule; can also be started by hand with
# an explicit batch size and LLM provider.
name: Nightly Commit Mining
on:
  schedule:
    - cron: '0 2 * * *'  # Every night at 02:00 (GitHub evaluates cron schedules in UTC)
  workflow_dispatch:
    inputs:
      batch_size:
        # Scheduled runs carry no inputs; the mining step then falls back to 200.
        description: 'Number of commits per run (default: 500 for manual runs; scheduled runs use 200)'
        default: '500'
      llm_provider:
        # An empty value means no --llm-provider flag is passed to the mining CLI.
        description: 'LLM provider (gemini, openai, deepseek, qwen, llama, mistral; auto-detected if not set)'
        default: ''
jobs:
  mine:
    runs-on: ubuntu-latest
    timeout-minutes: 60
    # Runs sharing this group never overlap, and a run that is already
    # in progress is not cancelled by a newer one.
    concurrency:
      group: mining-state
      cancel-in-progress: false
    # Any scope not listed here is set to "none" for GITHUB_TOKEN.
    permissions:
      contents: write  # Required: commit state.json and deploy report to gh-pages
      pull-requests: write  # Required: create and auto-merge the state-update PR
      issues: write  # Required: the issue-creation steps call gh issue list / gh issue create
    steps:
# Fetch the repository, then provide Temurin JDK 21 with the Maven cache enabled.
- uses: actions/checkout@v6
- uses: actions/setup-java@v5
  with:
    distribution: temurin
    java-version: "21"
    cache: maven
- name: Build Mining CLI
  # Three quiet Maven invocations: install the root project without its
  # modules, install sandbox_common_core, then package sandbox_mining_core.
  # Tests are skipped for the two module builds.
  run: |
    mvn --quiet --non-recursive install
    mvn --quiet --projects sandbox_common_core -DskipTests install
    mvn --quiet --projects sandbox_mining_core -DskipTests package
- name: Run Commit Mining
  # A failing mining run does not fail the job, so the following steps still execute.
  continue-on-error: true
  env:
    GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
    GEMINI_MODEL: gemini-2.5-flash
    GEMINI_DEBUG: 'true'
    OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    OPENAI_MODEL: ${{ secrets.OPENAI_MODEL }}
    DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
    DEEPSEEK_MODEL: ${{ secrets.DEEPSEEK_MODEL }}
    DASHSCOPE_API_KEY: ${{ secrets.DASHSCOPE_API_KEY }}
    QWEN_MODEL: ${{ secrets.QWEN_MODEL }}
    LLAMA_API_KEY: ${{ secrets.LLAMA_API_KEY }}
    LLAMA_MODEL: ${{ secrets.LLAMA_MODEL }}
    MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }}
    MISTRAL_MODEL: ${{ secrets.MISTRAL_MODEL }}
    # Dispatch inputs are handed over as environment variables instead of being
    # expanded into the script text, so their content is never parsed as shell code.
    # Scheduled runs carry no inputs: the batch size then falls back to 200 and
    # the provider stays empty.
    MINING_BATCH_SIZE: ${{ github.event.inputs.batch_size || '200' }}
    MINING_LLM_PROVIDER: ${{ github.event.inputs.llm_provider }}
  run: |
    # Only pass --llm-provider when a provider was explicitly requested.
    provider_args=()
    if [ -n "$MINING_LLM_PROVIDER" ]; then
      provider_args=(--llm-provider "$MINING_LLM_PROVIDER")
    fi
    java -jar sandbox_mining_core/target/sandbox-mining-core.jar \
      --config .github/refactoring-mining/repos.yml \
      --state .github/refactoring-mining/state.json \
      --sandbox-root . \
      --batch-size "$MINING_BATCH_SIZE" \
      --commits-per-request 4 \
      --output docs/mining-report/ \
      "${provider_args[@]}"
- name: Post-run diagnostics
  # Writes a short metrics table (or a "not found" note) to the step summary.
  if: always()
  run: |
    STATS_FILE="docs/mining-report/statistics.json"
    # Print the value of one Python expression over the parsed statistics
    # (bound to d); print ? when the file cannot be read or parsed.
    read_stat() {
      python3 -c "import json; d=json.load(open('$STATS_FILE')); print($1)" 2>/dev/null || echo "?"
    }
    if [ ! -f "$STATS_FILE" ]; then
      {
        echo "### ⚠️ Mining Run Diagnostics"
        echo "statistics.json not found — mining may have failed before producing output."
      } >> "$GITHUB_STEP_SUMMARY"
    else
      CALLS=$(read_stat "d.get('runMetadata',{}).get('apiCallsMade',0)")
      PENDING=$(read_stat "d.get('runMetadata',{}).get('deferredCommits',0)")
      DONE=$(read_stat "d.get('totalProcessed',0)")
      {
        echo "### ⛏️ Mining Run Diagnostics"
        echo ""
        echo "| Metric | Value |"
        echo "|--------|-------|"
        echo "| API Calls | $CALLS |"
        echo "| Commits Processed | $DONE |"
        echo "| Deferred Commits | $PENDING |"
        # Deferred work without a single API call points at a configuration problem.
        if [ "$CALLS" = "0" ] && [ "$PENDING" != "0" ]; then
          echo ""
          echo "⚠️ **Warning:** No API calls were made but there are deferred commits."
          echo "Possible causes: API key expired/missing, all diffs filtered out, or API rate-limited."
        fi
      } >> "$GITHUB_STEP_SUMMARY"
    fi
- name: Commit State
  # Stages the mining state, known rules and hint files, commits them on the
  # mining/state-update branch and force-pushes it. Sets the step output
  # state_updated to true or false for the auto-merge step.
  id: commit_state
  run: |
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    STATE_FILE=".github/refactoring-mining/state.json"
    RULES_FILE="docs/mining-report/known-rules.json"
    HINT_DIR="sandbox_common_core/src/main/resources/org/sandbox/jdt/triggerpattern/internal"
    BRANCH="mining/state-update"
    git add "$STATE_FILE"
    # Include known-rules.json if it was created/updated
    if [ -f "$RULES_FILE" ]; then
      git add "$RULES_FILE"
    fi
    # Include auto-generated .sandbox-hint files from HintFileUpdater
    if [ -d "$HINT_DIR" ]; then
      NEW_HINTS=$(git ls-files --others --exclude-standard -- "$HINT_DIR"/*.sandbox-hint 2>/dev/null || true)
      if [ -n "$NEW_HINTS" ]; then
        echo "Auto-generated hint files found:"
        echo "$NEW_HINTS"
      fi
      # Stage new and modified hint files alike, before the "nothing staged"
      # check below; a directory without any hint file is tolerated.
      git add -- "$HINT_DIR"/*.sandbox-hint 2>/dev/null || true
    fi
    if git diff --cached --quiet; then
      echo "No state changes to commit."
      echo "state_updated=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    # Creating the branch at the current commit leaves the index and working
    # tree untouched, so the staged files carry over without a stash round trip.
    # -B also resets the branch if it already exists locally.
    git checkout -B "$BRANCH"
    git commit -m "mining: Update state + known rules + hints $(date +%Y-%m-%d)"
    # The branch is rebuilt from the checked-out commit on every run, so the
    # remote copy is overwritten.
    git push origin "$BRANCH" --force
    echo "state_updated=true" >> "$GITHUB_OUTPUT"
- name: Create Issues for new GREEN findings
  # Opens one issue per new GREEN + VALID evaluation (at most five per run),
  # skipping findings for which the issue search already returns a match.
  if: always()
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    EVALS_FILE="docs/mining-report/evaluations.json"
    KNOWN_FILE="docs/mining-report/known-rules.json"
    if [ ! -f "$EVALS_FILE" ]; then
      echo "No evaluations file found; skipping issue creation."
      exit 0
    fi
    # Print one field of the JSON object currently held in $line.
    json_field() {
      echo "$line" | python3 -c "import json,sys; print(json.load(sys.stdin)['$1'])"
    }
    # Extract GREEN+VALID evaluations not yet in known-rules.json;
    # the Python snippet emits one JSON object per line.
    python3 -c "
    import json, os
    with open('$EVALS_FILE') as fh:
        evaluations = json.load(fh)
    # Commits that already produced a known rule are skipped
    seen_commits = set()
    if os.path.exists('$KNOWN_FILE'):
        with open('$KNOWN_FILE') as fh:
            known_rules = json.load(fh)
        seen_commits = {rule.get('sourceCommit', '') for rule in known_rules.get('rules', [])}
    def is_new_green(ev):
        return (ev.get('trafficLight') == 'GREEN'
                and ev.get('dslValidationResult') == 'VALID'
                and ev.get('relevant', False)
                and ev.get('commitHash', '') not in seen_commits
                and ev.get('dslRule'))
    candidates = [ev for ev in evaluations if is_new_green(ev)]
    # Limit to 5 issues per run to avoid issue spam during catch-up phases
    for ev in candidates[:5]:
        record = {
            'summary': ev.get('summary', 'Unknown'),
            'category': ev.get('category', 'Unknown'),
            'dslRule': ev.get('dslRule', ''),
            'targetHintFile': ev.get('targetHintFile', ''),
            'commitHash': ev.get('commitHash', ''),
        }
        print(json.dumps(record))
    " 2>/dev/null | while IFS= read -r line; do
      SUMMARY=$(json_field summary)
      CATEGORY=$(json_field category)
      DSL_RULE=$(json_field dslRule)
      TARGET=$(json_field targetHintFile)
      COMMIT=$(json_field commitHash)
      # Check if issue already exists
      EXISTING=$(gh issue list --search "🟢 $CATEGORY — $SUMMARY" --json number -q '.[0].number' 2>/dev/null || true)
      if [ -z "$EXISTING" ]; then
        ISSUE_BODY="## DSL Rule
    \`\`\`
    $DSL_RULE
    \`\`\`
    ## Target File
    $TARGET
    ## Source
    Commit: $COMMIT
    Mining Run: #${{ github.run_number }}"
        gh issue create \
          --title "🟢 New DSL Rule: $CATEGORY — $SUMMARY" \
          --body "$ISSUE_BODY" \
          --label "dsl-rule,green,automated" 2>/dev/null || echo "Could not create issue for: $SUMMARY"
      else
        echo "Issue already exists for: $SUMMARY (#$EXISTING)"
      fi
    done
- name: Create Issues for DSL Enhancement needs
  # Groups rules marked NEEDS_DSL_EXTENSION by a keyword heuristic and opens
  # one issue per group unless the issue search already returns a match.
  if: always()
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    KNOWN_FILE="docs/mining-report/known-rules.json"
    if [ ! -f "$KNOWN_FILE" ]; then
      echo "No known-rules.json found; skipping DSL enhancement issues."
      exit 0
    fi
    # Print one field of the JSON object currently held in $line.
    json_field() {
      echo "$line" | python3 -c "import json,sys; print(json.load(sys.stdin)['$1'])"
    }
    # Extract rules with NEEDS_DSL_EXTENSION status, group by limitation;
    # the Python snippet emits one JSON object per group, largest group first.
    python3 -c "
    import json
    with open('$KNOWN_FILE') as fh:
        known = json.load(fh)
    # Simple keyword heuristic on the rule summary (and the rule text for bitwise OR)
    def limitation_of(rule):
        summary = (rule.get('summary') or '').lower()
        dsl_text = rule.get('dslRule') or ''
        if 'bitwise' in summary or ' | ' in dsl_text:
            return 'Bitwise operators'
        if 'try-with' in summary or 'autocloseable' in summary:
            return 'Statement wrapping (try-with-resources)'
        if 'generic' in summary or 'type parameter' in summary:
            return 'Generics / type-parameterized matching'
        if 'arity' in summary or 'vararg' in summary:
            return 'Arity changes / varargs'
        return 'Other DSL limitations'
    blocked = {}
    for rule in known.get('rules', []):
        if rule.get('status') == 'NEEDS_DSL_EXTENSION':
            blocked.setdefault(limitation_of(rule), []).append(rule)
    for name, members in sorted(blocked.items(), key=lambda item: -len(item[1])):
        print(json.dumps({'limitation': name, 'count': len(members)}))
    " 2>/dev/null | while IFS= read -r line; do
      LIMITATION=$(json_field limitation)
      COUNT=$(json_field count)
      # Check if issue already exists
      EXISTING=$(gh issue list --search "🔧 DSL Enhancement: $LIMITATION" --json number -q '.[0].number' 2>/dev/null || true)
      if [ -z "$EXISTING" ]; then
        ISSUE_BODY="## DSL Limitation
    **$LIMITATION**
    ## Impact
    **$COUNT** discovered transformation rules cannot be implemented because of this DSL limitation.
    ## Action Needed
    Extend the TriggerPattern DSL to support this pattern category.
    _Auto-generated by mining workflow run #${{ github.run_number }}._"
        gh issue create \
          --title "🔧 DSL Enhancement: $LIMITATION ($COUNT rules blocked)" \
          --body "$ISSUE_BODY" \
          --label "dsl-enhancement,automated" 2>/dev/null || echo "Could not create issue for: $LIMITATION"
      else
        echo "DSL enhancement issue already exists for: $LIMITATION (#$EXISTING)"
      fi
    done
- name: Auto-merge state update
  # Runs only when the Commit State step reported a pushed state update.
  if: steps.commit_state.outputs.state_updated == 'true'
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    STATE_BRANCH="mining/state-update"
    # Close any existing PR from this branch first (keep the branch so we can reuse it)
    STALE_PR=$(gh pr list --head "$STATE_BRANCH" --json number -q '.[0].number' 2>/dev/null || true)
    if [ -n "$STALE_PR" ]; then
      gh pr close "$STALE_PR" 2>/dev/null || true
    fi
    # Create new PR and merge it
    PR_TITLE="mining: Update state $(date +%Y-%m-%d)"
    PR_BODY="Automated state update from mining workflow run #${{ github.run_number }}. This PR is auto-merged to persist the mining progress so the next run continues where this one left off."
    gh pr create --base main --head "$STATE_BRANCH" --title "$PR_TITLE" --body "$PR_BODY"
    # Try auto-merge first; when that command fails, squash-merge right away.
    if ! gh pr merge "$STATE_BRANCH" --squash --auto --delete-branch; then
      gh pr merge "$STATE_BRANCH" --squash --delete-branch
    fi
- name: Deploy Mining Report to gh-pages
  # Publishes docs/mining-report into the mining-report/ directory of the
  # gh-pages branch, committing as the github-actions bot.
  uses: peaceiris/actions-gh-pages@v4
  with:
    github_token: ${{ secrets.GITHUB_TOKEN }}
    publish_branch: gh-pages
    publish_dir: docs/mining-report
    destination_dir: mining-report
    keep_files: true
    user_name: "github-actions[bot]"
    user_email: "github-actions[bot]@users.noreply.github.com"
    commit_message: "mining: evaluation run ${{ github.run_number }}"
- name: Report Deployed URL
  # Appends a link to the published report to the step summary, whatever
  # the outcome of the earlier steps.
  if: always()
  run: |
    {
      echo "### 🔗 Deployed Report"
      echo ""
      echo "📊 [View Mining Report](https://carstenartur.github.io/sandbox/mining-report/)"
    } >> "$GITHUB_STEP_SUMMARY"
- name: Upload Artifacts
  # Keeps the generated report directory as a run artifact, even after failures.
  if: always()
  uses: actions/upload-artifact@v7
  with:
    path: docs/mining-report/
    name: mining-report-${{ github.run_number }}