Skip to content

Commit dcbf15e

Browse files
authored
ci: persist staged mining candidates (#1171)
1 parent d785e19 commit dcbf15e

11 files changed

Lines changed: 1075 additions & 421 deletions

File tree

.github/workflows/mining-core.yml

Lines changed: 87 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ jobs:
3838
mvn install -pl sandbox_common_core -DskipTests -q
3939
mvn package -pl sandbox_mining_core -DskipTests -q
4040
41+
- name: Snapshot known rules before mining
42+
run: |
43+
if [ -f docs/mining-report/known-rules.json ]; then
44+
cp docs/mining-report/known-rules.json "$RUNNER_TEMP/known-rules-before.json"
45+
else
46+
printf '{"version":1,"rules":[]}' > "$RUNNER_TEMP/known-rules-before.json"
47+
fi
48+
4149
- name: Run Commit Mining
4250
continue-on-error: true
4351
env:
@@ -64,6 +72,71 @@ jobs:
6472
--output docs/mining-report/ \
6573
${{ github.event.inputs.llm_provider != '' && format('--llm-provider {0}', github.event.inputs.llm_provider) || '' }}
6674
75+
- name: Reconcile staged mining candidates
76+
if: always()
77+
run: |
78+
BEFORE_KNOWN="$RUNNER_TEMP/known-rules-before.json"
79+
CURRENT_KNOWN="docs/mining-report/known-rules.json"
80+
CANDIDATE_DIR="mining-candidates"
81+
if [ ! -f "$CURRENT_KNOWN" ]; then
82+
echo "No known-rules.json found after mining; skipping candidate reconciliation."
83+
exit 0
84+
fi
85+
python3 - "$BEFORE_KNOWN" "$CURRENT_KNOWN" "$CANDIDATE_DIR" <<'PY'
86+
import json
import pathlib
import sys


def load_rules(path):
    """Return the parsed known-rules document stored at *path*.

    A missing file yields an empty version-1 document, so a run without a
    prior snapshot is handled like a run that started with no rules.
    """
    if not path.exists():
        return {"version": 1, "rules": []}
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def collect_candidate_commits(candidate_dir):
    """Return the ``sourceCommit`` values of all staged candidate files.

    Only ``*-candidate.json`` files directly inside *candidate_dir* are read.
    A candidate that cannot be read or parsed is reported on stderr and
    skipped; it never aborts the reconciliation.
    """
    commits = set()
    if not candidate_dir.exists():
        return commits
    for candidate_path in candidate_dir.glob("*-candidate.json"):
        try:
            with candidate_path.open(encoding="utf-8") as handle:
                candidate = json.load(handle)
            commit = candidate.get("sourceCommit")
        except Exception as exc:
            # Deliberately best-effort: one broken candidate file must not
            # fail this always-run step.
            print(f"Warning: could not read candidate {candidate_path}: {exc}", file=sys.stderr)
            continue
        if commit:
            commits.add(commit)
    return commits


def reconcile_rules(current_rules, before_rules, candidate_commits):
    """Return the rules from *current_rules* that may stay in known-rules.json.

    A rule is kept when
    * its ``sourceCommit`` was already known before mining, or
    * its ``sourceCommit`` belongs to a staged candidate, or
    * it has no ``sourceCommit`` but is identical to a rule that was already
      in the pre-mining snapshot.

    The last case protects pre-existing rules that cannot be attributed to a
    commit; they are not "new" and must not be dropped by reconciliation.
    The relative order of the kept rules is preserved.
    """
    before_commits = {
        rule.get("sourceCommit")
        for rule in before_rules
        if rule.get("sourceCommit")
    }
    unattributed_before = [
        rule for rule in before_rules if not rule.get("sourceCommit")
    ]

    kept = []
    for rule in current_rules:
        commit = rule.get("sourceCommit")
        if commit:
            if commit in before_commits or commit in candidate_commits:
                kept.append(rule)
        elif rule in unattributed_before:
            kept.append(rule)
    return kept


def main(argv):
    """Reconcile known-rules.json against the snapshot and staged candidates.

    *argv* follows ``sys.argv``: ``argv[1]`` is the pre-mining snapshot,
    ``argv[2]`` the current known-rules file (rewritten in place only when
    rules were removed) and ``argv[3]`` the candidate directory.
    """
    before_path = pathlib.Path(argv[1])
    known_path = pathlib.Path(argv[2])
    candidate_dir = pathlib.Path(argv[3])

    before = load_rules(before_path)
    current = load_rules(known_path)
    candidate_commits = collect_candidate_commits(candidate_dir)

    rules = current.get("rules", [])
    kept = reconcile_rules(rules, before.get("rules", []), candidate_commits)
    removed = len(rules) - len(kept)
    if removed:
        current["rules"] = kept
        with known_path.open("w", encoding="utf-8") as handle:
            json.dump(current, handle, indent=2)
            handle.write("\n")
    print(
        "Candidate reconciliation: "
        f"kept {len(kept)} known rules, "
        f"removed {removed} non-staged new rules, "
        f"candidate commits {len(candidate_commits)}"
    )


# The workflow feeds this script to "python3 -", which executes it as __main__.
if __name__ == "__main__":
    main(sys.argv)
138+
PY
139+
67140
- name: Post-run diagnostics
68141
if: always()
69142
run: |
@@ -99,6 +172,11 @@ jobs:
99172
if [ -f docs/mining-report/known-rules.json ]; then
100173
git add docs/mining-report/known-rules.json
101174
fi
175+
# Include staged mining candidates if they were created/updated
176+
CANDIDATE_DIR="mining-candidates"
177+
if [ -d "$CANDIDATE_DIR" ]; then
178+
git add "$CANDIDATE_DIR"/*.json 2>/dev/null || true
179+
fi
102180
# Include auto-generated .sandbox-hint files from HintFileUpdater
103181
HINT_DIR="sandbox_common_core/src/main/resources/org/sandbox/jdt/triggerpattern/internal"
104182
if [ -d "$HINT_DIR" ]; then
@@ -129,11 +207,15 @@ jobs:
129207
if [ -f docs/mining-report/known-rules.json ]; then
130208
git add docs/mining-report/known-rules.json
131209
fi
210+
# Re-add staged mining candidates after stash pop
211+
if [ -d "$CANDIDATE_DIR" ]; then
212+
git add "$CANDIDATE_DIR"/*.json 2>/dev/null || true
213+
fi
132214
# Re-add hint files after stash pop
133215
if [ -d "$HINT_DIR" ]; then
134216
git add $HINT_DIR/*.sandbox-hint 2>/dev/null || true
135217
fi
136-
git commit -m "mining: Update state + known rules + hints $(date +%Y-%m-%d)"
218+
git commit -m "mining: Update state + known rules + candidates $(date +%Y-%m-%d)"
137219
git push origin "$BRANCH" --force
138220
echo "state_updated=true" >> "$GITHUB_OUTPUT"
139221
@@ -298,4 +380,7 @@ jobs:
298380
if: always()
299381
with:
300382
name: mining-report-${{ github.run_number }}
301-
path: docs/mining-report/
383+
path: |
384+
docs/mining-report/
385+
mining-candidates/
386+
if-no-files-found: ignore
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# Mining candidate integration tests
2+
3+
This note outlines a safe way to add optional integration tests for the staged mining pipeline.
4+
5+
## Goal
6+
7+
The normal test suite should remain deterministic and offline. Gemini-backed checks should be opt-in and only run when explicitly requested with credentials.
8+
9+
A useful integration test should verify the complete path for a small, known commit:
10+
11+
1. check out or clone the configured source repository,
12+
2. extract the diff for a pinned commit,
13+
3. build the mining prompt,
14+
4. call the configured LLM provider,
15+
5. validate the returned DSL rule,
16+
6. stage a `MiningCandidate`,
17+
7. assert that `known-rules.json` is only updated for the persisted candidate.
18+
19+
## FQN-aware DSL style
20+
21+
Candidate rules should normally express concrete APIs with fully qualified names. The matcher should accept target code that uses imports or simple names when imports or bindings prove that the code refers to the same API.
22+
23+
For example, a rule written for `java.util.Collections.emptyList()` should also match source code that imports `java.util.Collections` and calls `Collections.emptyList()`.
24+
25+
The same principle should apply to constructors, type references, method receivers, argument types, return types and overload-sensitive API migrations wherever the equivalence is knowable. This keeps mined rules readable and avoids duplicating rules for different source-code presentation forms.
26+
27+
## Suggested execution model
28+
29+
Use a JUnit 5 tag such as `@Tag("gemini-integration")` and gate the test with environment variables:
30+
31+
- `GEMINI_API_KEY`
32+
- `RUN_GEMINI_INTEGRATION=true`
33+
- optionally `GEMINI_MODEL`, defaulting to the model used by the workflow
34+
35+
The Maven invocation should be explicit, for example:
36+
37+
```bash
38+
RUN_GEMINI_INTEGRATION=true \
39+
GEMINI_API_KEY=... \
40+
mvn -pl sandbox_mining_core \
41+
-Dgroups=gemini-integration \
42+
-DskipTests=false \
43+
test
44+
```
45+
46+
The CI default must not run this tag.
47+
48+
## Recommended fixtures
49+
50+
Use a small set of pinned commits that are known to contain generalizable rules and keep each fixture narrow. Good candidates are rules already represented in `docs/mining-report/known-rules.json`, for example:
51+
52+
- Vector constructor modernization to ArrayList, guarded by local assignment context.
53+
- `String.replaceAll("\\n", replacement)` to `String.replace("\\n", replacement)` for literal non-regex replacement.
54+
- Deprecated wrapper constructors such as `new Float(x)` or `new Double(x)` to `valueOf(x)`.
55+
56+
Each fixture should define expected properties rather than exact prose:
57+
58+
- expected traffic light: `GREEN`
59+
- expected `dslValidationResult`: `VALID`
60+
- expected target hint file or category
61+
- required DSL fragments, not the entire exact LLM response
62+
- before/after/negative examples when available
63+
64+
## Reproducibility notes
65+
66+
LLM responses are probabilistic, so the assertions should not depend on exact wording. The useful measurement is whether the model still finds a valid, general rule under the current prompt and budget.
67+
68+
For cognitive-load/regression tracking, store a small JSON summary as a build artifact:
69+
70+
- model name
71+
- prompt size
72+
- diff size
73+
- latency
74+
- token usage if available
75+
- traffic light
76+
- DSL validation result
77+
- candidate ID
78+
79+
This makes prompt regressions visible without committing live LLM output into the repository.

sandbox_common_core/src/main/java/org/sandbox/jdt/triggerpattern/api/PatternIndex.java

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import org.eclipse.jdt.core.dom.MethodInvocation;
3333
import org.eclipse.jdt.core.dom.Statement;
3434
import org.eclipse.jdt.core.dom.VariableDeclarationStatement;
35+
import org.sandbox.jdt.triggerpattern.internal.FqnAwarePlaceholderAstMatcher;
3536
import org.sandbox.jdt.triggerpattern.internal.PatternParser;
3637
import org.sandbox.jdt.triggerpattern.internal.PlaceholderAstMatcher;
3738

@@ -204,7 +205,7 @@ private void matchAgainstKind(ASTNode node, PatternKind kind,
204205
}
205206

206207
for (IndexEntry entry : entries) {
207-
PlaceholderAstMatcher matcher = new PlaceholderAstMatcher();
208+
PlaceholderAstMatcher matcher = new FqnAwarePlaceholderAstMatcher();
208209
matcher.setCaseInsensitive(caseInsensitive);
209210
if (entry.patternNode().subtreeMatch(matcher, node)) {
210211
Match match = new Match(node, matcher.getBindings(),
@@ -239,35 +240,25 @@ private void matchStatementSequences(Block block,
239240
continue;
240241
}
241242

242-
// Sliding window
243-
for (int i = 0; i <= statements.size() - patternSize; i++) {
244-
boolean allMatch = true;
245-
PlaceholderAstMatcher combinedMatcher = new PlaceholderAstMatcher();
246-
combinedMatcher.setCaseInsensitive(caseInsensitive);
247-
for (int j = 0; j < patternSize; j++) {
248-
PlaceholderAstMatcher matcher = new PlaceholderAstMatcher();
249-
matcher.setCaseInsensitive(caseInsensitive);
250-
if (!patternStatements.get(j).subtreeMatch(matcher, statements.get(i + j))) {
251-
allMatch = false;
252-
break;
253-
}
254-
combinedMatcher.mergeBindings(matcher);
243+
for (int start = 0; start <= statements.size() - patternSize; start++) {
244+
Block syntheticBlock = block.getAST().newBlock();
245+
for (int i = 0; i < patternSize; i++) {
246+
syntheticBlock.statements().add(ASTNode.copySubtree(block.getAST(), statements.get(start + i)));
255247
}
256-
if (allMatch) {
257-
Statement first = statements.get(i);
258-
Statement last = statements.get(i + patternSize - 1);
259-
int offset = first.getStartPosition();
260-
int length = (last.getStartPosition() + last.getLength()) - offset;
261-
Match match = new Match(first, combinedMatcher.getBindings(), offset, length);
248+
249+
PlaceholderAstMatcher matcher = new FqnAwarePlaceholderAstMatcher();
250+
matcher.setCaseInsensitive(caseInsensitive);
251+
if (patternBlock.subtreeMatch(matcher, syntheticBlock)) {
252+
int offset = statements.get(start).getStartPosition();
253+
Statement last = statements.get(start + patternSize - 1);
254+
int length = last.getStartPosition() + last.getLength() - offset;
255+
Match match = new Match(block, matcher.getBindings(), offset, length);
262256
results.computeIfAbsent(entry.rule(), r -> new ArrayList<>()).add(match);
263257
}
264258
}
265259
}
266260
}
267261

268-
/**
269-
* An entry in the pattern index, containing a rule and its pre-parsed pattern node.
270-
*/
271262
/**
 * An entry in the pattern index: a transformation rule together with its
 * source pattern and the pre-parsed pattern AST node that candidate nodes
 * are subtree-matched against.
 */
private record IndexEntry(TransformationRule rule, Pattern sourcePattern, ASTNode patternNode) {
272263
}
273264
}

0 commit comments

Comments
 (0)