Skip to content

Commit 398089c

Browse files
committed
feat(ingesters): add cairo skills config
feat(python): add cairo skills source metadata types

Add CAIRO_SKILLS query resource description

feat: add async unique-id fetch helper for retriever

docs: add research context for Step 4 SkillsIngester implementation

Captures codebase patterns, type definitions, frontmatter parsing approaches, GitHub API call patterns, and open questions (fullContent field gap, class name discrepancy) for the cairo_skills SkillsIngester.

feat(ingesters): add cairo skills github ingester and chunks

ingesters: guard cairo skill markdown traversal depth

ingesters: document frontmatter parser scope

ingesters: fetch cairo skill files in parallel batches

ingesters: remove unused skills barrel export

ingesters: expose skills to subclasses for test setup

ingesters: align cairo skills config owner and repo

docs: add context document for CairoSkillsIngester TypeScript tests

Summarizes all 11 RFC test cases, their coverage status (all passing), test patterns used (Bun test runner, vi mocking, TestCairoSkillsIngester subclass pattern), and key type references for implementation.

feat(ingesters): register SkillsIngester for cairo_skills

feat(ingesters): register SkillsIngester for cairo_skills

feat(rag): expand cairo skill chunks to full documents

docs: add context document for integration tests (cairo skills expansion)

Captures fixture structure, data-flow trace, mock setup patterns, and per-test implementation guides for the 3 RFC integration test cases.

test: add cairo skills integration coverage

chore: clean up review findings before merge

- Remove trivial enum identity assertion from types.test.ts
- Make skillsConfig.test.ts resilient to config changes (no hardcoded length/IDs, validate structure only)
- Remove low-value resource description substring test
- Move json import to module level in rag_pipeline.py
- Remove agent planning doc artifacts from branch

unslop
1 parent 6b13043 commit 398089c

12 files changed

Lines changed: 512 additions & 55 deletions

File tree

bun.lock

Lines changed: 10 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ingesters/__tests__/skillsConfig.test.ts

Lines changed: 18 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,12 @@ type SkillsConfigFile = {
1414
const skillsConfigPath = join(import.meta.dir, '..', 'config', 'skills.json');
1515

1616
describe('skills config', () => {
17-
it('matches the RFC Step 3 contract', () => {
17+
it('should contain at least one skill with valid structure', () => {
1818
const raw = readFileSync(skillsConfigPath, 'utf8');
1919
const parsed = JSON.parse(raw) as SkillsConfigFile;
2020

2121
expect(Array.isArray(parsed.skills)).toBe(true);
22-
expect(parsed.skills).toHaveLength(4);
23-
24-
const expectedIds = [
25-
'benchmarking-cairo',
26-
'cairo-coding',
27-
'avnu',
28-
'starknet-defi',
29-
];
30-
31-
expect(parsed.skills.map((skill) => skill.id)).toEqual(expectedIds);
22+
expect(parsed.skills.length).toBeGreaterThan(0);
3223

3324
for (const skill of parsed.skills) {
3425
expect(typeof skill.id).toBe('string');
@@ -39,23 +30,22 @@ describe('skills config', () => {
3930
expect(url.protocol).toBe('https:');
4031
expect(url.hostname).toBe('github.com');
4132
}
33+
});
4234

43-
const benchmarkingCairoUrl = parsed.skills.find(
44-
(skill) => skill.id === 'benchmarking-cairo',
45-
)?.url;
46-
expect(benchmarkingCairoUrl).toBeDefined();
47-
expect(benchmarkingCairoUrl as string).toContain('/tree/');
48-
49-
const cairoCodingUrl = parsed.skills.find(
50-
(skill) => skill.id === 'cairo-coding',
51-
)?.url;
52-
expect(cairoCodingUrl).toBeDefined();
53-
expect(cairoCodingUrl as string).toContain('/tree/');
54-
55-
const starknetDefiUrl = parsed.skills.find(
56-
(skill) => skill.id === 'starknet-defi',
57-
)?.url;
58-
expect(starknetDefiUrl).toBeDefined();
59-
expect(starknetDefiUrl as string).toMatch(/\/blob\/[0-9a-f]{40}\//);
35+
it('should have unique skill ids', () => {
36+
const raw = readFileSync(skillsConfigPath, 'utf8');
37+
const parsed = JSON.parse(raw) as SkillsConfigFile;
38+
39+
const ids = parsed.skills.map((skill) => skill.id);
40+
expect(new Set(ids).size).toBe(ids.length);
41+
});
42+
43+
it('should use /tree/ or /blob/ GitHub URL formats', () => {
44+
const raw = readFileSync(skillsConfigPath, 'utf8');
45+
const parsed = JSON.parse(raw) as SkillsConfigFile;
46+
47+
for (const skill of parsed.skills) {
48+
expect(skill.url).toMatch(/\/(tree|blob)\//);
49+
}
6050
});
6151
});

ingesters/__tests__/types.test.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,6 @@ import { describe, expect, it } from 'bun:test';
22
import { DocumentSource, type BookChunk } from '../src/types';
33

44
describe('types', () => {
5-
it('exposes cairo_skills document source', () => {
6-
expect(String(DocumentSource.CAIRO_SKILLS)).toBe('cairo_skills');
7-
});
8-
95
it('supports optional skillId on BookChunk', () => {
106
const withoutSkillId: BookChunk = {
117
name: 'example',

ingesters/config/sources.json

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,21 @@
150150
"urlSuffix": "",
151151
"useUrlMapping": true
152152
}
153+
},
154+
"cairo_skills": {
155+
"name": "Cairo Skills",
156+
"description": "Curated Cairo ecosystem skills for all-or-nothing retrieval",
157+
"ingesterClass": "SkillsIngester",
158+
"config": {
159+
"repoOwner": "",
160+
"repoName": "",
161+
"fileExtensions": [".md"],
162+
"chunkSize": 4096,
163+
"chunkOverlap": 512,
164+
"baseUrl": "",
165+
"urlSuffix": "",
166+
"useUrlMapping": false
167+
}
153168
}
154169
}
155170
}

ingesters/src/IngesterFactory.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { ScarbDocsIngester } from './ingesters/ScarbDocsIngester';
1010
import { StarknetJSIngester } from './ingesters/StarknetJSIngester';
1111
import { StarknetBlogIngester } from './ingesters/StarknetBlogIngester';
1212
import { DojoDocsIngester } from './ingesters/DojoDocsIngester';
13-
import { CairoSkillsIngester } from './ingesters/CairoSkillsIngester';
13+
import { SkillsIngester } from './ingesters/SkillsIngester';
1414
import {
1515
getAvailableSourcesFromConfig,
1616
getSourceConfig,
@@ -34,7 +34,7 @@ const INGESTER_CLASSES: Record<string, new () => BaseIngester> = {
3434
StarknetJSIngester,
3535
StarknetBlogIngester,
3636
DojoDocsIngester,
37-
CairoSkillsIngester,
37+
SkillsIngester,
3838
};
3939

4040
/**
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import { IngesterFactory } from '../IngesterFactory';
2+
import { DocumentSource } from '../types';
3+
import { SkillsIngester } from '../ingesters/SkillsIngester';
4+
import { getSourceConfig } from '../utils/sourceConfig';
5+
6+
describe('IngesterFactory cairo_skills wiring', () => {
7+
it('loads cairo_skills source config with SkillsIngester metadata', () => {
8+
const sourceConfig = getSourceConfig(DocumentSource.CAIRO_SKILLS);
9+
10+
expect(sourceConfig.name).toBe('Cairo Skills');
11+
expect(sourceConfig.ingesterClass).toBe('SkillsIngester');
12+
expect(sourceConfig.config).toEqual({
13+
repoOwner: '',
14+
repoName: '',
15+
fileExtensions: ['.md'],
16+
chunkSize: 4096,
17+
chunkOverlap: 512,
18+
baseUrl: '',
19+
urlSuffix: '',
20+
useUrlMapping: false,
21+
});
22+
});
23+
24+
it('creates a SkillsIngester for cairo_skills', () => {
25+
const ingester = IngesterFactory.createIngester(
26+
DocumentSource.CAIRO_SKILLS,
27+
);
28+
29+
expect(ingester).toBeInstanceOf(SkillsIngester);
30+
});
31+
});
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
export { CairoSkillsIngester as SkillsIngester } from './CairoSkillsIngester';

package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,7 @@
77
"dependencies": {
88
"@ai-sdk/anthropic": "^3.0.44",
99
"ai": "^6.0.86",
10-
"smithers-orchestrator": "^0.6.0",
11-
"takopi-smithers": "github:evmts/takopi-smithers",
10+
"smithers-orchestrator": "^0.9.0",
1211
"zod": "^4.3.6"
1312
},
1413
"patchedDependencies": {

python/src/cairo_coder/core/rag_pipeline.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import asyncio
99
import contextlib
10+
import json
1011
import os
1112
from collections.abc import AsyncGenerator
1213
from dataclasses import dataclass
@@ -139,6 +140,8 @@ async def _aprocess_query_and_retrieve_docs(
139140
)
140141
# documents already contains all retrieved docs, no action needed
141142

143+
documents = await self._expand_skill_documents(documents)
144+
142145
# Ensure Grok summary is present and first in order (for generation context)
143146
if grok_summary_doc is not None:
144147
if grok_summary_doc in documents:
@@ -150,6 +153,84 @@ async def _aprocess_query_and_retrieve_docs(
150153

151154
return processed_query, documents, grok_citations
152155

156+
async def _expand_skill_documents(self, documents: list[Document]) -> list[Document]:
157+
"""
158+
Replace skill chunks with full skill documents when available.
159+
160+
If a full document row cannot be fetched for a skill, keep that skill's
161+
original chunks to degrade gracefully.
162+
"""
163+
skill_chunks = [
164+
document
165+
for document in documents
166+
if document.metadata.get("source") == DocumentSource.CAIRO_SKILLS
167+
and document.metadata.get("skillId")
168+
]
169+
if not skill_chunks:
170+
return documents
171+
172+
skill_ids = list(dict.fromkeys(doc.metadata["skillId"] for doc in skill_chunks))
173+
unique_ids = [f"skill-{skill_id}-full" for skill_id in skill_ids]
174+
175+
try:
176+
rows = await self.document_retriever.vector_db.afetch_by_unique_ids(unique_ids)
177+
except Exception as e:
178+
logger.warning(
179+
"_expand_skill_documents: failed to fetch full rows, keeping original chunks",
180+
error=str(e),
181+
exc_info=True,
182+
)
183+
return documents
184+
185+
full_documents_by_skill_id: dict[str, Document] = {}
186+
for row in rows:
187+
metadata: Any = row.get("metadata", {})
188+
if isinstance(metadata, str):
189+
try:
190+
metadata = json.loads(metadata)
191+
except Exception:
192+
logger.warning(
193+
"_expand_skill_documents: unable to decode metadata json, skipping row"
194+
)
195+
continue
196+
197+
if not isinstance(metadata, dict):
198+
continue
199+
200+
skill_id = metadata.get("skillId")
201+
full_content = metadata.get("fullContent")
202+
if skill_id and full_content:
203+
full_documents_by_skill_id[skill_id] = Document(
204+
page_content=full_content,
205+
metadata=metadata,
206+
)
207+
208+
result_documents = [
209+
document
210+
for document in documents
211+
if document.metadata.get("source") != DocumentSource.CAIRO_SKILLS
212+
]
213+
214+
found_skill_ids = set(full_documents_by_skill_id)
215+
for skill_id in skill_ids:
216+
if skill_id not in found_skill_ids:
217+
original_chunks = [
218+
document
219+
for document in skill_chunks
220+
if document.metadata.get("skillId") == skill_id
221+
]
222+
result_documents.extend(original_chunks)
223+
logger.warning(
224+
"_expand_skill_documents: no full document found, keeping chunks",
225+
skill_id=skill_id,
226+
)
227+
228+
for skill_id in skill_ids:
229+
if skill_id in full_documents_by_skill_id:
230+
result_documents.append(full_documents_by_skill_id[skill_id])
231+
232+
return result_documents
233+
153234
@traceable(name="RagPipeline", run_type="chain")
154235
async def aforward(
155236
self,

0 commit comments

Comments
 (0)