Skip to content

Commit f3153ad

Browse files
authored
Merge pull request #21 from arvoreeducacao/joaobarros-/-mgc-magalu-docs-semantic-search
feat(mgc): add semantic search for Magalu docs
2 parents 6fae3a9 + af64965 commit f3153ad

6 files changed

Lines changed: 302 additions & 2 deletions

File tree

packages/mgc/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ node_modules/
22
dist/
33
*.js.map
44
.env
5+
docs-cache/

packages/mgc/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Add to your MCP client config:
2727
| Variable | Description | Default |
2828
|----------|-------------|---------|
2929
| `MGC_CLI_PATH` | Custom path to the mgc binary | `mgc` |
30+
| `MAGALU_DOCS_DIR` | Path to scraped Magalu docs (enables doc search tools) | _(unset — the doc search tools return a "not configured" error)_ |
3031

3132
## Available Tools
3233

@@ -68,6 +69,20 @@ Add to your MCP client config:
6869
- **mgc_block_storage_volume_list** — List volumes
6970
- **mgc_block_storage_volume_create** — Create a volume
7071

72+
### Documentation Search
73+
- **search_magalu_docs** — Keyword (TF-IDF ranked) search across the locally scraped Magalu developer docs; returns matching pages with a snippet, URL, and filepath (requires `MAGALU_DOCS_DIR`)
74+
- **get_magalu_doc** — Get full markdown content of a doc page (use search first to find filepath)
75+
76+
#### Scraping docs
77+
78+
Use `docusaurus-to-md` to scrape the Magalu docs into a local directory:
79+
80+
```bash
81+
npx @arvoretech/docusaurus-to-md https://dev.magalu.com -o ./docs-cache
82+
```
83+
84+
Then set `MAGALU_DOCS_DIR` to the output path when running the MCP server.
85+
7186
## Development
7287

7388
```bash

packages/mgc/src/docs-index.ts

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
import { readdir, readFile } from "node:fs/promises";
2+
import { join, relative } from "node:path";
3+
4+
interface IndexedDoc {
5+
filepath: string;
6+
url: string;
7+
title: string;
8+
content: string;
9+
terms: Map<string, number>;
10+
termCount: number;
11+
}
12+
13+
export class DocsIndex {
14+
private docs: IndexedDoc[] = [];
15+
private idf: Map<string, number> = new Map();
16+
private loaded = false;
17+
private docsDir: string;
18+
19+
constructor(docsDir: string) {
20+
this.docsDir = docsDir;
21+
}
22+
23+
async load(): Promise<void> {
24+
if (this.loaded) return;
25+
26+
const files = await this.findMarkdownFiles(this.docsDir);
27+
28+
const manifestMap = new Map<string, string>();
29+
try {
30+
const manifestRaw = await readFile(join(this.docsDir, "_manifest.json"), "utf-8");
31+
const manifest: Array<{ url: string; filepath: string }> = JSON.parse(manifestRaw);
32+
for (const entry of manifest) {
33+
manifestMap.set(entry.filepath, entry.url);
34+
}
35+
} catch (_e) { /* manifest may not exist */ }
36+
37+
for (const file of files) {
38+
if (file.endsWith("_all.md") || file.endsWith("_manifest.json")) continue;
39+
40+
const content = await readFile(file, "utf-8");
41+
const title = this.extractTitle(content);
42+
const terms = this.tokenize(content);
43+
const termFreq = this.computeTermFrequency(terms);
44+
const url = manifestMap.get(file) || this.filepathToUrl(file);
45+
46+
this.docs.push({
47+
filepath: relative(this.docsDir, file),
48+
url,
49+
title,
50+
content,
51+
terms: termFreq,
52+
termCount: terms.length,
53+
});
54+
}
55+
56+
this.computeIDF();
57+
this.loaded = true;
58+
console.error(`Docs index loaded: ${this.docs.length} documents from ${this.docsDir}`);
59+
}
60+
61+
search(query: string, maxResults = 5): Array<{ url: string; title: string; snippet: string; score: number; filepath: string }> {
62+
const queryTerms = this.tokenize(query);
63+
if (!queryTerms.length) return [];
64+
65+
const scores: Array<{ doc: IndexedDoc; score: number }> = [];
66+
67+
for (const doc of this.docs) {
68+
let score = 0;
69+
for (const term of queryTerms) {
70+
const tf = (doc.terms.get(term) || 0) / Math.max(doc.termCount, 1);
71+
const idf = this.idf.get(term) || 0;
72+
score += tf * idf;
73+
}
74+
75+
const titleBonus = queryTerms.some((t) => doc.title.toLowerCase().includes(t)) ? 2 : 1;
76+
score *= titleBonus;
77+
78+
if (score > 0) {
79+
scores.push({ doc, score });
80+
}
81+
}
82+
83+
scores.sort((a, b) => b.score - a.score);
84+
85+
return scores.slice(0, maxResults).map(({ doc, score }) => ({
86+
url: doc.url,
87+
title: doc.title,
88+
snippet: this.extractSnippet(doc.content, queryTerms),
89+
score: Math.round(score * 10000) / 10000,
90+
filepath: doc.filepath,
91+
}));
92+
}
93+
94+
getDocContent(filepath: string): string | null {
95+
const doc = this.docs.find((d) => d.filepath === filepath);
96+
return doc?.content ?? null;
97+
}
98+
99+
get documentCount(): number {
100+
return this.docs.length;
101+
}
102+
103+
get isLoaded(): boolean {
104+
return this.loaded;
105+
}
106+
107+
private tokenize(text: string): string[] {
108+
return text
109+
.toLowerCase()
110+
.replace(/[^\p{L}\p{N}\s]/gu, " ")
111+
.split(/\s+/)
112+
.filter((t) => t.length > 2);
113+
}
114+
115+
private computeTermFrequency(terms: string[]): Map<string, number> {
116+
const freq = new Map<string, number>();
117+
for (const term of terms) {
118+
freq.set(term, (freq.get(term) || 0) + 1);
119+
}
120+
return freq;
121+
}
122+
123+
private computeIDF(): void {
124+
const docFreq = new Map<string, number>();
125+
for (const doc of this.docs) {
126+
for (const term of doc.terms.keys()) {
127+
docFreq.set(term, (docFreq.get(term) || 0) + 1);
128+
}
129+
}
130+
131+
const n = this.docs.length;
132+
for (const [term, df] of docFreq) {
133+
this.idf.set(term, Math.log(1 + n / df));
134+
}
135+
}
136+
137+
private extractTitle(content: string): string {
138+
const match = content.match(/^#\s+(.+)$/m);
139+
return match?.[1]?.trim() || "Untitled";
140+
}
141+
142+
private extractSnippet(content: string, queryTerms: string[]): string {
143+
const lines = content.split("\n").filter((l) => l.trim());
144+
const lower = queryTerms.map((t) => t.toLowerCase());
145+
146+
for (const line of lines) {
147+
if (lower.some((t) => line.toLowerCase().includes(t))) {
148+
return line.slice(0, 300);
149+
}
150+
}
151+
152+
return lines.slice(0, 3).join(" ").slice(0, 300);
153+
}
154+
155+
private async findMarkdownFiles(dir: string): Promise<string[]> {
156+
const results: string[] = [];
157+
try {
158+
const entries = await readdir(dir, { withFileTypes: true });
159+
for (const entry of entries) {
160+
const fullPath = join(dir, entry.name);
161+
if (entry.isDirectory()) {
162+
results.push(...(await this.findMarkdownFiles(fullPath)));
163+
} else if (entry.name.endsWith(".md")) {
164+
results.push(fullPath);
165+
}
166+
}
167+
} catch (_e) { /* directory may not exist */ }
168+
return results;
169+
}
170+
171+
private filepathToUrl(filepath: string): string {
172+
const rel = relative(this.docsDir, filepath).replace(/\.md$/, "");
173+
return `https://dev.magalu.com/docs/${rel}`;
174+
}
175+
}

packages/mgc/src/server.ts

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ import {
1717
DbListParamsSchema,
1818
BlockStorageListParamsSchema,
1919
BlockStorageCreateParamsSchema,
20+
SearchDocsParamsSchema,
21+
GetDocParamsSchema,
2022
} from "./types.js";
2123

2224
export class MgcMCPServer {
@@ -30,7 +32,8 @@ export class MgcMCPServer {
3032
});
3133

3234
const client = new MgcClient(mgcPath);
33-
this.tools = new MgcTools(client);
35+
const docsDir = process.env.MAGALU_DOCS_DIR;
36+
this.tools = new MgcTools(client, docsDir);
3437

3538
this.setupTools();
3639
}
@@ -384,6 +387,34 @@ export class MgcMCPServer {
384387
return this.tools.blockStorageVolumeCreate(validated);
385388
}
386389
);
390+
391+
this.server.registerTool(
392+
"search_magalu_docs",
393+
{
394+
title: "Search Magalu Documentation",
395+
description:
396+
"Semantic search across Magalu Cloud developer documentation. Returns relevant doc pages with snippets and links. Requires MAGALU_DOCS_DIR env var pointing to scraped docs.",
397+
inputSchema: SearchDocsParamsSchema.shape,
398+
},
399+
async (params) => {
400+
const validated = SearchDocsParamsSchema.parse(params);
401+
return this.tools.searchDocs(validated);
402+
}
403+
);
404+
405+
this.server.registerTool(
406+
"get_magalu_doc",
407+
{
408+
title: "Get Magalu Doc Content",
409+
description:
410+
"Get the full markdown content of a specific Magalu documentation page by filepath. Use search_magalu_docs first to find the filepath.",
411+
inputSchema: GetDocParamsSchema.shape,
412+
},
413+
async (params) => {
414+
const validated = GetDocParamsSchema.parse(params);
415+
return this.tools.getDoc(validated);
416+
}
417+
);
387418
}
388419

389420
async start(): Promise<void> {

packages/mgc/src/tools.ts

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { MgcClient } from "./mgc-client.js";
2+
import { DocsIndex } from "./docs-index.js";
23
import {
34
MgcExecuteParams,
45
VmListParams,
@@ -14,11 +15,19 @@ import {
1415
DbListParams,
1516
BlockStorageListParams,
1617
BlockStorageCreateParams,
18+
SearchDocsParams,
19+
GetDocParams,
1720
McpToolResult,
1821
} from "./types.js";
1922

2023
export class MgcTools {
21-
constructor(private client: MgcClient) {}
24+
private docsIndex: DocsIndex | null = null;
25+
26+
constructor(private client: MgcClient, docsDir?: string) {
27+
if (docsDir) {
28+
this.docsIndex = new DocsIndex(docsDir);
29+
}
30+
}
2231

2332
private formatResult(
2433
stdout: string,
@@ -297,4 +306,62 @@ export class MgcTools {
297306
);
298307
return this.formatResult(result.stdout, result.stderr, result.exitCode);
299308
}
309+
310+
/**
 * Tool handler: runs a query against the docs index and returns the hits as
 * pretty-printed JSON text.
 *
 * Returns an error result (`isError: true`) when no docs index was configured
 * or when loading/searching throws.
 */
async searchDocs(params: SearchDocsParams): Promise<McpToolResult> {
  // Every failure path shares the same envelope: `{ error }` as indented JSON.
  const failure = (message: string): McpToolResult => ({
    content: [{ type: "text", text: JSON.stringify({ error: message }, null, 2) }],
    isError: true,
  });

  const index = this.docsIndex;
  if (!index) {
    return failure("Docs index not configured. Set MAGALU_DOCS_DIR env var.");
  }

  try {
    // load() returns immediately once the index has been built.
    await index.load();
    const results = index.search(params.query, params.max_results);
    const payload = {
      query: params.query,
      totalIndexed: index.documentCount,
      results,
    };

    return { content: [{ type: "text", text: JSON.stringify(payload, null, 2) }] };
  } catch (error) {
    return failure(error instanceof Error ? error.message : String(error));
  }
}
339+
340+
/**
 * Tool handler: returns the raw markdown of one indexed document.
 *
 * Returns an error result (`isError: true`) when no docs index was configured,
 * when no indexed document has the requested filepath, or when loading throws.
 *
 * @param params.filepath Path as reported in the `filepath` field of search results.
 */
async getDoc(params: GetDocParams): Promise<McpToolResult> {
  if (!this.docsIndex) {
    return {
      content: [{ type: "text", text: JSON.stringify({ error: "Docs index not configured. Set MAGALU_DOCS_DIR env var." }, null, 2) }],
      isError: true,
    };
  }

  try {
    await this.docsIndex.load();
    const content = this.docsIndex.getDocContent(params.filepath);

    // getDocContent signals a miss with null. Compare against null explicitly
    // so an existing but empty document is returned instead of being reported
    // as "not found".
    if (content === null) {
      return {
        content: [{ type: "text", text: JSON.stringify({ error: "Document not found", filepath: params.filepath }, null, 2) }],
        isError: true,
      };
    }

    return { content: [{ type: "text", text: content }] };
  } catch (error) {
    return {
      content: [{ type: "text", text: JSON.stringify({ error: error instanceof Error ? error.message : String(error) }, null, 2) }],
      isError: true,
    };
  }
}
300367
}

packages/mgc/src/types.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,15 @@ export const BlockStorageCreateParamsSchema = z.object({
121121
type: z.string().optional().describe("Volume type"),
122122
});
123123

124+
export const SearchDocsParamsSchema = z.object({
125+
query: z.string().min(1, "Query is required").describe("Search query in natural language to find relevant Magalu documentation"),
126+
max_results: z.number().int().positive().optional().default(5).describe("Maximum number of results to return (default: 5)"),
127+
});
128+
129+
export const GetDocParamsSchema = z.object({
130+
filepath: z.string().min(1, "Filepath is required").describe("Relative filepath of the document (from search_magalu_docs results)"),
131+
});
132+
124133
export type MgcExecuteParams = z.infer<typeof MgcExecuteParamsSchema>;
125134
export type VmListParams = z.infer<typeof VmListParamsSchema>;
126135
export type VmCreateParams = z.infer<typeof VmCreateParamsSchema>;
@@ -135,6 +144,8 @@ export type KubernetesClusterCreateParams = z.infer<typeof KubernetesClusterCrea
135144
export type DbListParams = z.infer<typeof DbListParamsSchema>;
136145
export type BlockStorageListParams = z.infer<typeof BlockStorageListParamsSchema>;
137146
export type BlockStorageCreateParams = z.infer<typeof BlockStorageCreateParamsSchema>;
147+
export type SearchDocsParams = z.infer<typeof SearchDocsParamsSchema>;
148+
export type GetDocParams = z.infer<typeof GetDocParamsSchema>;
138149

139150
export interface McpToolResult {
140151
[key: string]: unknown;

0 commit comments

Comments
 (0)