Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,10 @@ jobs:
cd crates/opencontext-node
npm ci
npm run build:debug


- name: Use local native build for tests
run: npm uninstall @aicontextlab/core-native

- name: Run JS tests
run: npm test

Expand Down
50 changes: 47 additions & 3 deletions bin/oc.js
Original file line number Diff line number Diff line change
Expand Up @@ -521,15 +521,29 @@ program
.action(
handle((folderPath, options) => {
const effectiveFolder = folderPath === undefined ? '.' : folderPath;
const limit = options.limit !== undefined ? Number(options.limit) : null;
const rows = store.generateManifest({ folderPath: effectiveFolder, limit });
const limit = options.limit !== undefined ? Number(options.limit) : undefined;
const result = store.generateManifest({ folderPath: effectiveFolder, limit });
const rows = result.items;
const unindexed = result.unindexed_files || [];
const format = options.llm ? 'llm' : (options.format || 'json');
if (format === 'json') {
console.log(JSON.stringify(rows, null, 2));
console.log(JSON.stringify(result, null, 2));
if (unindexed.length > 0) {
console.error(
`\n⚠️ ${unindexed.length} files exist on disk but are not indexed. ` +
`Run \`oc index reconcile ${effectiveFolder}\` to register them.`
);
}
return;
}
if (format === 'llm') {
console.log(renderManifestLlm({ folderPath: effectiveFolder, limit, rows }));
if (unindexed.length > 0) {
console.error(
`\n⚠️ ${unindexed.length} unindexed files: ${unindexed.join(', ')}\n` +
` Run \`oc index reconcile ${effectiveFolder}\`.`
);
}
return;
}
throw new Error(`Unknown format "${format}". Supported: json, llm`);
Expand Down Expand Up @@ -694,6 +708,36 @@ indexCmd
})
);

// `oc index reconcile [folder]`
// Registers markdown files that are present on disk but absent from the
// SQLite index. With no argument, every folder returned by
// `store.listFolders({ all: false })` is processed in turn.
// Embeddings are left untouched; the closing hint points at `oc index build`.
indexCmd
  .command('reconcile')
  .argument('[folder]', 'Folder to reconcile (default: all root folders)')
  .description('Register *.md files that exist on disk but are missing from the SQLite index. Does NOT rebuild embeddings (use `oc index build` for that).')
  .action(
    handle((folder) => {
      // An explicit folder wins; otherwise fall back to the listed folders.
      const targets = folder
        ? [folder]
        : store.listFolders({ all: false }).map((entry) => entry.rel_path);

      let registeredCount = 0;
      targets.forEach((target) => {
        const newlyAdded = store.reconcileFolder({ folderPath: target });
        registeredCount += newlyAdded.length;

        if (newlyAdded.length > 0) {
          console.log(`📥 ${target}: registered ${newlyAdded.length} file(s)`);
          newlyAdded.forEach((rel) => {
            console.log(` + ${rel}`);
          });
          return;
        }
        console.log(`✅ ${target}: nothing to reconcile`);
      });

      // Stay quiet when nothing changed; the per-folder lines already said so.
      if (registeredCount === 0) {
        return;
      }
      console.log(`\nDone. ${registeredCount} doc(s) added to the index.`);
      console.log('Run `oc index build` to (re)compute embeddings.');
    })
  );

indexCmd
.command('clean')
.description('Clean/reset the search index completely')
Expand Down
194 changes: 184 additions & 10 deletions crates/opencontext-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@ mod tests;
use chrono::{SecondsFormat, Utc};
use parking_lot::Mutex;
use rusqlite::{params, Connection, OptionalExtension};
use std::{env, fs, path::PathBuf, sync::Arc};
use std::{
env, fs,
path::{Path, PathBuf},
sync::Arc,
};
use thiserror::Error;

// Events module (enabled with "search" feature)
Expand Down Expand Up @@ -89,6 +93,16 @@ pub struct DocManifestEntry {
pub updated_at: String,
}

/// Manifest response that also surfaces filesystem files which are NOT
/// registered in SQLite (i.e. created via `Write`/`Edit` bypassing the API).
///
/// Built by `OpenContext::generate_manifest_full`. The struct derives
/// `serde::Serialize`, so the field names below are part of the serialized
/// shape; renaming or reordering them changes that output.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ManifestResult {
/// Manifest entries exactly as returned by `generate_manifest` for the
/// requested folder (subject to the caller's `limit`).
pub items: Vec<DocManifestEntry>,
/// Sorted paths of `*.md` files found on disk beneath the requested folder
/// that have no matching `docs` row. Paths are relative to the contexts
/// root (not to the requested folder) and use `/` as the separator.
pub unindexed_files: Vec<String>,
}

impl OpenContext {
pub fn initialize(overrides: EnvOverrides) -> CoreResult<Self> {
let base_root = overrides
Expand Down Expand Up @@ -317,7 +331,7 @@ impl OpenContext {
if parent.is_empty() {
new_name.to_string()
} else {
format!("{}/{}", parent, new_name)
format!("{parent}/{new_name}")
}
} else {
new_name.to_string()
Expand Down Expand Up @@ -361,7 +375,7 @@ impl OpenContext {
.collect::<Result<Vec<_>, _>>()?;
for (id, child_rel) in folder_rows {
let suffix = &child_rel[folder.rel_path.len() + 1..];
let updated_rel = format!("{}/{}", new_rel_path, suffix);
let updated_rel = format!("{new_rel_path}/{suffix}");
let updated_abs = self.contexts_root.join(&updated_rel);
tx.execute(
"UPDATE folders SET rel_path = ?1, abs_path = ?2, updated_at = ?3 WHERE id = ?4",
Expand All @@ -377,7 +391,7 @@ impl OpenContext {
.collect::<Result<Vec<_>, _>>()?;
for (id, doc_rel) in doc_rows {
let suffix = &doc_rel[folder.rel_path.len() + 1..];
let updated_rel = format!("{}/{}", new_rel_path, suffix);
let updated_rel = format!("{new_rel_path}/{suffix}");
let updated_abs = self.contexts_root.join(&updated_rel);
tx.execute(
"UPDATE docs SET rel_path = ?1, abs_path = ?2, updated_at = ?3 WHERE id = ?4",
Expand Down Expand Up @@ -426,7 +440,7 @@ impl OpenContext {
"Root is not supported. Please move into a folder under contexts/.".into(),
));
}
if dest_rel_folder == rel_path || dest_rel_folder.starts_with(&format!("{}/", rel_path)) {
if dest_rel_folder == rel_path || dest_rel_folder.starts_with(&format!("{rel_path}/")) {
return Err(CoreError::Message(
"Cannot move a folder into itself or its descendants.".into(),
));
Expand Down Expand Up @@ -492,7 +506,7 @@ impl OpenContext {
.collect::<Result<Vec<_>, _>>()?;
for (id, child_rel) in folder_rows {
let suffix = &child_rel[folder.rel_path.len() + 1..];
let updated_rel = format!("{}/{}", new_rel_path, suffix);
let updated_rel = format!("{new_rel_path}/{suffix}");
let updated_abs = self.contexts_root.join(&updated_rel);
tx.execute(
"UPDATE folders SET rel_path = ?1, abs_path = ?2, updated_at = ?3 WHERE id = ?4",
Expand All @@ -509,7 +523,7 @@ impl OpenContext {
.collect::<Result<Vec<_>, _>>()?;
for (id, doc_rel) in doc_rows {
let suffix = &doc_rel[folder.rel_path.len() + 1..];
let updated_rel = format!("{}/{}", new_rel_path, suffix);
let updated_rel = format!("{new_rel_path}/{suffix}");
let updated_abs = self.contexts_root.join(&updated_rel);
tx.execute(
"UPDATE docs SET rel_path = ?1, abs_path = ?2, updated_at = ?3 WHERE id = ?4",
Expand Down Expand Up @@ -586,7 +600,7 @@ impl OpenContext {
"Folder \"{rel_path}\" is not empty. Use --force to delete recursively."
)));
}
let like_pattern = format!("{}/%", rel_path);
let like_pattern = format!("{rel_path}/%");
let tx = conn.unchecked_transaction()?;
tx.execute(
"DELETE FROM docs WHERE rel_path LIKE ?1",
Expand Down Expand Up @@ -792,7 +806,7 @@ impl OpenContext {
let folder_rel = parent_rel_path(&doc.rel_path);
let new_rel_path = folder_rel
.and_then(|p| if p.is_empty() { None } else { Some(p) })
.map(|prefix| format!("{}/{}", prefix, new_name))
.map(|prefix| format!("{prefix}/{new_name}"))
.unwrap_or_else(|| new_name.to_string());
if self.find_doc(&new_rel_path)?.is_some() {
return Err(CoreError::Message(format!(
Expand Down Expand Up @@ -971,6 +985,132 @@ impl OpenContext {
})
}

/// Produces the regular manifest for `folder_path` together with a drift
/// report: every `*.md` file found on disk beneath that folder which has no
/// row in the `docs` table.
///
/// This is a read-only operation — the database is only queried, never
/// written. Callers are expected to warn about the drift and may call
/// `reconcile_folder` to repair it.
///
/// # Errors
///
/// Fails when `generate_manifest` fails, when the folder path cannot be
/// normalized, when no folder record exists for it, or when the filesystem
/// scan or the SQL query reports an error.
pub fn generate_manifest_full(
    &self,
    folder_path: &str,
    limit: Option<usize>,
) -> CoreResult<ManifestResult> {
    let items = self.generate_manifest(folder_path, limit)?;

    let normalized = normalize_folder_path(Some(folder_path))?;
    let folder = match self.find_folder(&normalized)? {
        Some(found) => found,
        None => return Err(folder_not_found(&normalized)),
    };

    // Every markdown file physically present below the folder.
    let mut disk_paths: Vec<String> = Vec::new();
    if folder.abs_path.is_dir() {
        scan_md_files(&folder.abs_path, &self.contexts_root, &mut disk_paths)?;
    }

    // Query the index directly instead of reusing `items`: the manifest may
    // have been truncated by `limit`, which would produce false positives.
    let indexed: std::collections::HashSet<String> = self.with_conn(|conn| {
        let like_pattern = if !folder.rel_path.is_empty() {
            format!("{}/%", folder.rel_path)
        } else {
            "%".to_string()
        };
        let mut stmt = conn.prepare("SELECT rel_path FROM docs WHERE rel_path LIKE ?1")?;
        let mut paths = std::collections::HashSet::new();
        for row in stmt.query_map([like_pattern], |row| row.get::<_, String>(0))? {
            paths.insert(row?);
        }
        Ok(paths)
    })?;

    // Keep only what the index does not know about, in a stable order.
    disk_paths.retain(|candidate| !indexed.contains(candidate));
    disk_paths.sort();

    Ok(ManifestResult {
        items,
        unindexed_files: disk_paths,
    })
}

/// Registers every `*.md` file found on disk beneath `folder_path` that is
/// missing from the `docs` table, and returns the sorted list of rel_paths
/// that were inserted.
///
/// Only SQLite rows are created here. Embeddings / LanceDB are not touched;
/// that is the separate (slow) `oc index build` step. Files that sit
/// directly in the contexts root (no parent folder) are skipped because the
/// schema has no folder row to attach them to.
///
/// # Errors
///
/// Fails when the folder path cannot be normalized, when the folder (or a
/// parent folder of a discovered file) has no record, or when the
/// filesystem scan or any SQL statement reports an error. Rows inserted
/// before a failure are not rolled back by this function.
pub fn reconcile_folder(&self, folder_path: &str) -> CoreResult<Vec<String>> {
    let normalized = normalize_folder_path(Some(folder_path))?;
    let folder = match self.find_folder(&normalized)? {
        Some(found) => found,
        None => return Err(folder_not_found(&normalized)),
    };

    // Markdown files physically present below the folder.
    let mut disk_paths: Vec<String> = Vec::new();
    if folder.abs_path.is_dir() {
        scan_md_files(&folder.abs_path, &self.contexts_root, &mut disk_paths)?;
    }

    // rel_paths the index already has for this folder.
    let indexed: std::collections::HashSet<String> = self.with_conn(|conn| {
        let like_pattern = if !folder.rel_path.is_empty() {
            format!("{}/%", folder.rel_path)
        } else {
            "%".to_string()
        };
        let mut stmt = conn.prepare("SELECT rel_path FROM docs WHERE rel_path LIKE ?1")?;
        let mut paths = std::collections::HashSet::new();
        for row in stmt.query_map([like_pattern], |row| row.get::<_, String>(0))? {
            paths.insert(row?);
        }
        Ok(paths)
    })?;

    let mut registered: Vec<String> = Vec::new();
    let missing = disk_paths
        .into_iter()
        .filter(|candidate| !indexed.contains(candidate));
    for doc_rel in missing {
        let parent_rel = parent_rel_path(&doc_rel).unwrap_or_default();
        if parent_rel.is_empty() {
            // Root-level docs are not representable in the schema
            // (folders.rel_path = '' has no row). Skip silently.
            continue;
        }

        let parent_folder = match self.ensure_folder_record(&parent_rel)? {
            Some(record) => record,
            None => return Err(folder_not_found(&parent_rel)),
        };

        // Last path segment is the document name.
        let name = doc_rel.rsplit('/').next().unwrap_or(&doc_rel).to_string();
        let abs_path = self.contexts_root.join(&doc_rel);
        let ts = now_iso();

        // One connection scope per file: the stable id is generated and the
        // row inserted while holding the same connection.
        self.with_conn(|conn| {
            let sid = generate_stable_id(conn)?;
            conn.execute(
                "INSERT INTO docs (folder_id, name, rel_path, abs_path, description, stable_id, created_at, updated_at)
VALUES (?1, ?2, ?3, ?4, '', ?5, ?6, ?6)",
                params![
                    parent_folder.id,
                    name,
                    doc_rel,
                    abs_path.to_string_lossy(),
                    sid,
                    ts
                ],
            )?;
            Ok(())
        })?;

        #[cfg(feature = "search")]
        self.emit_doc_event(DocEvent::Created {
            rel_path: doc_rel.clone(),
        });

        registered.push(doc_rel);
    }

    registered.sort();
    Ok(registered)
}

fn find_folder(&self, rel_path: &str) -> CoreResult<Option<Folder>> {
self.with_conn(|conn| {
let mut stmt = conn.prepare(
Expand Down Expand Up @@ -1135,6 +1275,40 @@ fn parent_rel_path(rel_path: &str) -> Option<String> {
}
}

/// Recursively collects every `*.md` file below `dir` into `out`.
///
/// Each pushed path is relative to `contexts_root` and uses `/` as the
/// separator (backslashes are rewritten). The extension check ignores ASCII
/// case, so `.MD` matches as well. Results are appended in directory
/// iteration order; callers that need a stable order must sort.
///
/// Skipped without error:
/// - any entry (file or directory) whose name starts with `.`;
/// - entries whose name is not valid UTF-8;
/// - entries that `DirEntry::file_type` reports as neither a directory nor
///   a regular file;
/// - files whose path is not under `contexts_root` or is not valid UTF-8.
///
/// # Errors
///
/// Propagates I/O errors from reading a directory, reading an entry, or
/// querying an entry's file type.
fn scan_md_files(dir: &Path, contexts_root: &Path, out: &mut Vec<String>) -> CoreResult<()> {
    for dir_entry in fs::read_dir(dir)? {
        let dir_entry = dir_entry?;

        // Only visible, UTF-8 named entries are considered.
        let raw_name = dir_entry.file_name();
        let visible = match raw_name.to_str() {
            Some(name) => !name.starts_with('.'),
            None => false,
        };
        if !visible {
            continue;
        }

        let kind = dir_entry.file_type()?;
        let full_path = dir_entry.path();

        if kind.is_dir() {
            scan_md_files(&full_path, contexts_root, out)?;
            continue;
        }
        if !kind.is_file() {
            continue;
        }

        let is_markdown = matches!(
            full_path.extension().and_then(|ext| ext.to_str()),
            Some(ext) if ext.eq_ignore_ascii_case("md")
        );
        if !is_markdown {
            continue;
        }

        let relative = full_path
            .strip_prefix(contexts_root)
            .ok()
            .and_then(Path::to_str);
        if let Some(relative) = relative {
            out.push(relative.replace('\\', "/"));
        }
    }
    Ok(())
}

fn folder_not_found(rel_path: &str) -> CoreError {
CoreError::Message(format!(
"Folder \"{rel_path}\" does not exist. Use \"oc folder create {rel_path}\" first."
Expand Down Expand Up @@ -1224,7 +1398,7 @@ fn generate_stable_id(conn: &Connection) -> CoreResult<String> {
// UUID v4 variant/version bits
b[6] = (b[6] & 0x0f) | 0x40;
b[8] = (b[8] & 0x3f) | 0x80;
let hex = b.iter().map(|v| format!("{:02x}", v)).collect::<String>();
let hex = b.iter().map(|v| format!("{v:02x}")).collect::<String>();
Ok(format!(
"{}-{}-{}-{}-{}",
&hex[0..8],
Expand Down
Loading
Loading