Skip to content

Commit c845d67

Browse files
committed
Adding url_rewrite_prefix option
1 parent 7756271 commit c845d67

File tree

4 files changed

+201
-98
lines changed

4 files changed

+201
-98
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ Configuration is managed through two files:
8787
* `include_extensions`: (Optional) Array of file extensions to include (e.g., `['.md', '.txt']`). Defaults to `['.md', '.txt', '.html', '.htm']`.
8888
* `exclude_extensions`: (Optional) Array of file extensions to exclude.
8989
* `recursive`: (Optional) Whether to traverse subdirectories (defaults to `true`).
90+
* `url_rewrite_prefix` (Optional) URL prefix to rewrite `file://` URLs (e.g., `https://mydomain.com`)
9091
* `encoding`: (Optional) File encoding to use (defaults to `'utf8'`).
9192

9293
Common configuration for all types:

doc2vec.ts

100644100755
+186-93
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env ts-node
2+
13
import { Readability } from '@mozilla/readability';
24
import axios from 'axios';
35
import { load } from 'cheerio';
@@ -41,6 +43,7 @@ interface LocalDirectorySourceConfig extends BaseSourceConfig {
4143
exclude_extensions?: string[]; // File extensions to exclude
4244
recursive?: boolean; // Whether to traverse subdirectories
4345
encoding?: BufferEncoding; // File encoding (default: 'utf8')
46+
url_rewrite_prefix?: string; // Optional URL prefix to rewrite file:// URLs (e.g., 'https://mydomain.com')
4447
}
4548

4649
// Configuration specific to website sources
@@ -723,8 +726,33 @@ class Doc2Vec {
723726

724727
logger.info(`Processing content from ${filePath} (${content.length} chars)`);
725728
try {
726-
// Use the file path as URL for tracking
727-
const fileUrl = `file://${filePath}`;
729+
// Generate URL based on configuration
730+
let fileUrl: string;
731+
732+
if (config.url_rewrite_prefix) {
733+
// Replace local path with URL prefix
734+
const relativePath = path.relative(config.path, filePath).replace(/\\/g, '/');
735+
736+
// If relativePath starts with '..', it means the file is outside the base directory
737+
if (relativePath.startsWith('..')) {
738+
// For files outside the configured path, use the default file:// scheme
739+
fileUrl = `file://${filePath}`;
740+
logger.debug(`File outside configured path, using default URL: ${fileUrl}`);
741+
} else {
742+
// For files inside the configured path, rewrite the URL
743+
// Handle trailing slashes in the URL prefix to avoid double slashes
744+
const prefix = config.url_rewrite_prefix.endsWith('/')
745+
? config.url_rewrite_prefix.slice(0, -1)
746+
: config.url_rewrite_prefix;
747+
748+
fileUrl = `${prefix}/${relativePath}`;
749+
logger.debug(`URL rewritten: ${filePath} -> ${fileUrl}`);
750+
}
751+
} else {
752+
// Use default file:// URL
753+
fileUrl = `file://${filePath}`;
754+
}
755+
728756
const chunks = await this.chunkMarkdown(content, config, fileUrl);
729757
logger.info(`Created ${chunks.length} chunks`);
730758

@@ -804,15 +832,13 @@ class Doc2Vec {
804832
logger
805833
);
806834

807-
logger.info(`Found ${validChunkIds.size} valid chunks across processed files for ${config.path}`);
808-
809835
logger.section('CLEANUP');
810836
if (dbConnection.type === 'sqlite') {
811837
logger.info(`Running SQLite cleanup for local directory ${config.path}`);
812-
this.removeObsoleteFilesSQLite(dbConnection.db, processedFiles, config.path, logger);
838+
this.removeObsoleteFilesSQLite(dbConnection.db, processedFiles, config, logger);
813839
} else if (dbConnection.type === 'qdrant') {
814840
logger.info(`Running Qdrant cleanup for local directory ${config.path} in collection ${dbConnection.collectionName}`);
815-
await this.removeObsoleteFilesQdrant(dbConnection, processedFiles, config.path, logger);
841+
await this.removeObsoleteFilesQdrant(dbConnection, processedFiles, config, logger);
816842
}
817843

818844
logger.info(`Finished processing local directory: ${config.path}`);
@@ -919,6 +945,160 @@ class Doc2Vec {
919945
}
920946
}
921947

948+
private removeObsoleteFilesSQLite(
949+
db: Database,
950+
processedFiles: Set<string>,
951+
pathConfig: { path: string; url_rewrite_prefix?: string } | string,
952+
logger: Logger
953+
) {
954+
const getChunksForPathStmt = db.prepare(`
955+
SELECT chunk_id, url FROM vec_items
956+
WHERE url LIKE ? || '%'
957+
`);
958+
const deleteChunkStmt = db.prepare(`DELETE FROM vec_items WHERE chunk_id = ?`);
959+
960+
// Determine if we're using URL rewriting or direct file paths
961+
const isRewriteMode = typeof pathConfig === 'object' && pathConfig.url_rewrite_prefix;
962+
963+
// Set up the URL prefix for searching
964+
let urlPrefix: string;
965+
if (isRewriteMode) {
966+
// Handle URL rewriting case
967+
urlPrefix = (pathConfig as { path: string; url_rewrite_prefix?: string }).url_rewrite_prefix || '';
968+
urlPrefix = urlPrefix.endsWith('/') ? urlPrefix.slice(0, -1) : urlPrefix;
969+
} else {
970+
// Handle direct file path case
971+
const dirPrefix = typeof pathConfig === 'string' ? pathConfig : pathConfig.path;
972+
const cleanedDirPrefix = dirPrefix.replace(/^\.\/+/, '');
973+
urlPrefix = `file://${cleanedDirPrefix}`;
974+
}
975+
976+
logger.debug(`Searching for chunks with URL prefix: ${urlPrefix}`);
977+
const existingChunks = getChunksForPathStmt.all(urlPrefix) as { chunk_id: string; url: string }[];
978+
let deletedCount = 0;
979+
980+
const transaction = db.transaction(() => {
981+
for (const { chunk_id, url } of existingChunks) {
982+
// Skip if it's not from our URL prefix (safety check)
983+
if (!url.startsWith(urlPrefix)) continue;
984+
985+
let filePath: string;
986+
let shouldDelete = false;
987+
988+
if (isRewriteMode) {
989+
// URL rewrite mode: extract relative path and construct full file path
990+
const config = pathConfig as { path: string; url_rewrite_prefix?: string };
991+
const relativePath = url.substring(urlPrefix.length + 1); // +1 for the '/'
992+
filePath = path.join(config.path, relativePath);
993+
shouldDelete = !processedFiles.has(filePath);
994+
} else {
995+
// Direct file path mode: remove file:// prefix to match with processedFiles
996+
filePath = url.substring(7); // Remove 'file://' prefix
997+
shouldDelete = !processedFiles.has(filePath);
998+
}
999+
1000+
if (shouldDelete) {
1001+
logger.debug(`Deleting obsolete chunk from SQLite: ${chunk_id.substring(0, 8)}... (File not processed: ${filePath})`);
1002+
deleteChunkStmt.run(chunk_id);
1003+
deletedCount++;
1004+
}
1005+
}
1006+
});
1007+
transaction();
1008+
1009+
logger.info(`Deleted ${deletedCount} obsolete chunks from SQLite for URL prefix ${urlPrefix}`);
1010+
}
1011+
1012+
private async removeObsoleteFilesQdrant(
1013+
db: QdrantDB,
1014+
processedFiles: Set<string>,
1015+
pathConfig: { path: string; url_rewrite_prefix?: string } | string,
1016+
logger: Logger
1017+
) {
1018+
const { client, collectionName } = db;
1019+
try {
1020+
// Determine if we're using URL rewriting or direct file paths
1021+
const isRewriteMode = typeof pathConfig === 'object' && pathConfig.url_rewrite_prefix;
1022+
1023+
// Set up the URL prefix for searching
1024+
let urlPrefix: string;
1025+
if (isRewriteMode) {
1026+
// Handle URL rewriting case
1027+
urlPrefix = (pathConfig as { path: string; url_rewrite_prefix?: string }).url_rewrite_prefix || '';
1028+
urlPrefix = urlPrefix.endsWith('/') ? urlPrefix.slice(0, -1) : urlPrefix;
1029+
} else {
1030+
// Handle direct file path case
1031+
const dirPrefix = typeof pathConfig === 'string' ? pathConfig : pathConfig.path;
1032+
const cleanedDirPrefix = dirPrefix.replace(/^\.\/+/, '');
1033+
urlPrefix = `file://${cleanedDirPrefix}`;
1034+
}
1035+
1036+
logger.debug(`Checking for obsolete chunks with URL prefix: ${urlPrefix}`);
1037+
const response = await client.scroll(collectionName, {
1038+
limit: 10000,
1039+
with_payload: true,
1040+
with_vector: false,
1041+
filter: {
1042+
must: [
1043+
{
1044+
key: "url",
1045+
match: {
1046+
text: urlPrefix
1047+
}
1048+
}
1049+
],
1050+
must_not: [
1051+
{
1052+
key: "is_metadata",
1053+
match: {
1054+
value: true
1055+
}
1056+
}
1057+
]
1058+
}
1059+
});
1060+
1061+
const obsoletePointIds = response.points
1062+
.filter((point: any) => {
1063+
const url = point.payload?.url;
1064+
// Double check it's not a metadata record
1065+
if (point.payload?.is_metadata === true) {
1066+
return false;
1067+
}
1068+
1069+
if (!url || !url.startsWith(urlPrefix)) {
1070+
return false;
1071+
}
1072+
1073+
let filePath: string;
1074+
1075+
if (isRewriteMode) {
1076+
// URL rewrite mode: extract relative path and construct full file path
1077+
const config = pathConfig as { path: string; url_rewrite_prefix?: string };
1078+
const relativePath = url.substring(urlPrefix.length + 1); // +1 for the '/'
1079+
filePath = path.join(config.path, relativePath);
1080+
} else {
1081+
// Direct file path mode: remove file:// prefix to match with processedFiles
1082+
filePath = url.startsWith('file://') ? url.substring(7) : '';
1083+
}
1084+
1085+
return filePath && !processedFiles.has(filePath);
1086+
})
1087+
.map((point: any) => point.id);
1088+
1089+
if (obsoletePointIds.length > 0) {
1090+
await client.delete(collectionName, {
1091+
points: obsoletePointIds,
1092+
});
1093+
logger.info(`Deleted ${obsoletePointIds.length} obsolete chunks from Qdrant for URL prefix ${urlPrefix}`);
1094+
} else {
1095+
logger.info(`No obsolete chunks to delete from Qdrant for URL prefix ${urlPrefix}`);
1096+
}
1097+
} catch (error) {
1098+
logger.error(`Error removing obsolete chunks from Qdrant:`, error);
1099+
}
1100+
}
1101+
9221102
private async initDatabase(config: SourceConfig, parentLogger: Logger): Promise<DatabaseConnection> {
9231103
const logger = parentLogger.child('database');
9241104
const dbConfig = config.database_config;
@@ -1114,93 +1294,6 @@ class Doc2Vec {
11141294

11151295
logger.info(`Deleted ${deletedCount} obsolete chunks from SQLite for URL ${urlPrefix}`);
11161296
}
1117-
1118-
private removeObsoleteFilesSQLite(db: Database, processedFiles: Set<string>, dirPrefix: string, logger: Logger) {
1119-
const getChunksForDirStmt = db.prepare(`
1120-
SELECT chunk_id, url FROM vec_items
1121-
WHERE url LIKE 'file://%' AND url LIKE ?
1122-
`);
1123-
const deleteChunkStmt = db.prepare(`DELETE FROM vec_items WHERE chunk_id = ?`);
1124-
1125-
const cleanedDirPrefix = dirPrefix.replace(/^\.\/+/, '');
1126-
const filePrefix = `file://${cleanedDirPrefix}`;
1127-
1128-
const existingChunks = getChunksForDirStmt.all(`${filePrefix}%`) as { chunk_id: string; url: string }[];
1129-
let deletedCount = 0;
1130-
1131-
const transaction = db.transaction(() => {
1132-
for (const { chunk_id, url } of existingChunks) {
1133-
// Remove file:// prefix to match with processedFiles
1134-
const filePath = url.substring(7);
1135-
if (!processedFiles.has(filePath)) {
1136-
logger.debug(`Deleting obsolete chunk from SQLite: ${chunk_id.substring(0, 8)}... (File not processed)`);
1137-
deleteChunkStmt.run(chunk_id);
1138-
deletedCount++;
1139-
}
1140-
}
1141-
});
1142-
transaction();
1143-
1144-
logger.info(`Deleted ${deletedCount} obsolete chunks from SQLite for directory ${dirPrefix}`);
1145-
}
1146-
1147-
private async removeObsoleteFilesQdrant(db: QdrantDB, processedFiles: Set<string>, dirPrefix: string, logger: Logger) {
1148-
const { client, collectionName } = db;
1149-
try {
1150-
1151-
const cleanedDirPrefix = dirPrefix.replace(/^\.\/+/, '');
1152-
const filePrefix = `file://${cleanedDirPrefix}`;
1153-
1154-
// Get all points that match the file prefix but are not metadata points
1155-
const response = await client.scroll(collectionName, {
1156-
limit: 10000,
1157-
with_payload: true,
1158-
with_vector: false,
1159-
filter: {
1160-
must: [
1161-
{
1162-
key: "url",
1163-
match: {
1164-
text: `${filePrefix}`
1165-
}
1166-
}
1167-
],
1168-
must_not: [
1169-
{
1170-
key: "is_metadata",
1171-
match: {
1172-
value: true
1173-
}
1174-
}
1175-
]
1176-
}
1177-
});
1178-
1179-
const obsoletePointIds = response.points
1180-
.filter((point: any) => {
1181-
const url = point.payload?.url;
1182-
// Double check it's not a metadata record
1183-
if (point.payload?.is_metadata === true) {
1184-
return false;
1185-
}
1186-
// Remove file:// prefix to match with processedFiles
1187-
const filePath = url && url.startsWith('file://') ? url.substring(7) : '';
1188-
return filePath && !processedFiles.has(filePath);
1189-
})
1190-
.map((point: any) => point.id);
1191-
1192-
if (obsoletePointIds.length > 0) {
1193-
await client.delete(collectionName, {
1194-
points: obsoletePointIds,
1195-
});
1196-
logger.info(`Deleted ${obsoletePointIds.length} obsolete chunks from Qdrant for directory ${dirPrefix}`);
1197-
} else {
1198-
logger.info(`No obsolete chunks to delete from Qdrant for directory ${dirPrefix}`);
1199-
}
1200-
} catch (error) {
1201-
logger.error(`Error removing obsolete chunks from Qdrant:`, error);
1202-
}
1203-
}
12041297

12051298
private isValidUuid(str: string): boolean {
12061299
const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i;

package.json

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,19 @@
11
{
22
"name": "doc2vec",
33
"version": "1.0.0",
4+
"type": "commonjs",
45
"description": "",
56
"main": "doc2vec.ts",
7+
"private": false,
8+
"publishConfig": {
9+
"access": "public"
10+
},
11+
"bin": {
12+
"doc2vec": "dist/doc2vec.js"
13+
},
614
"scripts": {
7-
"start": "ts-node doc2vec.ts"
15+
"build": "tsc && chmod 755 dist/doc2vec.js",
16+
"start": "node dist/doc2vec.js"
817
},
918
"keywords": [],
1019
"author": "",

tsconfig.json

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"compilerOptions": {
3-
"target": "es2022",
4-
"module": "commonjs",
3+
"target": "ES2020",
4+
"module": "CommonJS",
55
"lib": ["es2022", "dom"],
66
"declaration": false,
77
"sourceMap": false,
@@ -11,8 +11,8 @@
1111
"esModuleInterop": true,
1212
"skipLibCheck": true,
1313
"forceConsistentCasingInFileNames": true,
14-
"moduleResolution": "node"
14+
"moduleResolution": "node"
1515
},
1616
"include": ["**/*.ts"],
17-
"exclude": ["node_modules"]
17+
"exclude": ["node_modules", "mcp", "solo-reference-architectures"]
1818
}

0 commit comments

Comments
 (0)