|
| 1 | +#!/usr/bin/env ts-node |
| 2 | + |
1 | 3 | import { Readability } from '@mozilla/readability';
|
2 | 4 | import axios from 'axios';
|
3 | 5 | import { load } from 'cheerio';
|
@@ -41,6 +43,7 @@ interface LocalDirectorySourceConfig extends BaseSourceConfig {
|
41 | 43 | exclude_extensions?: string[]; // File extensions to exclude
|
42 | 44 | recursive?: boolean; // Whether to traverse subdirectories
|
43 | 45 | encoding?: BufferEncoding; // File encoding (default: 'utf8')
|
| 46 | + url_rewrite_prefix?: string; // Optional URL prefix to rewrite file:// URLs (e.g., 'https://mydomain.com') |
44 | 47 | }
|
45 | 48 |
|
46 | 49 | // Configuration specific to website sources
|
@@ -723,8 +726,33 @@ class Doc2Vec {
|
723 | 726 |
|
724 | 727 | logger.info(`Processing content from ${filePath} (${content.length} chars)`);
|
725 | 728 | try {
|
726 |
| - // Use the file path as URL for tracking |
727 |
| - const fileUrl = `file://${filePath}`; |
| 729 | + // Generate URL based on configuration |
| 730 | + let fileUrl: string; |
| 731 | + |
| 732 | + if (config.url_rewrite_prefix) { |
| 733 | + // Replace local path with URL prefix |
| 734 | + const relativePath = path.relative(config.path, filePath).replace(/\\/g, '/'); |
| 735 | + |
| 736 | + // If relativePath starts with '..', it means the file is outside the base directory |
| 737 | + if (relativePath.startsWith('..')) { |
| 738 | + // For files outside the configured path, use the default file:// scheme |
| 739 | + fileUrl = `file://${filePath}`; |
| 740 | + logger.debug(`File outside configured path, using default URL: ${fileUrl}`); |
| 741 | + } else { |
| 742 | + // For files inside the configured path, rewrite the URL |
| 743 | + // Handle trailing slashes in the URL prefix to avoid double slashes |
| 744 | + const prefix = config.url_rewrite_prefix.endsWith('/') |
| 745 | + ? config.url_rewrite_prefix.slice(0, -1) |
| 746 | + : config.url_rewrite_prefix; |
| 747 | + |
| 748 | + fileUrl = `${prefix}/${relativePath}`; |
| 749 | + logger.debug(`URL rewritten: ${filePath} -> ${fileUrl}`); |
| 750 | + } |
| 751 | + } else { |
| 752 | + // Use default file:// URL |
| 753 | + fileUrl = `file://${filePath}`; |
| 754 | + } |
| 755 | + |
728 | 756 | const chunks = await this.chunkMarkdown(content, config, fileUrl);
|
729 | 757 | logger.info(`Created ${chunks.length} chunks`);
|
730 | 758 |
|
@@ -804,15 +832,13 @@ class Doc2Vec {
|
804 | 832 | logger
|
805 | 833 | );
|
806 | 834 |
|
807 |
| - logger.info(`Found ${validChunkIds.size} valid chunks across processed files for ${config.path}`); |
808 |
| - |
809 | 835 | logger.section('CLEANUP');
|
810 | 836 | if (dbConnection.type === 'sqlite') {
|
811 | 837 | logger.info(`Running SQLite cleanup for local directory ${config.path}`);
|
812 |
| - this.removeObsoleteFilesSQLite(dbConnection.db, processedFiles, config.path, logger); |
| 838 | + this.removeObsoleteFilesSQLite(dbConnection.db, processedFiles, config, logger); |
813 | 839 | } else if (dbConnection.type === 'qdrant') {
|
814 | 840 | logger.info(`Running Qdrant cleanup for local directory ${config.path} in collection ${dbConnection.collectionName}`);
|
815 |
| - await this.removeObsoleteFilesQdrant(dbConnection, processedFiles, config.path, logger); |
| 841 | + await this.removeObsoleteFilesQdrant(dbConnection, processedFiles, config, logger); |
816 | 842 | }
|
817 | 843 |
|
818 | 844 | logger.info(`Finished processing local directory: ${config.path}`);
|
@@ -919,6 +945,160 @@ class Doc2Vec {
|
919 | 945 | }
|
920 | 946 | }
|
921 | 947 |
|
| 948 | + private removeObsoleteFilesSQLite( |
| 949 | + db: Database, |
| 950 | + processedFiles: Set<string>, |
| 951 | + pathConfig: { path: string; url_rewrite_prefix?: string } | string, |
| 952 | + logger: Logger |
| 953 | + ) { |
| 954 | + const getChunksForPathStmt = db.prepare(` |
| 955 | + SELECT chunk_id, url FROM vec_items |
| 956 | + WHERE url LIKE ? || '%' |
| 957 | + `); |
| 958 | + const deleteChunkStmt = db.prepare(`DELETE FROM vec_items WHERE chunk_id = ?`); |
| 959 | + |
| 960 | + // Determine if we're using URL rewriting or direct file paths |
| 961 | + const isRewriteMode = typeof pathConfig === 'object' && pathConfig.url_rewrite_prefix; |
| 962 | + |
| 963 | + // Set up the URL prefix for searching |
| 964 | + let urlPrefix: string; |
| 965 | + if (isRewriteMode) { |
| 966 | + // Handle URL rewriting case |
| 967 | + urlPrefix = (pathConfig as { path: string; url_rewrite_prefix?: string }).url_rewrite_prefix || ''; |
| 968 | + urlPrefix = urlPrefix.endsWith('/') ? urlPrefix.slice(0, -1) : urlPrefix; |
| 969 | + } else { |
| 970 | + // Handle direct file path case |
| 971 | + const dirPrefix = typeof pathConfig === 'string' ? pathConfig : pathConfig.path; |
| 972 | + const cleanedDirPrefix = dirPrefix.replace(/^\.\/+/, ''); |
| 973 | + urlPrefix = `file://${cleanedDirPrefix}`; |
| 974 | + } |
| 975 | + |
| 976 | + logger.debug(`Searching for chunks with URL prefix: ${urlPrefix}`); |
| 977 | + const existingChunks = getChunksForPathStmt.all(urlPrefix) as { chunk_id: string; url: string }[]; |
| 978 | + let deletedCount = 0; |
| 979 | + |
| 980 | + const transaction = db.transaction(() => { |
| 981 | + for (const { chunk_id, url } of existingChunks) { |
| 982 | + // Skip if it's not from our URL prefix (safety check) |
| 983 | + if (!url.startsWith(urlPrefix)) continue; |
| 984 | + |
| 985 | + let filePath: string; |
| 986 | + let shouldDelete = false; |
| 987 | + |
| 988 | + if (isRewriteMode) { |
| 989 | + // URL rewrite mode: extract relative path and construct full file path |
| 990 | + const config = pathConfig as { path: string; url_rewrite_prefix?: string }; |
| 991 | + const relativePath = url.substring(urlPrefix.length + 1); // +1 for the '/' |
| 992 | + filePath = path.join(config.path, relativePath); |
| 993 | + shouldDelete = !processedFiles.has(filePath); |
| 994 | + } else { |
| 995 | + // Direct file path mode: remove file:// prefix to match with processedFiles |
| 996 | + filePath = url.substring(7); // Remove 'file://' prefix |
| 997 | + shouldDelete = !processedFiles.has(filePath); |
| 998 | + } |
| 999 | + |
| 1000 | + if (shouldDelete) { |
| 1001 | + logger.debug(`Deleting obsolete chunk from SQLite: ${chunk_id.substring(0, 8)}... (File not processed: ${filePath})`); |
| 1002 | + deleteChunkStmt.run(chunk_id); |
| 1003 | + deletedCount++; |
| 1004 | + } |
| 1005 | + } |
| 1006 | + }); |
| 1007 | + transaction(); |
| 1008 | + |
| 1009 | + logger.info(`Deleted ${deletedCount} obsolete chunks from SQLite for URL prefix ${urlPrefix}`); |
| 1010 | + } |
| 1011 | + |
| 1012 | + private async removeObsoleteFilesQdrant( |
| 1013 | + db: QdrantDB, |
| 1014 | + processedFiles: Set<string>, |
| 1015 | + pathConfig: { path: string; url_rewrite_prefix?: string } | string, |
| 1016 | + logger: Logger |
| 1017 | + ) { |
| 1018 | + const { client, collectionName } = db; |
| 1019 | + try { |
| 1020 | + // Determine if we're using URL rewriting or direct file paths |
| 1021 | + const isRewriteMode = typeof pathConfig === 'object' && pathConfig.url_rewrite_prefix; |
| 1022 | + |
| 1023 | + // Set up the URL prefix for searching |
| 1024 | + let urlPrefix: string; |
| 1025 | + if (isRewriteMode) { |
| 1026 | + // Handle URL rewriting case |
| 1027 | + urlPrefix = (pathConfig as { path: string; url_rewrite_prefix?: string }).url_rewrite_prefix || ''; |
| 1028 | + urlPrefix = urlPrefix.endsWith('/') ? urlPrefix.slice(0, -1) : urlPrefix; |
| 1029 | + } else { |
| 1030 | + // Handle direct file path case |
| 1031 | + const dirPrefix = typeof pathConfig === 'string' ? pathConfig : pathConfig.path; |
| 1032 | + const cleanedDirPrefix = dirPrefix.replace(/^\.\/+/, ''); |
| 1033 | + urlPrefix = `file://${cleanedDirPrefix}`; |
| 1034 | + } |
| 1035 | + |
| 1036 | + logger.debug(`Checking for obsolete chunks with URL prefix: ${urlPrefix}`); |
| 1037 | + const response = await client.scroll(collectionName, { |
| 1038 | + limit: 10000, |
| 1039 | + with_payload: true, |
| 1040 | + with_vector: false, |
| 1041 | + filter: { |
| 1042 | + must: [ |
| 1043 | + { |
| 1044 | + key: "url", |
| 1045 | + match: { |
| 1046 | + text: urlPrefix |
| 1047 | + } |
| 1048 | + } |
| 1049 | + ], |
| 1050 | + must_not: [ |
| 1051 | + { |
| 1052 | + key: "is_metadata", |
| 1053 | + match: { |
| 1054 | + value: true |
| 1055 | + } |
| 1056 | + } |
| 1057 | + ] |
| 1058 | + } |
| 1059 | + }); |
| 1060 | + |
| 1061 | + const obsoletePointIds = response.points |
| 1062 | + .filter((point: any) => { |
| 1063 | + const url = point.payload?.url; |
| 1064 | + // Double check it's not a metadata record |
| 1065 | + if (point.payload?.is_metadata === true) { |
| 1066 | + return false; |
| 1067 | + } |
| 1068 | + |
| 1069 | + if (!url || !url.startsWith(urlPrefix)) { |
| 1070 | + return false; |
| 1071 | + } |
| 1072 | + |
| 1073 | + let filePath: string; |
| 1074 | + |
| 1075 | + if (isRewriteMode) { |
| 1076 | + // URL rewrite mode: extract relative path and construct full file path |
| 1077 | + const config = pathConfig as { path: string; url_rewrite_prefix?: string }; |
| 1078 | + const relativePath = url.substring(urlPrefix.length + 1); // +1 for the '/' |
| 1079 | + filePath = path.join(config.path, relativePath); |
| 1080 | + } else { |
| 1081 | + // Direct file path mode: remove file:// prefix to match with processedFiles |
| 1082 | + filePath = url.startsWith('file://') ? url.substring(7) : ''; |
| 1083 | + } |
| 1084 | + |
| 1085 | + return filePath && !processedFiles.has(filePath); |
| 1086 | + }) |
| 1087 | + .map((point: any) => point.id); |
| 1088 | + |
| 1089 | + if (obsoletePointIds.length > 0) { |
| 1090 | + await client.delete(collectionName, { |
| 1091 | + points: obsoletePointIds, |
| 1092 | + }); |
| 1093 | + logger.info(`Deleted ${obsoletePointIds.length} obsolete chunks from Qdrant for URL prefix ${urlPrefix}`); |
| 1094 | + } else { |
| 1095 | + logger.info(`No obsolete chunks to delete from Qdrant for URL prefix ${urlPrefix}`); |
| 1096 | + } |
| 1097 | + } catch (error) { |
| 1098 | + logger.error(`Error removing obsolete chunks from Qdrant:`, error); |
| 1099 | + } |
| 1100 | + } |
| 1101 | + |
922 | 1102 | private async initDatabase(config: SourceConfig, parentLogger: Logger): Promise<DatabaseConnection> {
|
923 | 1103 | const logger = parentLogger.child('database');
|
924 | 1104 | const dbConfig = config.database_config;
|
@@ -1114,93 +1294,6 @@ class Doc2Vec {
|
1114 | 1294 |
|
1115 | 1295 | logger.info(`Deleted ${deletedCount} obsolete chunks from SQLite for URL ${urlPrefix}`);
|
1116 | 1296 | }
|
1117 |
| - |
1118 |
| - private removeObsoleteFilesSQLite(db: Database, processedFiles: Set<string>, dirPrefix: string, logger: Logger) { |
1119 |
| - const getChunksForDirStmt = db.prepare(` |
1120 |
| - SELECT chunk_id, url FROM vec_items |
1121 |
| - WHERE url LIKE 'file://%' AND url LIKE ? |
1122 |
| - `); |
1123 |
| - const deleteChunkStmt = db.prepare(`DELETE FROM vec_items WHERE chunk_id = ?`); |
1124 |
| - |
1125 |
| - const cleanedDirPrefix = dirPrefix.replace(/^\.\/+/, ''); |
1126 |
| - const filePrefix = `file://${cleanedDirPrefix}`; |
1127 |
| - |
1128 |
| - const existingChunks = getChunksForDirStmt.all(`${filePrefix}%`) as { chunk_id: string; url: string }[]; |
1129 |
| - let deletedCount = 0; |
1130 |
| - |
1131 |
| - const transaction = db.transaction(() => { |
1132 |
| - for (const { chunk_id, url } of existingChunks) { |
1133 |
| - // Remove file:// prefix to match with processedFiles |
1134 |
| - const filePath = url.substring(7); |
1135 |
| - if (!processedFiles.has(filePath)) { |
1136 |
| - logger.debug(`Deleting obsolete chunk from SQLite: ${chunk_id.substring(0, 8)}... (File not processed)`); |
1137 |
| - deleteChunkStmt.run(chunk_id); |
1138 |
| - deletedCount++; |
1139 |
| - } |
1140 |
| - } |
1141 |
| - }); |
1142 |
| - transaction(); |
1143 |
| - |
1144 |
| - logger.info(`Deleted ${deletedCount} obsolete chunks from SQLite for directory ${dirPrefix}`); |
1145 |
| - } |
1146 |
| - |
1147 |
| - private async removeObsoleteFilesQdrant(db: QdrantDB, processedFiles: Set<string>, dirPrefix: string, logger: Logger) { |
1148 |
| - const { client, collectionName } = db; |
1149 |
| - try { |
1150 |
| - |
1151 |
| - const cleanedDirPrefix = dirPrefix.replace(/^\.\/+/, ''); |
1152 |
| - const filePrefix = `file://${cleanedDirPrefix}`; |
1153 |
| - |
1154 |
| - // Get all points that match the file prefix but are not metadata points |
1155 |
| - const response = await client.scroll(collectionName, { |
1156 |
| - limit: 10000, |
1157 |
| - with_payload: true, |
1158 |
| - with_vector: false, |
1159 |
| - filter: { |
1160 |
| - must: [ |
1161 |
| - { |
1162 |
| - key: "url", |
1163 |
| - match: { |
1164 |
| - text: `${filePrefix}` |
1165 |
| - } |
1166 |
| - } |
1167 |
| - ], |
1168 |
| - must_not: [ |
1169 |
| - { |
1170 |
| - key: "is_metadata", |
1171 |
| - match: { |
1172 |
| - value: true |
1173 |
| - } |
1174 |
| - } |
1175 |
| - ] |
1176 |
| - } |
1177 |
| - }); |
1178 |
| - |
1179 |
| - const obsoletePointIds = response.points |
1180 |
| - .filter((point: any) => { |
1181 |
| - const url = point.payload?.url; |
1182 |
| - // Double check it's not a metadata record |
1183 |
| - if (point.payload?.is_metadata === true) { |
1184 |
| - return false; |
1185 |
| - } |
1186 |
| - // Remove file:// prefix to match with processedFiles |
1187 |
| - const filePath = url && url.startsWith('file://') ? url.substring(7) : ''; |
1188 |
| - return filePath && !processedFiles.has(filePath); |
1189 |
| - }) |
1190 |
| - .map((point: any) => point.id); |
1191 |
| - |
1192 |
| - if (obsoletePointIds.length > 0) { |
1193 |
| - await client.delete(collectionName, { |
1194 |
| - points: obsoletePointIds, |
1195 |
| - }); |
1196 |
| - logger.info(`Deleted ${obsoletePointIds.length} obsolete chunks from Qdrant for directory ${dirPrefix}`); |
1197 |
| - } else { |
1198 |
| - logger.info(`No obsolete chunks to delete from Qdrant for directory ${dirPrefix}`); |
1199 |
| - } |
1200 |
| - } catch (error) { |
1201 |
| - logger.error(`Error removing obsolete chunks from Qdrant:`, error); |
1202 |
| - } |
1203 |
| - } |
1204 | 1297 |
|
1205 | 1298 | private isValidUuid(str: string): boolean {
|
1206 | 1299 | const uuidRegex = /^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i;
|
|
0 commit comments