1+ // bin/commands/data/ingest.js
2+ import fs from 'fs' ;
3+ import path from 'path' ;
4+ import chalk from 'chalk' ;
5+ import { isConfigValid } from '../../utils/validation.js' ;
6+ import MongoRAG from '../../../src/core/MongoRAG.js' ;
7+ import { parseDocument } from '../../utils/document-parsers.js' ;
8+ import { DocumentChunker } from '../../utils/chunking.js' ;
9+
/**
 * Ingest documents from a single file or a directory into MongoDB via MongoRAG.
 *
 * Documents are loaded with `processDirectory` (when `options.directory` is
 * set) or `processFile` (when `options.file` is set). When `options.chunkSize`
 * is truthy each document is split with `DocumentChunker` before the batch is
 * handed to `MongoRAG#ingestBatch`.
 *
 * @param {object} config - MongoRAG configuration; validated with `isConfigValid`.
 * @param {object} [options] - Command options; the whole object is also
 *   forwarded to `processDirectory` / `processFile`.
 * @param {string} [options.directory] - Directory to ingest (checked before `file`).
 * @param {string} [options.file] - Single file to ingest.
 * @param {number} [options.chunkSize] - Enables chunking when truthy.
 * @param {number} [options.chunkOverlap] - Forwarded to `DocumentChunker`.
 * @param {string} [options.chunkMethod] - Forwarded to `DocumentChunker` as `method`.
 * @param {string} [options.database] - Forwarded to `ingestBatch`.
 * @param {string} [options.collection] - Forwarded to `ingestBatch`.
 * @returns {Promise<object>} The value resolved by `rag.ingestBatch`; its
 *   `processed` field is printed in the success message.
 * @throws {Error} When the configuration is invalid, when neither `file` nor
 *   `directory` is given, or when loading, chunking, connecting or ingesting
 *   fails (those errors are logged and then rethrown unchanged).
 */
export async function ingestData(config, options = {}) {
  if (!isConfigValid(config)) {
    throw new Error("Configuration missing. Run 'npx mongodb-rag init' first.");
  }

  const isDevelopment =
    process.env.NODE_ENV === 'development' || process.env.NODE_ENV === 'test';

  // Declared outside the try block so the finally clause can release it.
  let rag;

  try {
    let documents = [];

    if (options.directory) {
      // Directory ingestion takes precedence when both options are supplied.
      if (isDevelopment) {
        console.log(chalk.blue(`📂 Processing directory: ${options.directory}`));
      }
      documents = await processDirectory(options.directory, options);
    } else if (options.file) {
      if (isDevelopment) {
        console.log(chalk.blue(`📄 Processing file: ${options.file}`));
      }
      documents = await processFile(options.file, options);
    } else {
      throw new Error('Either --file or --directory option must be specified');
    }

    if (options.chunkSize) {
      const chunker = new DocumentChunker({
        chunkSize: options.chunkSize,
        chunkOverlap: options.chunkOverlap,
        method: options.chunkMethod,
      });

      const chunkedDocs = [];
      for (const doc of documents) {
        if (isDevelopment) {
          // Documents read straight from JSON files are passed through as-is
          // and may not carry a `metadata` object, so guard the lookup.
          const label = (doc && doc.metadata && doc.metadata.filename) || '(unnamed document)';
          console.log(chalk.blue(`📄 Chunking document: ${label}`));
        }

        const chunks = chunker.chunkDocument(doc);
        // Append one by one: `push(...chunks)` passes every chunk as a call
        // argument and throws a RangeError once the list gets large enough.
        for (const chunk of chunks) {
          chunkedDocs.push(chunk);
        }

        if (isDevelopment) {
          console.log(chalk.green(`✅ Created ${chunks.length} chunks`));
        }
      }
      documents = chunkedDocs;
    }

    if (isDevelopment) {
      console.log(chalk.blue(`📊 Found ${documents.length} documents to process`));
    }

    rag = new MongoRAG(config);
    await rag.connect();

    const result = await rag.ingestBatch(documents, {
      database: options.database,
      collection: options.collection,
    });

    console.log(chalk.green(`✅ Successfully ingested ${result.processed} documents!`));
    return result;
  } catch (error) {
    console.error(chalk.red('❌ Ingestion failed:'), error.message);
    throw error;
  } finally {
    // NOTE(review): assumes MongoRAG exposes `close()` to release the client
    // opened by `connect()` — confirm against src/core/MongoRAG.js. The call is
    // guarded so it is simply skipped if no such method exists.
    if (rag && typeof rag.close === 'function') {
      try {
        await rag.close();
      } catch (closeError) {
        // A failing cleanup must not mask the ingestion result or its error.
        console.warn(chalk.yellow('⚠️ Failed to close MongoDB connection:'), closeError.message);
      }
    }
  }
}
76+
/**
 * Collect documents from every regular file in a directory.
 *
 * Each entry is `stat`ed (symlinks are followed); regular files are handed to
 * `processFile`, and sub-directories are walked only when `options.recursive`
 * is truthy. Entries that are neither files nor directories are ignored.
 *
 * @param {string} dirPath - Directory to scan.
 * @param {object} options - Command options; `recursive` is read here and the
 *   whole object is forwarded to `processFile`.
 * @returns {Promise<Array>} All documents returned by `processFile`, in
 *   directory-listing order.
 * @throws {Error} When `dirPath` itself cannot be listed.
 */
async function processDirectory(dirPath, options) {
  const documents = [];
  // Listing the requested directory is deliberately not guarded: a wrong
  // path should fail loudly rather than ingest nothing.
  const entries = fs.readdirSync(dirPath);

  for (const entry of entries) {
    const entryPath = path.join(dirPath, entry);

    let stat;
    try {
      stat = fs.statSync(entryPath);
    } catch (error) {
      // A dangling symlink or unreadable entry should not abort the whole
      // walk; skip it, matching processFile's skip-on-failure behaviour.
      console.warn(chalk.yellow(`⚠️ Skipping unreadable entry ${entryPath}:`), error.message);
      continue;
    }

    let found = [];
    if (stat.isDirectory()) {
      if (options.recursive) {
        found = await processDirectory(entryPath, options);
      }
    } else if (stat.isFile()) {
      found = await processFile(entryPath, options);
    }

    // Append one by one: `push(...found)` passes every document as a call
    // argument and throws a RangeError for very large JSON arrays.
    for (const doc of found) {
      documents.push(doc);
    }
  }

  return documents;
}
96+
/**
 * Load the documents contained in a single file.
 *
 * `.json` files (extension compared case-insensitively) are parsed directly:
 * a top-level array is returned as-is, any other value is wrapped in a
 * one-element array. Every other extension is delegated to `parseDocument`.
 *
 * Failures are best-effort: the error is reported on stderr and an empty
 * array is returned so that one bad file does not stop a larger ingestion.
 *
 * @param {string} filePath - Path of the file to load.
 * @param {object} options - Command options, forwarded to `parseDocument`.
 * @returns {Promise<Array>} The documents found in the file, or `[]` when the
 *   file could not be read or parsed.
 */
async function processFile(filePath, options) {
  const ext = path.extname(filePath).toLowerCase();
  const isDevelopment =
    process.env.NODE_ENV === 'development' || process.env.NODE_ENV === 'test';

  try {
    if (ext === '.json') {
      const raw = fs.readFileSync(filePath, 'utf-8');
      // A UTF-8 byte-order mark is not valid JSON and would make JSON.parse
      // reject an otherwise well-formed file.
      const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
      const data = JSON.parse(text);
      return Array.isArray(data) ? data : [data];
    }

    const doc = await parseDocument(filePath, null, options);

    if (isDevelopment) {
      console.log(chalk.blue(`📄 Processed ${filePath}`));
      // Guard the lookup: logging must never turn a successfully parsed
      // document into a skipped one just because it carries no metadata.
      const metadata = doc && doc.metadata;
      if (metadata && metadata.processingFailed) {
        console.warn(chalk.yellow(`⚠️ Warning: ${metadata.error}`));
      }
    }

    return [doc];
  } catch (error) {
    // Always report the skip; otherwise a production run can finish
    // "successfully" with documents missing and no hint as to why.
    console.error(chalk.red(`❌ Failed to process ${filePath}:`), error.message);
    return []; // Skip failed files
  }
}
0 commit comments