Skip to content

Commit b658f50

Browse files
Support XLSX files (#2403)
* support xlsx files * lint * create separate docs for each xlsx sheet * lint * use node-xlsx pkg for parsing xlsx files * lint * update error handling --------- Co-authored-by: timothycarambat <[email protected]>
1 parent 93d6464 commit b658f50

File tree

4 files changed

+132
-1
lines changed

4 files changed

+132
-1
lines changed

collector/package.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"mime": "^3.0.0",
3434
"moment": "^2.29.4",
3535
"node-html-parser": "^6.1.13",
36+
"node-xlsx": "^0.24.0",
3637
"officeparser": "^4.0.5",
3738
"openai": "4.38.5",
3839
"pdf-parse": "^1.1.1",
@@ -48,4 +49,4 @@
4849
"nodemon": "^2.0.22",
4950
"prettier": "^2.4.1"
5051
}
51-
}
52+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
const { v4 } = require("uuid");
2+
const xlsx = require("node-xlsx").default;
3+
const path = require("path");
4+
const fs = require("fs");
5+
const {
6+
createdDate,
7+
trashFile,
8+
writeToServerDocuments,
9+
} = require("../../utils/files");
10+
const { tokenizeString } = require("../../utils/tokenizer");
11+
const { default: slugify } = require("slugify");
12+
13+
/**
 * Serializes a 2D array of cell values into CSV text.
 *
 * Rows are joined with "\n" and cells with ",". `null`/`undefined` cells become
 * empty fields. String cells containing a comma, a double quote, or a line
 * break are wrapped in double quotes, with embedded double quotes doubled, so
 * that a cell can never be mistaken for a field or row boundary.
 * Non-string cells are passed through and stringified by `Array.prototype.join`.
 *
 * @param {Array<Array<*>>} data - Rows of cell values.
 * @returns {string} CSV text; an empty string when `data` has no rows.
 */
function convertToCSV(data) {
  // Characters that force a field to be quoted.
  const needsQuoting = /[",\r\n]/;

  const escapeCell = (cell) => {
    if (cell === null || cell === undefined) return "";
    if (typeof cell !== "string") return cell;
    if (!needsQuoting.test(cell)) return cell;
    // Double embedded quotes so the wrapping quotes stay unambiguous.
    return `"${cell.replace(/"/g, '""')}"`;
  };

  return data.map((row) => row.map(escapeCell).join(",")).join("\n");
}
27+
28+
/**
 * Converts an .xlsx workbook into one document per non-empty sheet.
 *
 * The workbook is parsed with node-xlsx, each sheet is serialized to CSV via
 * `convertToCSV`, and the resulting record is handed to
 * `writeToServerDocuments` together with a per-upload output folder. A sheet
 * that fails is logged and skipped so the remaining sheets are still processed.
 * `trashFile(fullFilePath)` is always called once parsing has been attempted.
 *
 * @param {Object} params
 * @param {string} [params.fullFilePath=""] - Path of the .xlsx file to parse.
 * @param {string} [params.filename=""] - Name used for titles, messages and the output folder slug.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success: false` with a reason when the workbook cannot be processed or
 *   yields no documents; otherwise the list of values returned by
 *   `writeToServerDocuments`.
 */
async function asXlsx({ fullFilePath = "", filename = "" }) {
  const documents = [];
  // A short random suffix keeps repeat uploads of the same filename in separate folders.
  const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
    lower: true,
    trim: true,
  });

  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../server/storage/documents/${folderName}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);

  try {
    const workSheetsFromFile = xlsx.parse(fullFilePath);
    if (!fs.existsSync(outFolderPath))
      fs.mkdirSync(outFolderPath, { recursive: true });

    for (const sheet of workSheetsFromFile) {
      // Read the name outside the inner try: a `const` declared inside the
      // try block is not in scope in its catch handler, so referencing it
      // there would throw a ReferenceError and abort the whole workbook.
      const name = sheet?.name;

      try {
        const content = convertToCSV(sheet.data);

        if (!content?.length) {
          console.warn(`Sheet "${name}" is empty. Skipping.`);
          continue;
        }

        console.log(`-- Processing sheet: ${name} --`);
        const sheetData = {
          id: v4(),
          url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
          title: `${filename} - Sheet:${name}`,
          docAuthor: "Unknown",
          description: `Spreadsheet data from sheet: ${name}`,
          docSource: "an xlsx file uploaded by the user.",
          chunkSource: "",
          published: createdDate(fullFilePath),
          wordCount: content.split(/\s+/).length,
          pageContent: content,
          token_count_estimate: tokenizeString(content).length,
        };

        const document = writeToServerDocuments(
          sheetData,
          `sheet-${slugify(name)}`,
          outFolderPath
        );
        documents.push(document);
        console.log(
          `[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
        );
      } catch (err) {
        // Best effort: one bad sheet must not fail the rest of the workbook.
        console.error(`Error processing sheet "${name}":`, err);
        continue;
      }
    }
  } catch (err) {
    console.error("Could not process xlsx file!", err);
    return {
      success: false,
      reason: `Error processing ${filename}: ${err.message}`,
      documents: [],
    };
  } finally {
    // Runs on both the success and failure paths, including the early return above.
    trashFile(fullFilePath);
  }

  if (documents.length === 0) {
    console.error(`No valid sheets found in ${filename}.`);
    return {
      success: false,
      reason: `No valid sheets found in ${filename}.`,
      documents: [],
    };
  }

  console.log(
    `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
  );
  return { success: true, reason: null, documents };
}
112+
113+
module.exports = asXlsx;

collector/utils/constants.js

+6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ const ACCEPTED_MIMES = {
1111
".pptx",
1212
],
1313

14+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [
15+
".xlsx",
16+
],
17+
1418
"application/vnd.oasis.opendocument.text": [".odt"],
1519
"application/vnd.oasis.opendocument.presentation": [".odp"],
1620

@@ -41,6 +45,8 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
4145
".odt": "./convert/asOfficeMime.js",
4246
".odp": "./convert/asOfficeMime.js",
4347

48+
".xlsx": "./convert/asXlsx.js",
49+
4450
".mbox": "./convert/asMbox.js",
4551

4652
".epub": "./convert/asEPub.js",

collector/yarn.lock

+11
Original file line numberDiff line numberDiff line change
@@ -2326,6 +2326,13 @@ node-html-parser@^6.1.13:
23262326
css-select "^5.1.0"
23272327
he "1.2.0"
23282328

2329+
node-xlsx@^0.24.0:
2330+
version "0.24.0"
2331+
resolved "https://registry.yarnpkg.com/node-xlsx/-/node-xlsx-0.24.0.tgz#a6a365acb18ad37c66c2b254b6ebe0c22dc9dc6f"
2332+
integrity sha512-1olwK48XK9nXZsyH/FCltvGrQYvXXZuxVitxXXv2GIuRm51aBi1+5KwR4rWM4KeO61sFU+00913WLZTD+AcXEg==
2333+
dependencies:
2334+
xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz"
2335+
23292336
23302337
version "6.9.13"
23312338
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.13.tgz#5b292bf1e92645f4852ca872c56a6ba6c4a3d3d6"
@@ -3528,6 +3535,10 @@ [email protected]:
35283535
resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f"
35293536
integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==
35303537

3538+
"xlsx@https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz":
3539+
version "0.20.2"
3540+
resolved "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d"
3541+
35313542
xml2js@^0.6.2:
35323543
version "0.6.2"
35333544
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499"

0 commit comments

Comments
 (0)