-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathindex.js
More file actions
112 lines (102 loc) · 2.73 KB
/
index.js
File metadata and controls
112 lines (102 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
const fs = require('fs');
const StreamZip = require('node-stream-zip');
const XLSX = require('xlsx');
const pdf = require('pdf-parse');
let WordExtractor = require('word-extractor');
// Decode the XML character references that can appear inside a text node.
// Everything is replaced in a single pass so that "&amp;lt;" becomes "&lt;"
// rather than being decoded twice into "<".
const decodeXmlEntities = (text) => {
  const named = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  return text.replace(
    /&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g,
    (match, ref) => {
      if (ref[0] !== '#') return named[ref];
      const isHex = ref[1] === 'x';
      const codePoint = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // Leave out-of-range references untouched instead of throwing.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    },
  );
};

/**
 * Extract the plain text of a .docx document.
 *
 * Reads the document XML through `open(filePath)` and concatenates the
 * content of every `<w:t>` text element, in document order, with XML
 * entities decoded. No separator is inserted between runs or paragraphs.
 *
 * Only real `<w:t>` elements are matched; other tags that merely start with
 * the same characters (`<w:tbl>`, `<w:tc>`, `<w:tab/>`, ...) are ignored.
 *
 * NOTE(review): `open` reads `word/document.xml` from a zip archive, so this
 * presumably only works for .docx files, not legacy binary .doc - confirm.
 *
 * @param {string} filePath - Path of the document to read.
 * @returns {Promise<string>} The concatenated text ('' when there is none).
 *   Rejects with whatever error `open` rejects with.
 */
const extract = async (filePath) => {
  // Awaiting here (instead of a bare .then) lets failures from `open`
  // propagate to the caller rather than leaving the promise pending.
  const xml = String(await open(filePath));
  const textElement = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
  let body = '';
  let match = textElement.exec(xml);
  while (match !== null) {
    body += decodeXmlEntities(match[1]);
    match = textElement.exec(xml);
  }
  return body;
};
/**
 * Read `word/document.xml` out of the zip archive at `filePath`.
 *
 * The entry is streamed, its chunks are collected, and the concatenated
 * bytes are returned as a string (Buffer#toString default encoding). The
 * archive is closed on both the success and the failure path.
 *
 * @param {string} filePath - Path of the zip-based document (e.g. .docx).
 * @returns {Promise<string>} The raw XML of `word/document.xml`.
 *   Rejects when the archive cannot be opened, the entry cannot be
 *   streamed, or the stream itself reports an error.
 */
const open = (filePath) =>
  new Promise((resolve, reject) => {
    const zip = new StreamZip({
      file: filePath,
      storeEntries: true,
    });

    // Release the archive before reporting any failure to the caller.
    const fail = (err) => {
      zip.close();
      reject(err);
    };

    // Without this listener an unreadable or non-zip file would surface as an
    // unhandled 'error' event instead of a rejected promise.
    zip.on('error', fail);

    zip.on('ready', () => {
      zip.stream('word/document.xml', (err, stream) => {
        if (err) {
          // Stop here: `stream` is not usable when an error is reported.
          fail(err);
          return;
        }
        const chunks = [];
        stream.on('data', (chunk) => {
          chunks.push(chunk);
        });
        stream.on('error', fail);
        stream.on('end', () => {
          zip.close();
          // Concatenate before decoding so multi-byte characters split
          // across chunk boundaries are decoded correctly.
          resolve(Buffer.concat(chunks).toString());
        });
      });
    });
  });
/**
 * Return the extension of a file name or path, including the leading dot.
 *
 * The extension is everything from the last '.' of the final path segment
 * onward. A dot that belongs to a directory name (i.e. one that comes before
 * the last '/' or '\\') is not treated as an extension.
 *
 * @param {string} filename - File name or path.
 * @returns {string} The extension (e.g. '.pdf'), or '' when the name is
 *   empty or its final segment contains no dot.
 */
const getFileExtension = (filename) => {
  if (filename.length === 0) return '';
  const lastSeparator = Math.max(
    filename.lastIndexOf('/'),
    filename.lastIndexOf('\\'),
  );
  const dot = filename.lastIndexOf('.');
  // dot < lastSeparator means the only dots are in directory names.
  if (dot === -1 || dot < lastSeparator) return '';
  return filename.slice(dot);
};
// read the file and extract text
exports.getText = async (filePath) => {
let fileContent = '';
let data = fs.readFileSync(filePath);
const fileExtension = getFileExtension(filePath);
switch (fileExtension) {
// read pdf
case '.pdf':
fileContent = (await pdf(data)).text;
break;
// read docs
case '.docx':
case '.doc':
var extractor = new WordExtractor();
var extracted = await extractor.extract(filePath);
fileContent = extracted.getBody();
break;
// read excel books
case '.xlsx':
case '.xls':
let result = {};
data = new Uint8Array(data);
let workbook = XLSX.read(data, {
type: 'array',
});
workbook.SheetNames.forEach(function (sheetName) {
let roa = XLSX.utils.sheet_to_json(workbook.Sheets[sheetName], {
header: 1,
});
if (roa.length) result[sheetName] = roa;
});
fileContent = JSON.stringify(result);
break;
// read text, csv and json
case '.txt':
case '.csv':
case '.json':
fileContent = data.toString();
break;
// default case
default:
throw new Error('unknown extension found!');
}
return fileContent;
};