-
Notifications
You must be signed in to change notification settings - Fork 192
/
Copy pathutil.js
163 lines (151 loc) · 4.84 KB
/
util.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
var exec = require( 'child_process' ).exec
  , path = require( 'path' )
  , fs = require( 'fs' )
  , os = require( 'os' )
  // scratch directory located under the OS temp dir
  , outDir = path.join( os.tmpdir(), 'textract' )
  // Ordered [pattern, replacement] pairs used to normalise typographic
  // punctuation. Inside a character class `|` is a literal pipe, not an
  // alternation, so the classes below list the code points only; otherwise
  // every `|` in the text would be turned into a quote.
  // NOTE(review): the non-escaped literals look like remnants of
  // mis-decoded UTF-8 sequences - confirm against the file's real encoding.
  , replacements = [
    [/[\u201C\u201D]|“|â€/g, '"'], // fancy double quotes
    [/[\u2018\u2019]|’|‘/g, '\''], // fancy single quotes/apostrophes
    [/…/g, '…'], // ellipsis
    [/–|—/g, '–'] // long hyphen
  ]
  // cached number of replacement pairs
  , rLen = replacements.length
  ;
// Up front creation of tmp dir.
// Create unconditionally and tolerate EEXIST: checking for existence first
// leaves a window in which another process can create the directory, which
// would make mkdirSync throw while this module is being loaded.
try {
  fs.mkdirSync( outDir );
} catch ( mkdirError ) {
  if ( mkdirError.code !== 'EEXIST' ) {
    throw mkdirError;
  }
}
/**
 * Runs every [pattern, replacement] pair from `replacements` over the text,
 * in order, so that fancy quotes and similar characters become simple ones.
 *
 * @param {string} text text to clean up
 * @returns {string} the text after all replacements were applied
 */
function replaceBadCharacters( text ) {
  return replacements.reduce( function( cleaned, pair ) {
    return cleaned.replace( pair[0], pair[1] );
  }, text );
}
/**
 * Reports an error to the callback as a fresh Error, prefixing the
 * "end of central directory" message with a hint that the file was not
 * recognized as a zip file.
 *
 * @param {Error} err error whose message is forwarded
 * @param {function} cb callback invoked with ( error, null )
 */
function yauzlError( err, cb ) {
  var notAZip = 'end of central directory record signature not found'
    , isNotAZip = err.message === notAZip
    , reported = isNotAZip ?
      'File not correctly recognized as zip file, ' + err.message :
      err.message
    ;
  cb( new Error( reported ), null );
}
/**
 * Picks the exec options to use for an extractor: the type-specific
 * `options[type].exec` when set, otherwise the global `options.exec`,
 * otherwise a new empty object.
 *
 * @param {string} type key of the extractor inside `options`
 * @param {object} options user supplied configuration
 * @returns {object} options to hand to `exec`
 */
function createExecOptions( type, options ) {
  var typeOptions = options[type];
  return ( typeOptions && typeOptions.exec ) || options.exec || {};
}
/**
 * Runs `unzip` to find out whether it can be executed, logs a message to
 * the console when it cannot, and reports the outcome to the callback.
 *
 * @param {string} type name of the file type, used in the logged message
 * @param {function} cb callback invoked with `true` when exec reported
 * no error, `false` otherwise
 */
function unzipCheck( type, cb ) {
  var reportResult = function( failure /* , stdout, stderr */ ) {
    var message;
    if ( failure ) {
      message = 'textract: \'unzip\' does not appear to be installed, ' +
        'so textract will be unable to extract ' + type + '.';
      // eslint-disable-next-line no-console
      console.error( message );
    }
    cb( failure === null );
  };

  exec( 'unzip', reportResult );
}
/**
 * Reads one entry of an opened zip file and hands its content to the
 * callback as a single string.
 *
 * Chunks are collected as Buffers and decoded once at the end, so a
 * multi-byte character that is split across two chunks is not corrupted.
 * The callback is invoked exactly once: with the error as soon as the
 * stream fails (an `end` event is not guaranteed after an error), or with
 * the text once the stream ends.
 *
 * @param {object} zipfile object exposing `openReadStream( entry, cb )`
 * @param {object} entry entry to read, passed through to `openReadStream`
 * @param {function} cb callback invoked with ( error, text )
 */
function getTextFromZipFile( zipfile, entry, cb ) {
  zipfile.openReadStream( entry, function( err, readStream ) {
    var chunks = []
      , finished = false
      ;

    // guards against reporting twice, e.g. an error followed by `end`
    function finish( error, text ) {
      if ( finished ) {
        return;
      }
      finished = true;
      cb( error, text );
    }

    if ( err ) {
      cb( err, null );
      return;
    }

    readStream.on( 'data', function( chunk ) {
      // a stream with an encoding set emits strings instead of Buffers
      chunks.push( Buffer.isBuffer( chunk ) ? chunk : Buffer.from( String( chunk ) ) );
    });
    readStream.on( 'end', function() {
      finish( null, Buffer.concat( chunks ).toString() );
    });
    readStream.on( 'error', function( _err ) {
      finish( _err, null );
    });
  });
}
/**
 * 1) builds an exec command using provided `genCommand` callback
 * 2) runs that command against an input file path
 * resulting in an output file
 * 3) reads that output file in
 * 4) cleans the output file up
 * 5) executes a callback with the contents of the file
 *
 * The output file is expected at `<outDir>/<input basename without ext>.txt`.
 *
 * @param {string} label Name for the extractor, e.g. `Tesseract`
 * @param {string} filePath path to file to be extracted
 * @param {object} options extractor options as provided
 * via user configuration
 * @param {object} execOptions execution options passed to
 * `exec` command as provided via user configuration
 * @param {function} genCommand function used to generate
 * the command to be executed; called with
 * ( options, escapedFilePath, escapedFileTempOutPath )
 * @param {function} cb callback that is passed error/text
 *
 */
function runExecIntoFile( label, filePath, options, execOptions, genCommand, cb ) {
  var fileTempOutPath = path.join( outDir, path.basename( filePath, path.extname( filePath ) ) )
    , outputFile = fileTempOutPath + '.txt'
    // escape the file paths
    // NOTE(review): only whitespace is escaped; other shell metacharacters in
    // a file name reach the command line unchanged - confirm that callers
    // only pass trusted paths.
    , escapedFilePath = filePath.replace( /\s/g, '\\ ' )
    , escapedFileTempOutPath = fileTempOutPath.replace( /\s/g, '\\ ' )
    , cmd = genCommand( options, escapedFilePath, escapedFileTempOutPath )
    ;

  exec( cmd, execOptions,
    function( execError /* , stdout, stderr */ ) {
      if ( execError !== null ) {
        cb( new Error( 'Error extracting [[ ' +
          path.basename( filePath ) + ' ]], exec error: ' + execError.message ), null );
        return;
      }

      // Read directly instead of checking for existence first: a missing
      // file surfaces as ENOENT, without a gap between check and read.
      fs.readFile( outputFile, 'utf8', function( readError, text ) {
        if ( readError ) {
          if ( readError.code === 'ENOENT' ) {
            cb( new Error( 'Error reading ' + label +
              ' output at [[ ' + fileTempOutPath + ' ]], file does not exist' ), null );
          } else {
            cb( new Error( 'Error reading ' + label +
              ' output at [[ ' + fileTempOutPath + ' ]], error: ' + readError.message ), null );
          }
          return;
        }

        fs.unlink( outputFile, function( unlinkError ) {
          if ( unlinkError ) {
            cb( new Error( 'Error, ' + label +
              ' , cleaning up temp file [[ ' + fileTempOutPath +
              ' ]], error: ' + unlinkError.message ), null );
            return;
          }
          cb( null, text );
        });
      });
    }
  );
}
// Public API of this module, assembled key by key and exported as one object.
var api = {};
api.createExecOptions = createExecOptions;
api.unzipCheck = unzipCheck;
api.getTextFromZipFile = getTextFromZipFile;
api.yauzlError = yauzlError;
api.runExecIntoFile = runExecIntoFile;
api.replaceBadCharacters = replaceBadCharacters;
module.exports = api;