-
Notifications
You must be signed in to change notification settings - Fork 192
/
Copy pathextract.js
164 lines (143 loc) · 5.01 KB
/
extract.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
// Module dependencies, shared registry state and the regular expressions
// used by cleanseText. Everything below is module-private.
var fs = require( 'fs' )
, path = require( 'path' )
, XmlEntities = require( 'html-entities' ).XmlEntities
, util = require( './util' )
// directory that initializeExtractors scans for extractor modules
, extractorPath = path.join( __dirname, 'extractors' )
// decoder applied to the cleansed text as the last step of cleanseText
, entities = new XmlEntities()
// lower-cased type string -> extract function (filled by registerExtractor)
, typeExtractors = {}
// list of { reg: RegExp, extractor: Function } (filled by registerExtractor)
, regexExtractors = []
// lower-cased type string -> failure message (filled by registerFailedExtractor)
, failedExtractorTypes = {}
// number of extractor modules discovered by initializeExtractors
, totalExtractors = 0
// number of extractors that need no test or whose test has called back;
// extract() only proceeds once this equals totalExtractors
, satisfiedExtractors = 0
// set to true on the first call to initializeExtractors
, hasInitialized = false
// matches a \n that is neither preceded nor followed by another \n;
// the preceding character (or start of input) is captured as $1
, STRIP_ONLY_SINGLE_LINEBREAKS = /(^|[^\n])\n(?!\n)/g
// Negated character classes: each matches a (possibly empty) run of characters
// that are NOT in the allowed set; cleanseText replaces every match with a space.
// The PRESERVE variant additionally allows \n and \r, the STRIP variant does not.
// NOTE(review): the separators in "\uFB50–\uFDFF" and "\uFE70–\uFEFF" are en dashes,
// not hyphens, so inside the class they are literal characters rather than range
// operators — presumably ranges were intended; confirm before changing.
, WHITELIST_PRESERVE_LINEBREAKS = /[^A-Za-z\x80-\xFF\x24\u20AC\xA3\xA5 0-9 \u2015\u2116\u2018\u2019\u201C|\u201D\u2026 \uFF0C \u2013 \u2014 \u00C0-\u1FFF \u2C00-\uD7FF \uFB50–\uFDFF \uFE70–\uFEFF \uFF01-\uFFE6 \.,\?""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~'-\w\n\r]*/g // eslint-disable-line max-len
, WHITELIST_STRIP_LINEBREAKS = /[^A-Za-z\x80-\xFF\x24\u20AC\xA3\xA5 0-9 \u2015\u2116\u2018\u2019\u201C|\u201D\u2026 \uFF0C \u2013 \u2014 \u00C0-\u1FFF \u2C00-\uD7FF \uFB50–\uFDFF \uFE70–\uFEFF \uFF01-\uFFE6 \.,\?""!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~'-\w]*/g // eslint-disable-line max-len
;
/**
 * Makes an extractor available for lookup by findExtractor.
 *
 * Every string entry of `extractor.types` is lower-cased and mapped to
 * `extractor.extract` in `typeExtractors`; every RegExp entry is appended to
 * `regexExtractors`. Entries of any other kind are ignored, as are
 * extractors without a `types` property.
 *
 * @param {Object} extractor - extractor module exposing `types` and `extract`
 */
function registerExtractor( extractor ) {
  var declaredTypes = extractor.types;

  if ( !declaredTypes ) {
    return;
  }

  declaredTypes.forEach( function( entry ) {
    if ( typeof entry === 'string' ) {
      typeExtractors[entry.toLowerCase()] = extractor.extract;
      return;
    }
    if ( entry instanceof RegExp ) {
      regexExtractors.push({ reg: entry, extractor: extractor.extract });
    }
  });
}
/**
 * Records why an extractor could not be enabled, so that extract() can
 * report a more helpful error for the types it would have handled.
 *
 * Only string entries of `extractor.types` are recorded (lower-cased) in
 * `failedExtractorTypes`. RegExp entries, which registerExtractor accepts in
 * the same array, cannot be used as lookup keys and are skipped instead of
 * triggering a TypeError on `toLowerCase`.
 *
 * @param {Object} extractor - extractor module whose test failed
 * @param {string} failedMessage - message supplied by the extractor's test
 */
function registerFailedExtractor( extractor, failedMessage ) {
  if ( extractor.types ) {
    extractor.types.forEach( function( type ) {
      if ( typeof type === 'string' ) {
        failedExtractorTypes[type.toLowerCase()] = failedMessage;
      }
    });
  }
}
/**
 * Runs an extractor's own `test` function and registers the extractor
 * according to the outcome.
 *
 * Whenever the test calls back, `satisfiedExtractors` is incremented; the
 * extractor is then passed to registerExtractor on success, or to
 * registerFailedExtractor (together with the failure message) otherwise.
 *
 * @param {Object} extractor - extractor module exposing `test( options, cb )`
 * @param {Object} options - options forwarded unchanged to `extractor.test`
 */
function testExtractor( extractor, options ) {
  function onTestResult( passedTest, failedMessage ) {
    satisfiedExtractors += 1;

    if ( !passedTest ) {
      registerFailedExtractor( extractor, failedMessage );
      return;
    }
    registerExtractor( extractor );
  }

  extractor.test( options, onTestResult );
}
/**
 * Global content cleansing applied to the output of every extractor,
 * regardless of file type.
 *
 * Returns a node-style callback that wraps `cb`. When it receives an error,
 * the error and text are handed to `cb` untouched. Otherwise the text is:
 *   1. passed through util.replaceBadCharacters,
 *   2. (preserveOnlyMultipleLineBreaks) stripped of single line breaks and trimmed,
 *   3. filtered through the whitelist matching the line-break options,
 *   4. whitespace-normalised,
 *   5. entity-decoded,
 * and then handed to `cb`.
 *
 * @param {Object} options - reads `preserveLineBreaks` and
 *   `preserveOnlyMultipleLineBreaks`
 * @param {Function} cb - called as `cb( error, text )`
 * @returns {Function} wrapped callback `( error, text )`
 */
function cleanseText( options, cb ) {
  return function( error, text ) {
    var keepAllBreaks
      , keepMultipleBreaks
      , whitelist
      ;

    if ( error ) {
      cb( error, text );
      return;
    }

    text = util.replaceBadCharacters( text );

    keepAllBreaks = options.preserveLineBreaks;
    keepMultipleBreaks = options.preserveOnlyMultipleLineBreaks;

    if ( keepMultipleBreaks ) {
      // a lone line break becomes a space; runs of line breaks survive
      text = text.replace( STRIP_ONLY_SINGLE_LINEBREAKS, '$1 ' ).trim();
    }

    whitelist = ( keepAllBreaks || keepMultipleBreaks ) ?
      WHITELIST_PRESERVE_LINEBREAKS :
      WHITELIST_STRIP_LINEBREAKS;
    text = text.replace( whitelist, ' ' );

    // The whitelist patterns also match the empty string, so the replace above
    // leaves an extra space at every position; drop each space that is not
    // followed by another space to undo that.
    text = text.replace( / (?! )/g, '' );
    // collapse remaining runs of spaces, tabs, vertical tabs, non-breaking spaces
    text = text.replace( /[ \t\v\u00A0]{2,}/g, ' ' );

    text = entities.decode( text );

    cb( error, text );
  };
}
function initializeExtractors( options ) {
var extractors;
hasInitialized = true;
// discover available extractors
extractors = fs.readdirSync( extractorPath ).map( function( item ) {
var fullExtractorPath = path.join( extractorPath, item );
// get the extractor
// eslint-disable-next-line global-require
return require( fullExtractorPath );
});
// perform any binary tests to ensure extractor is possible
// given execution environment
extractors.forEach( function( extractor ) {
if ( extractor.test ) {
testExtractor( extractor, options );
} else {
satisfiedExtractors++;
registerExtractor( extractor );
}
});
// need to keep track of how many extractors we have in total
totalExtractors = extractors.length;
}
/**
 * Looks up the extract function registered for a type.
 *
 * The type is lower-cased and first looked up among the exact string
 * registrations; only own keys of `typeExtractors` are considered, so names
 * inherited from Object.prototype (e.g. "constructor") never resolve to a
 * bogus extractor. If there is no exact match, every registered RegExp is
 * tried and the last one that matches wins.
 *
 * @param {string} type - type identifier to look up
 * @returns {Function|undefined} the extract function, or undefined when no
 *   extractor is registered for the type or the type is not a string
 */
function findExtractor( type ) {
  var i
    , iLen = regexExtractors.length
    , key
    , extractor
    , regexExtractor
    ;

  // a non-string type cannot match anything; report "not found" rather than
  // throwing from toLowerCase
  if ( typeof type !== 'string' ) {
    return extractor;
  }

  key = type.toLowerCase();

  if ( Object.prototype.hasOwnProperty.call( typeExtractors, key ) &&
    typeExtractors[key] ) {
    return typeExtractors[key];
  }

  // no early exit: a later registration overrides an earlier one
  for ( i = 0; i < iLen; i++ ) {
    regexExtractor = regexExtractors[i];
    if ( key.match( regexExtractor.reg ) ) {
      extractor = regexExtractor.extractor;
    }
  }

  return extractor;
}
/**
 * Extracts text from a file using the extractor registered for its type.
 *
 * Extractors are initialized on first use. While some extractor tests have
 * not called back yet (`totalExtractors !== satisfiedExtractors`) the call is
 * retried every 100 ms. Once registration is complete the matching extractor
 * is invoked with a callback wrapped by cleanseText; if none matches, `cb`
 * receives an Error flagged with `typeNotFound = true`.
 *
 * @param {string} type - type identifier of the file
 * @param {string} filePath - path handed to the extractor
 * @param {Object} options - forwarded to the extractor and to cleanseText
 * @param {Function} cb - called as `cb( error, text )`
 */
function extract( type, filePath, options, cb ) {
  var error, msg, theExtractor, failedKey;

  if ( !hasInitialized ) {
    initializeExtractors( options );
  }

  // registration of extractors complete?
  if ( totalExtractors !== satisfiedExtractors ) {
    // async registration has not wrapped up, try again later
    setTimeout( function() {
      extract( type, filePath, options, cb );
    }, 100 );
    return;
  }

  theExtractor = findExtractor( type );
  if ( theExtractor ) {
    theExtractor( filePath, options, cleanseText( options, cb ) );
    return;
  }

  // cannot extract this file type
  msg = 'Error for type: [[ ' + type + ' ]], file: [[ ' + filePath + ' ]]';

  // Failure messages are stored under lower-cased keys, so the lookup has to
  // be lower-cased too; the own-property check keeps names inherited from
  // Object.prototype from being mistaken for a recorded failure.
  failedKey = String( type ).toLowerCase();
  if ( Object.prototype.hasOwnProperty.call( failedExtractorTypes, failedKey ) &&
    failedExtractorTypes[failedKey] ) {
    // type is supported but just not configured/installed properly
    msg += ', extractor for type exists, but failed to initialize.' +
      ' Message: ' + failedExtractorTypes[failedKey];
  }

  error = new Error( msg );
  error.typeNotFound = true;
  cb( error, null );
}
// the extract function is this module's only export
module.exports = extract;