19
19
* resources that are loaded by a given website URL and emits tuples of
20
20
* the form {@code <resource URL, website URL>}.
21
21
*/
22
- public class WebFinderMapper extends
23
- Mapper <LongWritable , Text , Text , Text > {
22
+ public class WebFinderMapper
23
+ extends Mapper <LongWritable , Text , Text , Text > {
24
24
25
25
/** the logger we use */
26
26
private static Logger LOGGER = Logger .getLogger (WebFinderMapper .class );
@@ -39,30 +39,31 @@ protected void map(final LongWritable offset, final Text line,
39
39
final HashSet <URL > done ;
40
40
String str ;
41
41
42
- maxDepth = context .getConfiguration ().getInt ("maxDepth" , 1 );
43
-
44
42
str = WebFinderMapper .__prepare (line .toString (), true );
45
- if (str == null ) {
43
+ if (str == null ) {// prepare base url
46
44
return ;
47
45
}
46
+ // set maximum depth of spider
47
+ maxDepth = context .getConfiguration ().getInt ("maxDepth" , 1 );
48
48
49
49
baseUri = URI .create (str ).normalize ();
50
50
baseUrl = baseUri .toURL ();
51
- done = new HashSet <>();
51
+ done = new HashSet <>();// URLs that have been processed
52
52
done .add (baseUrl );
53
53
try {
54
54
done .add (new URL (baseUrl .toString () + '/' ));
55
55
} catch (@ SuppressWarnings ("unused" ) final Throwable error ) {
56
56
// ignore
57
57
}
58
58
baseUrlText = new Text (baseUrl .toString ());
59
- context .write (baseUrlText , baseUrlText );
59
+ context .write (baseUrlText , baseUrlText );// url itself is done
60
+ // now recursively spider resources
60
61
WebFinderMapper .__load (maxDepth , baseUrl , baseUrlText , baseUrl ,
61
62
baseUri , new StringBuilder (), new char [16384 ], done , context );
62
63
}
63
64
64
65
/**
65
- * load a given URL
66
+ * load a given URL of an HTML document
66
67
*
67
68
* @param remainingDepth
68
69
* how deep we can still go
@@ -105,7 +106,7 @@ private static final void __load(final int remainingDepth,
105
106
int read ;
106
107
107
108
stringBuilder .setLength (0 );
108
- uconn = loadUrl .openConnection ();
109
+ uconn = loadUrl .openConnection (); // setup the connection
109
110
uconn .setConnectTimeout (10_000 );
110
111
uconn .setReadTimeout (10_000 );
111
112
uconn .setDoInput (true );
@@ -114,26 +115,26 @@ private static final void __load(final int remainingDepth,
114
115
uconn .setDefaultUseCaches (true );
115
116
try (final InputStream inputStream = loadUrl .openStream ()) {
116
117
try (final InputStreamReader inputReader = new InputStreamReader (
117
- inputStream )) {
118
+ inputStream )) { // load all the data of the text resource
118
119
while ((read = inputReader .read (buffer )) > 0 ) {
119
120
stringBuilder .append (buffer , 0 , read );
120
121
}
121
122
}
122
123
}
123
124
124
- text = stringBuilder .toString ().replace ('\n' , ' ' )//
125
+ text = stringBuilder .toString ().replace ('\n' , ' ' )// delete newlines
125
126
.replace ('\r' , ' ' ).replace ('\t' , ' ' ).replaceAll (" " , " " );
126
- lower = text .toLowerCase ();
127
+ lower = text .toLowerCase (); // create a lower case version
127
128
128
129
nextDesc : for (final __LinkDesc desc : WebFinderMapper .DESCS ) {
129
130
130
- last = 0 ;// find and load scripts
131
- findDesc : for (;;) {
131
+ last = 0 ;// find and load other resources
132
+ findDesc : for (;;) {// find begin tag
132
133
index1 = lower .indexOf (desc .m_begin , last );
133
134
if (index1 <= last ) {
134
135
continue nextDesc ;
135
136
}
136
- last = index1 + desc .m_begin .length ();
137
+ last = index1 + desc .m_begin .length ();// find URL attribute
137
138
index1 = lower .indexOf (desc .m_urlIndicatorQuote , last );
138
139
index2 = lower .indexOf (desc .m_urlIndicatorPrime , last );
139
140
sep = '"' ;
@@ -144,7 +145,7 @@ private static final void __load(final int remainingDepth,
144
145
}
145
146
}
146
147
index2 = lower .indexOf ('>' , last );
147
- if (index1 <= last ) {
148
+ if (index1 <= last ) {// check for problem with tag end
148
149
continue nextDesc ;
149
150
}
150
151
if ((index2 < index1 ) && (index2 >= last )) {
@@ -157,7 +158,7 @@ private static final void __load(final int remainingDepth,
157
158
continue nextDesc ;
158
159
}
159
160
160
- test = text .substring (last , index1 );
161
+ test = text .substring (last , index1 );// take URL
161
162
last = index1 ;
162
163
test = WebFinderMapper .__prepare (test , desc .m_loadRecursive );
163
164
if (test == null ) {
@@ -181,8 +182,8 @@ private static final void __load(final int remainingDepth,
181
182
error .addSuppressed (error2 );
182
183
error .addSuppressed (error3 );
183
184
if (WebFinderMapper .LOGGER != null ) {
184
- WebFinderMapper .LOGGER . warn (
185
- "Error while trying to build URL with string '"
185
+ WebFinderMapper .LOGGER
186
+ . warn ( "Error while trying to build URL with string '"
186
187
+ test + "' under load URL '"
187
188
+ loadUrl .toString () + "' for base URL '"
188
189
+ baseUrl .toString () + "'." , error2 );
@@ -225,8 +226,8 @@ private static final void __load(final int remainingDepth,
225
226
}
226
227
} catch (final Throwable error ) {
227
228
if (WebFinderMapper .LOGGER != null ) {
228
- WebFinderMapper .LOGGER .warn ("Error while trying to load URL '"
229
- + loadUrl + "'." , error );
229
+ WebFinderMapper .LOGGER .warn (
230
+ "Error while trying to load URL '" + loadUrl + "'." , error );
230
231
}
231
232
}
232
233
@@ -324,7 +325,7 @@ public static final void main(final String[] args) throws Throwable {
324
325
325
326
/** the link descriptions */
326
327
static final __LinkDesc [] DESCS = { //
327
- new __LinkDesc (false , "<link rel=\" stylesheet\" " , "href=" ), //
328
+ new __LinkDesc (false , "<link rel=\" stylesheet\" " , "href=" ), //
328
329
new __LinkDesc (false , "<link rel='stylesheet'" , "href=" ), //
329
330
new __LinkDesc (false , "<img" , "src=" ), //
330
331
new __LinkDesc (false , "<script" , "src=" ), //
0 commit comments