
Commit 78c2c23

Author: Thomas Weise
Message: Improved Documentation of Hadoop/MapReduce WebFinder Example
Parent: 19a7ff6

File tree

4 files changed: +40, -11 lines

hadoop/README.md (+2, -2)

@@ -235,8 +235,8 @@ In order to run Hadoop in a pseudo-distributed fashion, we need to enable passwo
 <li>In the terminal, execute <code>ssh localhost</code> to test if you can open a <a href="https://en.wikipedia.org/wiki/Secure&#95;Shell">secure shell</a> connection to your current, local computer <a href="http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Setup&#95;passphraseless&#95;ssh">without needing a password</a>.
 </li>
 <li>It may say something like:
-<pre>ssh: connect to host localhost port 22: Connection refused</pre>.
-If it does say this, then do
+<pre>ssh: connect to host localhost port 22: Connection refused</pre>
+If it does say this (i.e., you did not install the pre-requisites&hellip;), then do
 <pre>sudo apt-get install ssh</pre>
 and it may say something like
 <pre>

hadoop/webFinder/src/main/java/webFinder/WebFinderDriver.java (+5, -0)

@@ -14,6 +14,10 @@
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 
+/**
+ * The driver of the web finder sets up the distributed computation by
+ * defining what the mapper and reducer classes, amongst other things.
+ */
 public class WebFinderDriver extends Configured implements Tool {
 
   public static void main(final String[] args) throws Exception {
@@ -27,6 +31,7 @@ public static void main(final String[] args) throws Exception {
     }
   }
 
+  /** Setting up the computation. */
   @Override
   public int run(final String[] args) throws Exception {
     final Configuration conf;
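
The new class comment says the driver "sets up the distributed computation" by defining the mapper and reducer classes. For readers unfamiliar with what that setup usually looks like, here is a minimal, hedged sketch of a Hadoop driver that wires the two classes from this commit into a Job. It is not the repository's actual WebFinderDriver.run: the class name SketchDriver, the job name "webFinder", and the args[0]/args[1] input/output convention are assumptions for illustration.

```java
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import webFinder.WebFinderMapper;
import webFinder.WebFinderReducer;

/** A sketch (not the repository's WebFinderDriver) of wiring the job together. */
public class SketchDriver extends Configured implements Tool {

  public static void main(final String[] args) throws Exception {
    System.exit(ToolRunner.run(new Configuration(), new SketchDriver(), args));
  }

  /** The "setting up" step: declare mapper, reducer, key/value types, and paths. */
  @Override
  public int run(final String[] args) throws Exception {
    final Job job = Job.getInstance(this.getConf(), "webFinder"); // job name assumed

    job.setJarByClass(SketchDriver.class);
    job.setMapperClass(WebFinderMapper.class);   // emits <resource URL, website URL>
    job.setReducerClass(WebFinderReducer.class); // keeps resources shared by >1 website
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(List.class);         // the reducer emits List<Text> values

    FileInputFormat.addInputPath(job, new Path(args[0]));   // text file: one website URL per line
    FileOutputFormat.setOutputPath(job, new Path(args[1])); // output directory (assumed argument layout)

    return job.waitForCompletion(true) ? 0 : 1;
  }
}
```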

hadoop/webFinder/src/main/java/webFinder/WebFinderMapper.java (+17, -7)

@@ -13,12 +13,22 @@
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.log4j.Logger;
 
-public class WebFinderMapper
-    extends Mapper<LongWritable, Text, Text, Text> {
+/**
+ * This is the Mapper component of the Web Finder example. Its input are
+ * text lines, where each line stands for a website URL. It finds all
+ * resources that are loaded by a given website URL and emits tuples of
+ * kind {@code <resource URL, website URL>}.
+ */
+public class WebFinderMapper extends
+    Mapper<LongWritable, Text, Text, Text> {
 
   /** the logger we use */
   private static Logger LOGGER = Logger.getLogger(WebFinderMapper.class);
 
+  /**
+   * Map tuples of type {@code <line number, website url text>} to tuples
+   * of kind {@code <resource url text, website url text>}.
+   */
   @Override
   protected void map(final LongWritable offset, final Text line,
       final Context context) throws IOException, InterruptedException {
@@ -171,8 +181,8 @@ private static final void __load(final int remainingDepth,
       error.addSuppressed(error2);
       error.addSuppressed(error3);
       if (WebFinderMapper.LOGGER != null) {
-        WebFinderMapper.LOGGER
-            .warn("Error while trying to build URL with string '"
+        WebFinderMapper.LOGGER.warn(
+            "Error while trying to build URL with string '"
                 + test + "' under load URL '"
                 + loadUrl.toString() + "' for base URL '"
                 + baseUrl.toString() + "'.", error2);
@@ -215,8 +225,8 @@ private static final void __load(final int remainingDepth,
       }
     } catch (final Throwable error) {
       if (WebFinderMapper.LOGGER != null) {
-        WebFinderMapper.LOGGER.warn(
-            "Error while trying to load URL '" + loadUrl + "'.", error);
+        WebFinderMapper.LOGGER.warn("Error while trying to load URL '"
+            + loadUrl + "'.", error);
       }
     }
 
@@ -314,7 +324,7 @@ public static final void main(final String[] args) throws Throwable {
 
   /** the link descriptions */
   static final __LinkDesc[] DESCS = { //
-      new __LinkDesc(false, "<link rel=\"stylesheet\"", "href="), //
+      new __LinkDesc(false, "<link rel=\"stylesheet\"", "href="), //
       new __LinkDesc(false, "<link rel='stylesheet'", "href="), //
       new __LinkDesc(false, "<img", "src="), //
       new __LinkDesc(false, "<script", "src="), //

hadoop/webFinder/src/main/java/webFinder/WebFinderReducer.java (+16, -2)

@@ -11,9 +11,23 @@
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Reducer;
 
-public class WebFinderReducer
-    extends Reducer<Text, Text, Text, List<Text>> {
+/**
+ * This is the reducer component of the web finder example. For each key (
+ * {@code resource URL}) of the tuples produced by the mapper, it receives
+ * the list of all values ({@code website URLs}). If such a list contains
+ * more than one unique element, this means that the resource is shared by
+ * multiple websites. This reducer emits tuples of the form
+ * {@code <resource URL, list of website urls>}.
+ */
+public class WebFinderReducer extends
+    Reducer<Text, Text, Text, List<Text>> {
 
+  /**
+   * The actual reduction step: From the tuples of form
+   * {@code <resource URL, iterable of referencing website URLs>}, select
+   * all resources referenced by more than one unique website. For these,
+   * output tuples of the form {@code <resource URL, list of website URLs>}.
+   */
   @Override
   protected void reduce(final Text key, final Iterable<Text> values,
       final Context context) throws IOException, InterruptedException {
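
The new Javadoc states the reducer's rule: emit a {@code <resource URL, list of website URLs>} tuple only when more than one unique website references the resource. The following sketch implements that rule with the same Reducer signature; the HashSet/ArrayList choice is an illustrative assumption, not necessarily how WebFinderReducer does it.

```java
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/** A sketch with WebFinderReducer's signature: keep only shared resources. */
public class SketchReducer extends Reducer<Text, Text, Text, List<Text>> {

  /** Reduce <resource URL, website URLs> to <resource URL, list of websites> if shared. */
  @Override
  protected void reduce(final Text key, final Iterable<Text> values,
      final Context context) throws IOException, InterruptedException {
    final HashSet<Text> unique = new HashSet<>();
    for (final Text site : values) {
      // Hadoop reuses the Text instance handed to the iterator, so copy before storing
      unique.add(new Text(site));
    }
    if (unique.size() > 1) { // the resource is referenced by more than one website
      context.write(key, new ArrayList<>(unique));
    }
  }
}
```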
