Skip to content

Commit 03f260b

Browse files
authored
Merge pull request #66 from eecs485staff/flexible-num-reducers
Flexible num reducers
2 parents a5ded12 + 28b2733 commit 03f260b

File tree

11 files changed

+110
-9
lines changed

11 files changed

+110
-9
lines changed

madoop/__main__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ def main():
3434
'-v', '--verbose', action='count', default=0,
3535
help="verbose output"
3636
)
37+
optional_args.add_argument(
38+
'-numReduceTasks', dest='num_reducers', default=4,
39+
help="max number of reducers"
40+
)
3741
required_args = parser.add_argument_group('required arguments')
3842
required_args.add_argument('-input', dest='input', required=True)
3943
required_args.add_argument('-output', dest='output', required=True)
@@ -60,6 +64,7 @@ def main():
6064
output_dir=args.output,
6165
map_exe=args.mapper,
6266
reduce_exe=args.reducer,
67+
num_reducers=int(args.num_reducers)
6368
)
6469
except MadoopError as err:
6570
sys.exit(f"Error: {err}")

madoop/mapreduce.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,13 @@
1919
MAX_INPUT_SPLIT_SIZE = 2**20 # 1 MB
2020

2121
# The number of reducers is dynamically determined by the number of unique keys
22-
# but will not be more than MAX_NUM_REDUCE
23-
MAX_NUM_REDUCE = 4
22+
# but will not be more than num_reducers
2423

2524
# Madoop logger
2625
LOGGER = logging.getLogger("madoop")
2726

2827

29-
def mapreduce(input_path, output_dir, map_exe, reduce_exe):
28+
def mapreduce(input_path, output_dir, map_exe, reduce_exe, num_reducers):
3029
"""Madoop API."""
3130
# Do not clobber existing output directory
3231
output_dir = pathlib.Path(output_dir)
@@ -74,6 +73,7 @@ def mapreduce(input_path, output_dir, map_exe, reduce_exe):
7473
group_stage(
7574
input_dir=map_output_dir,
7675
output_dir=reduce_input_dir,
76+
num_reducers=num_reducers
7777
)
7878

7979
# Run the reducing stage
@@ -240,37 +240,43 @@ def keyhash(key):
240240
return int(hexdigest, base=16)
241241

242242

243-
def partition_keys(inpath, outpaths, input_keys_stats, output_keys_stats):
243+
def partition_keys(
244+
inpath,
245+
outpaths,
246+
input_keys_stats,
247+
output_keys_stats,
248+
num_reducers):
244249
"""Allocate lines of inpath among outpaths using hash of key.
245250
246251
Update the data structures provided by the caller input_keys_stats and
247252
output_keys_stats. Both map a filename to a set of keys.
248253
249254
"""
250-
assert len(outpaths) == MAX_NUM_REDUCE
255+
assert len(outpaths) == num_reducers
251256
outparent = outpaths[0].parent
252257
assert all(i.parent == outparent for i in outpaths)
253258
with contextlib.ExitStack() as stack:
254259
outfiles = [stack.enter_context(p.open("a")) for p in outpaths]
255260
for line in stack.enter_context(inpath.open()):
256261
key = line.partition('\t')[0]
257262
input_keys_stats[inpath].add(key)
258-
reducer_idx = keyhash(key) % MAX_NUM_REDUCE
263+
reducer_idx = keyhash(key) % num_reducers
259264
outfiles[reducer_idx].write(line)
260265
outpath = outpaths[reducer_idx]
261266
output_keys_stats[outpath].add(key)
262267

263268

264-
def group_stage(input_dir, output_dir):
269+
def group_stage(input_dir, output_dir, num_reducers):
265270
"""Run group stage.
266271
267272
Process each mapper output file, allocating lines to grouper output files
268273
using the hash and modulo of the key.
269274
270275
"""
271276
# Compute output filenames
277+
LOGGER.debug("%s reducers", num_reducers)
272278
outpaths = []
273-
for i in range(MAX_NUM_REDUCE):
279+
for i in range(num_reducers):
274280
outpaths.append(output_dir/part_filename(i))
275281

276282
# Track keyspace stats, map filename -> set of keys
@@ -279,7 +285,8 @@ def group_stage(input_dir, output_dir):
279285

280286
# Partition input, appending to output files
281287
for inpath in sorted(input_dir.iterdir()):
282-
partition_keys(inpath, outpaths, input_keys_stats, output_keys_stats)
288+
partition_keys(inpath, outpaths, input_keys_stats,
289+
output_keys_stats, num_reducers)
283290

284291
# Log input keyspace stats
285292
all_input_keys = set()

tests/test_api.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,30 @@ def test_simple(tmpdir):
1313
output_dir="output",
1414
map_exe=TESTDATA_DIR/"word_count/map.py",
1515
reduce_exe=TESTDATA_DIR/"word_count/reduce.py",
16+
num_reducers=4
1617
)
1718
utils.assert_dirs_eq(
1819
TESTDATA_DIR/"word_count/correct/output",
1920
tmpdir/"output",
2021
)
2122

2223

24+
def test_2_reducers(tmpdir):
25+
"""Run a simple MapReduce job with 2 reducers."""
26+
with tmpdir.as_cwd():
27+
madoop.mapreduce(
28+
input_path=TESTDATA_DIR/"word_count/input",
29+
output_dir="output",
30+
map_exe=TESTDATA_DIR/"word_count/map.py",
31+
reduce_exe=TESTDATA_DIR/"word_count/reduce.py",
32+
num_reducers=2
33+
)
34+
utils.assert_dirs_eq(
35+
TESTDATA_DIR/"word_count/correct/output-2-reducers",
36+
tmpdir/"output",
37+
)
38+
39+
2340
def test_bash_executable(tmpdir):
2441
"""Run a MapReduce job written in Bash."""
2542
with tmpdir.as_cwd():
@@ -28,6 +45,7 @@ def test_bash_executable(tmpdir):
2845
output_dir="output",
2946
map_exe=TESTDATA_DIR/"word_count/map.sh",
3047
reduce_exe=TESTDATA_DIR/"word_count/reduce.sh",
48+
num_reducers=4
3149
)
3250
utils.assert_dirs_eq(
3351
TESTDATA_DIR/"word_count/correct/output",
@@ -43,6 +61,7 @@ def test_bad_map_exe(tmpdir):
4361
output_dir="output",
4462
map_exe=TESTDATA_DIR/"word_count/map_invalid.py",
4563
reduce_exe=TESTDATA_DIR/"word_count/reduce.py",
64+
num_reducers=4
4665
)
4766

4867

@@ -54,6 +73,7 @@ def test_missing_shebang(tmpdir):
5473
output_dir="output",
5574
map_exe=TESTDATA_DIR/"word_count/map.py",
5675
reduce_exe=TESTDATA_DIR/"word_count/reduce_invalid.py",
76+
num_reducers=4
5777
)
5878

5979

@@ -65,6 +85,7 @@ def test_empty_inputs(tmpdir):
6585
output_dir="output",
6686
map_exe=TESTDATA_DIR/"word_count/map.py",
6787
reduce_exe=TESTDATA_DIR/"word_count/reduce.py",
88+
num_reducers=4,
6889
)
6990
utils.assert_dirs_eq(
7091
TESTDATA_DIR/"word_count/correct/output",
@@ -80,6 +101,7 @@ def test_single_input_file(tmpdir):
80101
output_dir="output",
81102
map_exe=TESTDATA_DIR/"word_count/map.py",
82103
reduce_exe=TESTDATA_DIR/"word_count/reduce.py",
104+
num_reducers=4,
83105
)
84106
utils.assert_dirs_eq(
85107
TESTDATA_DIR/"word_count/correct/output",
@@ -97,6 +119,7 @@ def test_ignores_subdirs(tmpdir):
97119
output_dir="output",
98120
map_exe=TESTDATA_DIR/"word_count/map.py",
99121
reduce_exe=TESTDATA_DIR/"word_count/reduce.py",
122+
num_reducers=4
100123
)
101124
utils.assert_dirs_eq(
102125
TESTDATA_DIR/"word_count/correct/output",

tests/test_cli.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,27 @@ def test_simple(tmpdir):
4949
)
5050

5151

52+
def test_2_reducers(tmpdir):
53+
"""Run a simple MapReduce job with 2 reducers."""
54+
with tmpdir.as_cwd():
55+
subprocess.run(
56+
[
57+
"madoop",
58+
"-input", TESTDATA_DIR/"word_count/input",
59+
"-output", "output",
60+
"-mapper", TESTDATA_DIR/"word_count/map.py",
61+
"-reducer", TESTDATA_DIR/"word_count/reduce.py",
62+
"-numReduceTasks", "2",
63+
],
64+
stdout=subprocess.PIPE,
65+
check=True,
66+
)
67+
utils.assert_dirs_eq(
68+
TESTDATA_DIR/"word_count/correct/output-2-reducers",
69+
tmpdir/"output",
70+
)
71+
72+
5273
def test_verbose(tmpdir):
5374
"""Run a simple MapReduce job and verify the verbose stdout."""
5475
with tmpdir.as_cwd():

tests/test_stages.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,27 @@ def test_group_stage(tmpdir):
2323
group_stage(
2424
input_dir=TESTDATA_DIR/"word_count/correct/mapper-output",
2525
output_dir=Path(tmpdir),
26+
num_reducers=4,
2627
)
2728
utils.assert_dirs_eq(
2829
TESTDATA_DIR/"word_count/correct/grouper-output",
2930
tmpdir,
3031
)
3132

3233

34+
def test_group_stage_2_reducers(tmpdir):
35+
"""Test group stage using word count example with 2 reducers."""
36+
group_stage(
37+
input_dir=TESTDATA_DIR/"word_count/correct/mapper-output",
38+
output_dir=Path(tmpdir),
39+
num_reducers=2,
40+
)
41+
utils.assert_dirs_eq(
42+
TESTDATA_DIR/"word_count/correct/grouper-output-2-reducers",
43+
tmpdir,
44+
)
45+
46+
3347
def test_reduce_stage(tmpdir):
3448
"""Test reduce stage using word count example."""
3549
reduce_stage(
@@ -41,3 +55,16 @@ def test_reduce_stage(tmpdir):
4155
TESTDATA_DIR/"word_count/correct/reducer-output",
4256
tmpdir,
4357
)
58+
59+
60+
def test_reduce_stage_2_reducers(tmpdir):
61+
"""Test reduce stage using word count example with 2 reducers."""
62+
reduce_stage(
63+
exe=TESTDATA_DIR/"word_count/reduce.py",
64+
input_dir=TESTDATA_DIR/"word_count/correct/grouper-output-2-reducers",
65+
output_dir=Path(tmpdir),
66+
)
67+
utils.assert_dirs_eq(
68+
TESTDATA_DIR/"word_count/correct/reducer-output-2-reducers",
69+
tmpdir,
70+
)
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Bye 1
2+
Goodbye 1
3+
Hadoop 1
4+
Hadoop 1
5+
World 1
6+
World 1
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Hello 1
2+
Hello 1
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Bye 1
2+
Goodbye 1
3+
Hadoop 2
4+
World 2
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Hello 2
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Bye 1
2+
Goodbye 1
3+
Hadoop 2
4+
World 2

0 commit comments

Comments
 (0)