
Commit f2f9dac

Merge remote-tracking branch 'origin/develop'
2 parents 3eb2ecb + 5408a7d commit f2f9dac

File tree

20 files changed (+233, -80 lines)

.github/workflows/continuous_integration.yml

Lines changed: 1 addition & 1 deletion
```diff
@@ -21,7 +21,7 @@ jobs:
     strategy:
       # Define OS and Python versions to use. 3.x is the latest minor version.
       matrix:
-        python-version: ["3.6", "3.x"]  # 3.x is the latest minor version
+        python-version: ["3.x"]  # 3.x is the latest minor version
         os: [ubuntu-latest]
 
     # Sequence of tasks for this job
```

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -83,6 +83,6 @@ Create a release on GitHub using the "Auto-generate release notes" feature. http
 
 Upload to PyPI
 ```console
-$ python3 setup.py sdist bdist_wheel
+$ python3 -m build
 $ twine upload --sign dist/*
 ```
````

madoop/__main__.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -34,6 +34,10 @@ def main():
         '-v', '--verbose', action='count', default=0,
         help="verbose output"
     )
+    optional_args.add_argument(
+        '-numReduceTasks', dest='num_reducers', default=4,
+        help="max number of reducers"
+    )
     required_args = parser.add_argument_group('required arguments')
     required_args.add_argument('-input', dest='input', required=True)
     required_args.add_argument('-output', dest='output', required=True)
@@ -56,10 +60,11 @@ def main():
     # Run MapReduce API
     try:
         mapreduce(
-            input_dir=args.input,
+            input_path=args.input,
             output_dir=args.output,
             map_exe=args.mapper,
            reduce_exe=args.reducer,
+            num_reducers=int(args.num_reducers)
         )
     except MadoopError as err:
         sys.exit(f"Error: {err}")
```
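For context, the new command-line flag feeds straight into the updated API. A minimal usage sketch of the equivalent direct call, assuming `mapreduce` is importable from `madoop.mapreduce`; the paths and executables below are placeholders, not part of this commit:

```python
# Hypothetical usage sketch.  "input", "output", map.py, and reduce.py are
# placeholders.  num_reducers=2 is what `-numReduceTasks 2` would pass through.
from madoop.mapreduce import mapreduce

mapreduce(
    input_path="input",       # a file or a directory of files
    output_dir="output",
    map_exe="./map.py",
    reduce_exe="./reduce.py",
    num_reducers=2,
)
```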

madoop/mapreduce.py

Lines changed: 44 additions & 19 deletions
```diff
@@ -19,14 +19,13 @@
 MAX_INPUT_SPLIT_SIZE = 2**20  # 1 MB
 
 # The number of reducers is dynamically determined by the number of unique keys
-# but will not be more than MAX_NUM_REDUCE
-MAX_NUM_REDUCE = 4
+# but will not be more than num_reducers
 
 # Madoop logger
 LOGGER = logging.getLogger("madoop")
 
 
-def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
+def mapreduce(input_path, output_dir, map_exe, reduce_exe, num_reducers):
     """Madoop API."""
     # Do not clobber existing output directory
     output_dir = pathlib.Path(output_dir)
@@ -54,8 +53,8 @@ def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
     reduce_output_dir.mkdir()
 
     # Copy and rename input files: part-00000, part-00001, etc.
-    input_dir = pathlib.Path(input_dir)
-    prepare_input_files(input_dir, map_input_dir)
+    input_path = pathlib.Path(input_path)
+    prepare_input_files(input_path, map_input_dir)
 
     # Executables must be absolute paths
     map_exe = pathlib.Path(map_exe).resolve()
@@ -74,6 +73,7 @@ def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
     group_stage(
         input_dir=map_output_dir,
         output_dir=reduce_input_dir,
+        num_reducers=num_reducers
     )
 
     # Run the reducing stage
@@ -98,25 +98,23 @@ def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
     LOGGER.info("Output directory: %s", output_dir)
 
 
-def prepare_input_files(input_dir, output_dir):
+def prepare_input_files(input_path, output_dir):
     """Copy and split input files. Rename to part-00000, part-00001, etc.
 
-    If a file in input_dir is smaller than MAX_INPUT_SPLIT_SIZE, then copy it
-    to output_dir. For larger files, split into blocks of MAX_INPUT_SPLIT_SIZE
-    bytes and write block to output_dir. Input files will never be combined.
+    The input_path can be a file or a directory of files. If a file is smaller
+    than MAX_INPUT_SPLIT_SIZE, then copy it to output_dir. For larger files,
+    split into blocks of MAX_INPUT_SPLIT_SIZE bytes and write block to
+    output_dir. Input files will never be combined.
 
     The number of files created will be the number of mappers since we will
     assume that the number of tasks per mapper is 1. Apache Hadoop has a
     configurable number of tasks per mapper, however for both simplicity and
     because our use case has smaller inputs we use 1.
 
     """
-    assert input_dir.is_dir(), f"Can't find input_dir '{input_dir}'"
-
-    # Split and copy input files
     part_num = 0
     total_size = 0
-    for inpath in sorted(input_dir.glob('*')):
+    for inpath in normalize_input_paths(input_path):
         assert inpath.is_file()
 
         # Compute output filenames
@@ -148,6 +146,26 @@ def prepare_input_files(input_dir, output_dir):
     LOGGER.debug("total input size=%sB", total_size)
 
 
+def normalize_input_paths(input_path):
+    """Return a list of filtered input files.
+
+    If input_path is a file, then use it. If input_path is a directory, then
+    grab all the *files* inside. Ignore subdirectories.
+
+    """
+    input_paths = []
+    if input_path.is_dir():
+        for path in sorted(input_path.glob('*')):
+            if path.is_file():
+                input_paths.append(path)
+            else:
+                LOGGER.warning("Ignoring non-file: %s", path)
+    elif input_path.is_file():
+        input_paths.append(input_path)
+    assert input_paths, f"No input: {input_path}"
+    return input_paths
+
+
 def is_executable(exe):
     """Verify exe is executable and raise exception if it is not.
 
@@ -222,37 +240,43 @@ def keyhash(key):
     return int(hexdigest, base=16)
 
 
-def partition_keys(inpath, outpaths, input_keys_stats, output_keys_stats):
+def partition_keys(
+        inpath,
+        outpaths,
+        input_keys_stats,
+        output_keys_stats,
+        num_reducers):
     """Allocate lines of inpath among outpaths using hash of key.
 
     Update the data structures provided by the caller input_keys_stats and
     output_keys_stats. Both map a filename to a set of of keys.
 
     """
-    assert len(outpaths) == MAX_NUM_REDUCE
+    assert len(outpaths) == num_reducers
     outparent = outpaths[0].parent
     assert all(i.parent == outparent for i in outpaths)
     with contextlib.ExitStack() as stack:
         outfiles = [stack.enter_context(p.open("a")) for p in outpaths]
         for line in stack.enter_context(inpath.open()):
             key = line.partition('\t')[0]
             input_keys_stats[inpath].add(key)
-            reducer_idx = keyhash(key) % MAX_NUM_REDUCE
+            reducer_idx = keyhash(key) % num_reducers
             outfiles[reducer_idx].write(line)
             outpath = outpaths[reducer_idx]
             output_keys_stats[outpath].add(key)
 
 
-def group_stage(input_dir, output_dir):
+def group_stage(input_dir, output_dir, num_reducers):
     """Run group stage.
 
     Process each mapper output file, allocating lines to grouper output files
     using the hash and modulo of the key.
 
     """
     # Compute output filenames
+    LOGGER.debug("%s reducers", num_reducers)
     outpaths = []
-    for i in range(MAX_NUM_REDUCE):
+    for i in range(num_reducers):
         outpaths.append(output_dir/part_filename(i))
 
     # Track keyspace stats, map filename -> set of keys
@@ -261,7 +285,8 @@ def group_stage(input_dir, output_dir):
 
     # Partition input, appending to output files
     for inpath in sorted(input_dir.iterdir()):
-        partition_keys(inpath, outpaths, input_keys_stats, output_keys_stats)
+        partition_keys(inpath, outpaths, input_keys_stats,
+                       output_keys_stats, num_reducers)
 
     # Log input keyspace stats
     all_input_keys = set()
```
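To make the hash-and-modulo grouping concrete, here is a small self-contained sketch of the idea behind `partition_keys` and `group_stage`. The `keyhash_sketch` helper is an illustrative stand-in: the diff only shows that `keyhash` returns `int(hexdigest, base=16)`, so the MD5 choice below is an assumption, not taken from the source.

```python
import hashlib
from collections import defaultdict


def keyhash_sketch(key):
    """Illustrative stand-in for keyhash(): hash a key string to an integer."""
    hexdigest = hashlib.md5(key.encode()).hexdigest()  # assumption: MD5
    return int(hexdigest, base=16)


def assign_reducers(lines, num_reducers):
    """Group tab-delimited key/value lines by reducer index, like group_stage."""
    partitions = defaultdict(list)
    for line in lines:
        key = line.partition('\t')[0]
        partitions[keyhash_sketch(key) % num_reducers].append(line)
    return partitions


# Every line with the same key lands in the same partition, so one reducer
# sees all values for the keys it owns.
print(assign_reducers(["cat\t1\n", "dog\t1\n", "cat\t1\n"], num_reducers=2))
```

Changing `num_reducers` only changes how many part files the group stage writes; lines sharing a key are never split across reducers.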

pyproject.toml

Lines changed: 43 additions & 0 deletions
```diff
@@ -0,0 +1,43 @@
+[build-system]
+requires = ["setuptools>=64.0.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "madoop"
+version = "1.1.0"
+description="A light weight MapReduce framework for education."
+license = {file = "LICENSE"}
+authors = [
+    {name = "Andrew DeOrio", email = "[email protected]"}
+]
+readme = "README.md"
+keywords = [
+    "madoop", "Hadoop", "MapReduce", "Michigan Hadoop", "Hadoop Streaming"
+]
+requires-python = ">=3.6"
+
+[project.urls]
+repository = "https://github.com/eecs485staff/madoop/"
+documentation = "https://github.com/eecs485staff/madoop/blob/develop/README_Hadoop_Streaming.md#hadoop-streaming-in-python"
+
+[project.scripts]
+madoop = "madoop.__main__:main"
+
+[project.optional-dependencies]
+dev = [
+    "pdbpp",
+    "build",
+    "twine",
+    "tox",
+    "check-manifest",
+    "freezegun",
+    "pycodestyle",
+    "pydocstyle",
+    "pylint",
+    "pytest",
+    "pytest-cov",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["madoop*"]
```
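One way to sanity-check the new `[project.scripts]` entry point after installation is to look it up with `importlib.metadata`. A hedged sketch, assuming Python 3.8+ (nothing below is part of the commit; the select/get branch covers newer and older interpreter versions):

```python
from importlib.metadata import entry_points

eps = entry_points()
# Python 3.10+ returns a selectable EntryPoints object; older versions a dict.
scripts = (eps.select(group="console_scripts")
           if hasattr(eps, "select") else eps.get("console_scripts", []))
for ep in scripts:
    if ep.name == "madoop":
        print(ep.name, "->", ep.value)  # expect: madoop -> madoop.__main__:main
```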

setup.py

Lines changed: 0 additions & 48 deletions
This file was deleted.
