Skip to content

Commit 2447e3b

Browse files
tdomhanTobias Domhan
authored and
Tobias Domhan
committed
Refactoring of sequence file iterator.
1 parent 0d766be commit 2447e3b

File tree

3 files changed

+142
-71
lines changed

3 files changed

+142
-71
lines changed

src/edu/jhu/thrax/lexprob/SequenceFileLexprobTable.java

Lines changed: 20 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,18 @@
22

33
import java.io.IOException;
44
import java.net.URI;
5+
import java.util.Arrays;
56
import java.util.Iterator;
7+
import java.util.stream.Collectors;
68

79
import org.apache.hadoop.conf.Configuration;
810
import org.apache.hadoop.fs.FileStatus;
911
import org.apache.hadoop.fs.FileSystem;
1012
import org.apache.hadoop.fs.Path;
11-
import org.apache.hadoop.io.FloatWritable;
12-
import org.apache.hadoop.io.LongWritable;
1313
import org.apache.hadoop.io.SequenceFile;
1414

15+
import edu.jhu.thrax.util.ChainedIterators;
16+
1517
/**
1618
* A base class for lexical probability tables that will be read from a Hadoop sequence file that is
1719
* held on disk. This class serves to hide all the horrible Hadoop filesystem plumbing from more
@@ -30,14 +32,15 @@ public SequenceFileLexprobTable(Configuration conf, String fileGlob) throws IOEx
3032
fs = FileSystem.get(uri, conf);
3133
files = fs.globStatus(new Path(fileGlob));
3234
if (files.length == 0) throw new IOException("no files found in lexprob glob:" + fileGlob);
35+
Arrays.sort(files); // some implementations (like local FS) don't return a sorted list of files
3336
}
3437

3538
/**
 * Hook that concrete tables implement to consume a sequence of table entries.
 *
 * NOTE(review): presumably invoked with the entries read from the globbed sequence files so the
 * subclass can populate its lookup structure; the call site is not visible here, confirm.
 *
 * @param entries the entries handed to the subclass
 */
protected abstract void initialize(Iterable<TableEntry> entries);
3639

3740
/**
 * Returns the float associated with the pair {@code (car, cdr)}.
 *
 * NOTE(review): presumably the lexical probability for a pair of word ids; what is returned for
 * a pair that is not in the table is up to the subclass, confirm against implementations.
 *
 * @param car first element of the pair
 * @param cdr second element of the pair
 * @return the value the concrete table holds for the pair
 */
public abstract float get(int car, int cdr);
3841

3942
/**
 * Membership query for the pair {@code (car, cdr)}.
 *
 * NOTE(review): presumably true exactly when {@code get(car, cdr)} has a stored value; the
 * precise semantics are defined by subclasses, confirm against implementations.
 *
 * @param car first element of the pair
 * @param cdr second element of the pair
 * @return whether the concrete table reports the pair as present
 */
public abstract boolean contains(int car, int cdr);
40-
43+
4144
/**
4245
* Return an Iterable that will range over all the entries in a series of globbed files.
4346
*
@@ -46,77 +49,23 @@ public SequenceFileLexprobTable(Configuration conf, String fileGlob) throws IOEx
4649
* @param files an array of FileStatus from getGlobStatus
4750
* @return an Iterable over all entries in all files in the files glob
4851
*/
49-
protected static Iterable<TableEntry> getSequenceFileIterator(FileSystem theFS,
52+
protected static Iterable<TableEntry> getSequenceFileIterator(FileSystem fs,
5053
Configuration conf, FileStatus[] files) {
51-
final LongWritable pair = new LongWritable();
52-
final FloatWritable d = new FloatWritable(0.0f);
53-
final FileStatus[] theFiles = files;
54-
final Configuration theConf = conf;
55-
final FileSystem fs = theFS;
56-
57-
final Iterator<TableEntry> iterator = new Iterator<TableEntry>() {
58-
int fileIndex = 0;
59-
TableEntry lookahead = null;
60-
SequenceFile.Reader reader = null;
61-
62-
public boolean hasNext() {
63-
try {
64-
// if we've already peeked at the next entry, it can be
65-
// returned
66-
if (lookahead != null) return true;
67-
// if the reader is null, we haven't looked at a single
68-
// file yet, so set the reader to read the first file
69-
if (reader == null) reader = new SequenceFile.Reader(fs, theFiles[0].getPath(), theConf);
70-
// reader is not null here, so try to read an entry
71-
boolean gotNew = reader.next(pair, d);
72-
if (gotNew) {
73-
// there was something to read
74-
lookahead = new TableEntry(pair, d);
75-
return true;
76-
}
77-
fileIndex++;
78-
// else, move to the next file
79-
// but if there are no more, return false
80-
if (fileIndex >= theFiles.length) return false;
81-
reader.close();
82-
reader = new SequenceFile.Reader(fs, theFiles[fileIndex].getPath(), theConf);
83-
// new file, so try again
84-
gotNew = reader.next(pair, d);
85-
if (gotNew) {
86-
lookahead = new TableEntry(pair, d);
87-
return true;
88-
}
89-
return false;
90-
} catch (IOException e) {
91-
throw new IllegalArgumentException(e);
92-
}
93-
}
94-
95-
public TableEntry next() {
96-
try {
97-
// return the lookahead, if possible
98-
if (lookahead != null) {
99-
TableEntry val = lookahead;
100-
lookahead = null;
101-
return val;
102-
}
103-
boolean gotNew = reader.next(pair, d);
104-
if (gotNew)
105-
return new TableEntry(pair, d);
106-
else
107-
return null;
108-
} catch (IOException e) {
109-
throw new IllegalArgumentException();
110-
}
111-
}
112-
113-
public void remove() {
114-
throw new UnsupportedOperationException();
115-
}
116-
};
11754
return new Iterable<TableEntry>() {
55+
56+
@Override
11857
public Iterator<TableEntry> iterator() {
119-
return iterator;
58+
Iterator<? extends Iterator<TableEntry>> fileIterators = Arrays.asList(files).stream()
59+
.map(file -> {
60+
try {
61+
return new SequenceFile.Reader(fs, file.getPath(), conf);
62+
} catch (IOException e) {
63+
throw new RuntimeException(e);
64+
}
65+
})
66+
.map(seqFile -> new SequenceFileTableEntryIterator(seqFile))
67+
.collect(Collectors.toList()).iterator();
68+
return new ChainedIterators<TableEntry>(fileIterators);
12069
}
12170
};
12271
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package edu.jhu.thrax.lexprob;
2+
3+
import java.io.IOException;
4+
import java.util.Iterator;
5+
import java.util.NoSuchElementException;
6+
import java.util.Optional;
7+
8+
import org.apache.hadoop.io.FloatWritable;
9+
import org.apache.hadoop.io.LongWritable;
10+
import org.apache.hadoop.io.SequenceFile;
11+
12+
public class SequenceFileTableEntryIterator implements Iterator<TableEntry> {
13+
14+
private final SequenceFile.Reader reader;
15+
16+
private final LongWritable pair = new LongWritable();
17+
private final FloatWritable d = new FloatWritable(0.0f);
18+
19+
private Optional<TableEntry> lookahead = Optional.empty();
20+
private boolean finishedReading = false;
21+
22+
public SequenceFileTableEntryIterator(SequenceFile.Reader reader) {
23+
this.reader = reader;
24+
}
25+
26+
@Override
27+
public boolean hasNext() {
28+
if (lookahead.isPresent()) {
29+
return true;
30+
}
31+
lookahead = tryReadNext();
32+
if (lookahead.isPresent()) {
33+
return true;
34+
} else {
35+
return false;
36+
}
37+
}
38+
39+
@Override
40+
public TableEntry next() {
41+
if (!hasNext()) {
42+
throw new NoSuchElementException();
43+
}
44+
TableEntry nextEntry = lookahead.get();
45+
lookahead = Optional.empty();
46+
return nextEntry;
47+
}
48+
49+
private Optional<TableEntry> tryReadNext() {
50+
if (finishedReading) {
51+
return Optional.empty();
52+
}
53+
try {
54+
boolean gotNew = reader.next(pair, d);
55+
if (gotNew) {
56+
// there was something to read
57+
return Optional.of(new TableEntry(pair, d));
58+
} else {
59+
finishedReading = true;
60+
return Optional.empty();
61+
}
62+
} catch (IOException e) {
63+
throw new RuntimeException(e);
64+
}
65+
}
66+
67+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package edu.jhu.thrax.util;
2+
3+
import java.util.Collection;
4+
import java.util.Iterator;
5+
import java.util.NoSuchElementException;
6+
7+
/**
 * Presents a sequence of iterators as one iterator that yields all elements of the first
 * iterator, then all elements of the second, and so on. Empty iterators are skipped.
 *
 * <p>The outer sequence is consumed lazily: the next inner iterator is only fetched once the
 * current one is exhausted. Construction already advances to the first non-empty inner
 * iterator, which calls {@code hasNext()} on the inner iterators it passes over.
 *
 * <p>{@code remove()} is not overridden.
 *
 * @param <T> the element type produced by this iterator; inner iterators may produce any
 *        subtype of it
 */
public class ChainedIterators<T> implements Iterator<T> {

  private final Iterator<? extends Iterator<? extends T>> iteratorOfIterators;
  // The inner iterator elements are currently drawn from; null once everything is exhausted.
  private Iterator<? extends T> currentIterator;
  private boolean finished = false;

  /**
   * @param iteratorOfIterators the inner iterators, in the order they should be chained
   */
  public ChainedIterators(Iterator<? extends Iterator<? extends T>> iteratorOfIterators) {
    this.iteratorOfIterators = iteratorOfIterators;
    moveToNextIterator();
  }

  /**
   * @param iteratorOfIterators the inner iterators, chained in the collection's iteration order
   */
  public ChainedIterators(Collection<? extends Iterator<? extends T>> iteratorOfIterators) {
    this(iteratorOfIterators.iterator());
  }

  @Override
  public boolean hasNext() {
    if (finished) {
      return false;
    }
    if (currentIterator.hasNext()) {
      return true;
    }
    // Current inner iterator ran dry; look for the next one that has elements.
    moveToNextIterator();
    return !finished;
  }

  /**
   * @throws NoSuchElementException if all inner iterators are exhausted
   */
  @Override
  public T next() {
    if (!hasNext()) {
      throw new NoSuchElementException();
    }
    return currentIterator.next();
  }

  /**
   * Advances {@code currentIterator} to the next inner iterator that has at least one element,
   * or marks this iterator as finished when there is none.
   */
  private void moveToNextIterator() {
    while (iteratorOfIterators.hasNext()) {
      currentIterator = iteratorOfIterators.next();
      if (currentIterator.hasNext()) {
        finished = false;
        return;
      }
    }
    // Nothing left: drop the reference to the last (exhausted) inner iterator so it and
    // whatever it holds on to can be collected while this object is still reachable.
    currentIterator = null;
    finished = true;
  }

}

0 commit comments

Comments
 (0)