Skip to content

Commit e8f1d60

Browse files
tsv-filter: Add an output buffer. Performance enhancement for narrow inputs with high match rates. (#115)
1 parent 25dae4e commit e8f1d60

3 files changed

Lines changed: 52 additions & 2 deletions

File tree

tsv-filter/src/tsv-filter.d

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,21 @@ void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles)
787787
import std.range;
788788
import tsvutil : throwIfWindowsNewlineOnUnix;
789789

790+
/* An output buffer. Improves performance on narrow files with high percentages of
791+
* writes. Want responsive output if output is rare, so ensure the first matched
792+
* line is written, and that writes separated by long stretches of non-matched lines
793+
* are written.
794+
*/
795+
enum bufferReserveSize = 11264;
796+
enum bufferFlushSize = 10240;
797+
enum maxInputLinesWithoutBufferFlush = 1024;
798+
size_t inputLinesWithoutBufferFlush = maxInputLinesWithoutBufferFlush + 1;
799+
800+
auto outputBuffer = appender!(char[]);
801+
outputBuffer.reserve(bufferReserveSize);
802+
803+
scope(exit) if (outputBuffer.data.length > 0) write(outputBuffer.data);
804+
790805
/* Process each input file, one line at a time. */
791806
auto lineFields = new char[][](cmdopt.maxFieldIndex + 1);
792807
bool headerWritten = false;
@@ -801,7 +816,8 @@ void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles)
801816
/* Header. Output on the first file, skip subsequent files. */
802817
if (!headerWritten)
803818
{
804-
writeln(line);
819+
outputBuffer.put(line);
820+
outputBuffer.put('\n');
805821
headerWritten = true;
806822
}
807823
}
@@ -840,11 +856,23 @@ void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles)
840856
*/
841857
try
842858
{
859+
inputLinesWithoutBufferFlush++;
843860
bool passed = cmdopt.disjunct ?
844861
cmdopt.tests.any!(x => x(lineFields)) :
845862
cmdopt.tests.all!(x => x(lineFields));
846863
if (cmdopt.invert) passed = !passed;
847-
if (passed) writeln(line);
864+
if (passed)
865+
{
866+
outputBuffer.put(line);
867+
outputBuffer.put('\n');
868+
if (outputBuffer.data.length >= bufferFlushSize ||
869+
inputLinesWithoutBufferFlush > maxInputLinesWithoutBufferFlush)
870+
{
871+
write(outputBuffer.data);
872+
outputBuffer.clear;
873+
inputLinesWithoutBufferFlush = 0;
874+
}
875+
}
848876
}
849877
catch (Exception exc)
850878
{

tsv-filter/tests/gold/basic_tests_1.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1231,6 +1231,15 @@ f1 f2 f3
12311231
3x2-r2 2002 3002
12321232
3x1-r1 201 301
12331233

1234+
====[seq 100000 | tsv-filter --or --eq 1:1000 --eq 1:1100 --eq 1:5000 --eq 1:10000 --ge 1:70000 | wc -l | tr -d ' ']====
1235+
30005
1236+
1237+
====[seq 100000 | tsv-filter --le 1:20 | wc -l | tr -d ' ']====
1238+
20
1239+
1240+
====[seq 100000 | tsv-filter --or --le 1:20 --eq 1:50000 --eq 1:50001 | wc -l | tr -d ' ']====
1241+
22
1242+
12341243
Help and Version printing 1
12351244
-----------------
12361245

tsv-filter/tests/tests.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,19 @@ cat input_3x2.tsv | ${prog} --header --ge 2:23 >> ${basic_tests_1} 2>&1
231231
echo "" >> ${basic_tests_1}; echo "====[cat input_3x2.tsv | tsv-filter --header --ge 2:23 -- input_3x3.tsv - input_3x1.tsv]====" >> ${basic_tests_1}
232232
cat input_3x2.tsv | ${prog} --header --ge 2:23 -- input_3x3.tsv - input_3x1.tsv >> ${basic_tests_1} 2>&1
233233

234+
## These tests are for the output buffering cases.
235+
echo "" >> ${basic_tests_1};
236+
echo "====[seq 100000 | tsv-filter --or --eq 1:1000 --eq 1:1100 --eq 1:5000 --eq 1:10000 --ge 1:70000 | wc -l | tr -d ' ']====" >> ${basic_tests_1}
237+
seq 100000 | ${prog} --or --eq 1:1000 --eq 1:1100 --eq 1:5000 --eq 1:10000 --ge 1:70000 | wc -l | tr -d ' ' >> ${basic_tests_1} 2>&1
238+
239+
echo "" >> ${basic_tests_1};
240+
echo "====[seq 100000 | tsv-filter --le 1:20 | wc -l | tr -d ' ']====" >> ${basic_tests_1}
241+
seq 100000 | ${prog} --le 1:20 | wc -l | tr -d ' ' >> ${basic_tests_1} 2>&1
242+
243+
echo "" >> ${basic_tests_1};
244+
echo "====[seq 100000 | tsv-filter --or --le 1:20 --eq 1:50000 --eq 1:50001 | wc -l | tr -d ' ']====" >> ${basic_tests_1}
245+
seq 100000 | ${prog} --or --le 1:20 --eq 1:50000 --eq 1:50001 | wc -l | tr -d ' ' >> ${basic_tests_1} 2>&1
246+
234247
## Help and Version printing
235248

236249
echo "" >> ${basic_tests_1}

0 commit comments

Comments
 (0)