From 5bc4b276f537aa11eecae3807c5a6f908e37a6b2 Mon Sep 17 00:00:00 2001 From: BenjaminTrapani Date: Thu, 7 Mar 2019 18:47:30 -0800 Subject: [PATCH 1/2] Cache previous parsed text sequence rows in memory to improve perf for large text files --- Source/Readers/CNTKTextFormatReader/TextParser.cpp | 8 ++++++++ Source/Readers/CNTKTextFormatReader/TextParser.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/Source/Readers/CNTKTextFormatReader/TextParser.cpp b/Source/Readers/CNTKTextFormatReader/TextParser.cpp index 009e94be023f..eceaa93cb22d 100644 --- a/Source/Readers/CNTKTextFormatReader/TextParser.cpp +++ b/Source/Readers/CNTKTextFormatReader/TextParser.cpp @@ -307,6 +307,12 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence { size_t fileOffset = sequenceDsc.OffsetInChunk() + chunkOffsetInFile; + auto cachedSequencePos = m_fileOffsetToSequenceBuffer.find(fileOffset); + if (cachedSequencePos != m_fileOffsetToSequenceBuffer.end()) + { + return cachedSequencePos->second; + } + m_fileReader->SetFileOffset(fileOffset); size_t bytesToRead = sequenceDsc.SizeInBytes(); @@ -430,6 +436,8 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence } FillSequenceMetadata(sequence, { sequenceDsc.m_key, 0 }); + + m_fileOffsetToSequenceBuffer[fileOffset] = sequence; return sequence; } diff --git a/Source/Readers/CNTKTextFormatReader/TextParser.h b/Source/Readers/CNTKTextFormatReader/TextParser.h index abf476a92922..3c4459f4ae60 100644 --- a/Source/Readers/CNTKTextFormatReader/TextParser.h +++ b/Source/Readers/CNTKTextFormatReader/TextParser.h @@ -10,6 +10,7 @@ #include "TextConfigHelper.h" #include "Index.h" #include "CorpusDescriptor.h" +#include <unordered_map> namespace CNTK { @@ -135,6 +136,7 @@ class TextParser : public DataDeserializerBase { bool m_cacheIndex; unsigned int m_numRetries; // specifies the number of times an unsuccessful // file operation should be repeated (default value is 5). + std::unordered_map<size_t, SequenceBuffer> m_fileOffsetToSequenceBuffer; // Corpus descriptor. 
CorpusDescriptorPtr m_corpus; From 1b7fa257d32d72e8e9cdd3155d518e1c32f48fe3 Mon Sep 17 00:00:00 2001 From: Benjamin Trapani Date: Sun, 10 Mar 2019 22:42:50 -0700 Subject: [PATCH 2/2] Convert tabs to spaces Make indentation consistent with that used in the rest of the file. --- Source/Readers/CNTKTextFormatReader/TextParser.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Source/Readers/CNTKTextFormatReader/TextParser.cpp b/Source/Readers/CNTKTextFormatReader/TextParser.cpp index eceaa93cb22d..964c172c3b60 100644 --- a/Source/Readers/CNTKTextFormatReader/TextParser.cpp +++ b/Source/Readers/CNTKTextFormatReader/TextParser.cpp @@ -307,11 +307,11 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence { size_t fileOffset = sequenceDsc.OffsetInChunk() + chunkOffsetInFile; - auto cachedSequencePos = m_fileOffsetToSequenceBuffer.find(fileOffset); - if (cachedSequencePos != m_fileOffsetToSequenceBuffer.end()) - { + auto cachedSequencePos = m_fileOffsetToSequenceBuffer.find(fileOffset); + if (cachedSequencePos != m_fileOffsetToSequenceBuffer.end()) + { return cachedSequencePos->second; - } + } m_fileReader->SetFileOffset(fileOffset);