Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,6 @@ fastp

# Test Output
*.json
*.html
*.html

out/
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ DIR_OBJ := ./obj

PREFIX ?= /usr/local
BINDIR ?= $(PREFIX)/bin
INCLUDE_DIRS ?=
LIBRARY_DIRS ?=
INCLUDE_DIRS ?=
LIBRARY_DIRS ?=

SRC := $(wildcard ${DIR_SRC}/*.cpp)
OBJ := $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC}))
Expand All @@ -16,7 +16,7 @@ BIN_TARGET := ${TARGET}

CXX ?= g++
CXXFLAGS := -std=c++11 -pthread -g -O3 -MD -MP -I${DIR_INC} $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) ${CXXFLAGS}
LIBS := -lisal -ldeflate -lpthread
LIBS := -lisal -ldeflate -lzstd -lpthread
STATIC_FLAGS := -static -Wl,--no-as-needed -pthread
LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS) $(LD_FLAGS)
STATIC_LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(STATIC_FLAGS) $(LIBS) $(STATIC_LD_FLAGS)
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ fastp supports batch processing of multiple FASTQ files in a folder, see - [batc
11. support long reads (data from PacBio / Nanopore devices).
12. support reading from STDIN and writing to STDOUT
13. support interleaved input
14. support reading Zstandard-compressed FASTQ/FASTA files (`.zst` / `.zstd`)
14. support ultra-fast FASTQ-level deduplication
15. ...

Expand Down Expand Up @@ -158,6 +159,7 @@ sudo make install
* for PE data, you should also specify read2 input by `-I` or `--in2`, and specify read2 output by `-O` or `--out2`.
* if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering.
* the output will be gzip-compressed if its file name ends with `.gz`
* the input can be gzip-compressed (`.gz`) or Zstandard-compressed (`.zst`, `.zstd`); compression is auto-detected from the extension
## output to STDOUT
`fastp` supports streaming the passing-filter reads to STDOUT, so that it can be passed to other compressors like `bzip2`, or be passed to aligners like `bwa` and `bowtie2`.
* specify `--stdout` to enable this mode to stream output to STDOUT
Expand Down
121 changes: 115 additions & 6 deletions src/fastqreader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,24 @@ FastqReader::FastqReader(string filename, bool hasQuality, bool phred64){
mHasNoLineBreakAtEnd = false;
mGzipInputUsedBytes = 0;
mReadPool = NULL;
mUseZstd = false;
mZstdFinished = false;
mZstdStream = NULL;
mZstdInput.src = NULL;
mZstdInput.size = 0;
mZstdInput.pos = 0;
mZstdInputUsedBytes = 0;
init();
}

FastqReader::~FastqReader(){
close();
delete[] mFastqBuf;
delete[] mGzipInputBuffer;
if(mZstdStream){
ZSTD_freeDStream(mZstdStream);
mZstdStream = NULL;
}
}

bool FastqReader::hasNoLineBreakAtEnd() {
Expand All @@ -69,11 +80,13 @@ void FastqReader::setReadPool(ReadPool* rp) {


bool FastqReader::bufferFinished() {
if(mZipped) {
return eof() && mGzipState.avail_in == 0;
} else {
if(!mZipped)
return eof();
}

if(mUseZstd)
return mZstdFinished && mZstdInput.pos == mZstdInput.size;

return eof() && mGzipState.avail_in == 0;
}

void FastqReader::readToBufIgzip(){
Expand Down Expand Up @@ -139,10 +152,67 @@ void FastqReader::readToBufIgzip(){
}
}

void FastqReader::readToBufZstd(){
mBufDataLen = 0;
if(mZstdFinished)
return;

ZSTD_outBuffer outBuf;
outBuf.dst = mFastqBuf;
outBuf.size = mGzipOutputBufferSize;
outBuf.pos = 0;

while(outBuf.pos == 0){
if(mZstdInput.pos == mZstdInput.size){
size_t readBytes = fread(mGzipInputBuffer, 1, mGzipInputBufferSize, mFile);
if(readBytes == 0){
if(eof()){
mZstdFinished = true;
break;
} else {
error_exit("zstd: read error on file: " + mFilename);
}
}
mZstdInput.src = mGzipInputBuffer;
mZstdInput.size = readBytes;
mZstdInput.pos = 0;
mZstdInputUsedBytes += readBytes;
}

size_t ret = ZSTD_decompressStream(mZstdStream, &outBuf, &mZstdInput);
if(ZSTD_isError(ret)){
error_exit("zstd: decompression error for file: " + mFilename + ", error: " + string(ZSTD_getErrorName(ret)));
}

if(ret == 0){
if(mZstdInput.pos < mZstdInput.size || !eof()){
size_t resetRet = ZSTD_initDStream(mZstdStream);
if(ZSTD_isError(resetRet)){
error_exit("zstd: failed to reset stream for file: " + mFilename + ", error: " + string(ZSTD_getErrorName(resetRet)));
}
} else {
mZstdFinished = true;
}
}

if(eof() && mZstdInput.pos == mZstdInput.size && ret != 0){
error_exit("zstd: unexpected eof found in file: " + mFilename);
}

if(mZstdFinished || outBuf.pos > 0)
break;
}

mBufDataLen = outBuf.pos;
}

void FastqReader::readToBuf() {
mBufDataLen = 0;
if(mZipped) {
readToBufIgzip();
if(mUseZstd)
readToBufZstd();
else
readToBufIgzip();
} else {
if(!eof())
mBufDataLen = fread(mFastqBuf, 1, FQ_BUF_SIZE, mFile);
Expand Down Expand Up @@ -173,6 +243,26 @@ void FastqReader::init(){
}
mZipped = true;
}
else if (ends_with(mFilename, ".zst") || ends_with(mFilename, ".zstd")){
mFile = fopen(mFilename.c_str(), "rb");
if(mFile == NULL) {
error_exit("Failed to open file: " + mFilename);
}
mZstdStream = ZSTD_createDStream();
if(mZstdStream == NULL) {
error_exit("zstd: failed to allocate decompressor for file: " + mFilename);
}
size_t ret = ZSTD_initDStream(mZstdStream);
if(ZSTD_isError(ret)){
error_exit("zstd: failed to init decompressor for file: " + mFilename + ", error: " + string(ZSTD_getErrorName(ret)));
}
mZipped = true;
mUseZstd = true;
mZstdFinished = false;
mZstdInput.src = mGzipInputBuffer;
mZstdInput.size = 0;
mZstdInput.pos = 0;
}
else {
if(mFilename == "/dev/stdin") {
mFile = stdin;
Expand All @@ -189,7 +279,10 @@ void FastqReader::init(){

void FastqReader::getBytes(size_t& bytesRead, size_t& bytesTotal) {
if(mZipped) {
bytesRead = mGzipInputUsedBytes - mGzipState.avail_in;
if(mUseZstd)
bytesRead = mZstdInputUsedBytes - (mZstdInput.size - mZstdInput.pos);
else
bytesRead = mGzipInputUsedBytes - mGzipState.avail_in;
} else {
bytesRead = ftell(mFile);//mFile.tellg();
}
Expand Down Expand Up @@ -362,6 +455,22 @@ bool FastqReader::isZipFastq(string filename) {
return true;
else if (ends_with(filename, ".fa.gz"))
return true;
else if (ends_with(filename, ".fastq.zst"))
return true;
else if (ends_with(filename, ".fq.zst"))
return true;
else if (ends_with(filename, ".fastq.zstd"))
return true;
else if (ends_with(filename, ".fq.zstd"))
return true;
else if (ends_with(filename, ".fasta.zst"))
return true;
else if (ends_with(filename, ".fa.zst"))
return true;
else if (ends_with(filename, ".fasta.zstd"))
return true;
else if (ends_with(filename, ".fa.zstd"))
return true;
else
return false;
}
Expand Down
7 changes: 7 additions & 0 deletions src/fastqreader.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ SOFTWARE.
#include <iostream>
#include <fstream>
#include "igzip_lib.h"
#include <zstd.h>
#include "readpool.h"

class FastqReader{
Expand Down Expand Up @@ -61,6 +62,7 @@ class FastqReader{
void clearLineBreaks(char* line);
void readToBuf();
void readToBufIgzip();
void readToBufZstd();
bool bufferFinished();

private:
Expand All @@ -83,6 +85,11 @@ class FastqReader{
bool mHasQuality;
bool mPhred64;
ReadPool* mReadPool;
bool mUseZstd;
bool mZstdFinished;
ZSTD_DStream* mZstdStream;
ZSTD_inBuffer mZstdInput;
size_t mZstdInputUsedBytes;

};

Expand Down
Binary file added testdata/R1.fq.zst
Binary file not shown.
Binary file added testdata/R2.fq.zst
Binary file not shown.