openzim · data-man · Jan 13, 2022 · Jan 13, 2022 · Jan 29, 2022 · Feb 17, 2022
diff --git a/examples/createZimExample.cpp b/examples/createZimExample.cpp
@@ -24,6 +24,7 @@
 #include <zim/writer/contentProvider.h>
 #include <zim/writer/creator.h>
 #include <zim/blob.h>
+#include <zim/zim.h>
 
 class TestItem : public zim::writer::Item
 {
@@ -75,7 +76,7 @@ int main(int argc, char* argv[])
   unsigned max = 16;
   try {
     zim::writer::Creator c;
-    c.configVerbose(false).configCompression(zim::Compression::Zstd);
+    c.configVerbose(false).configCompression(zim::Compression::Zstd, static_cast<int>(zim::ZSTDCompressionLevel::DEFAULT));
     c.startZimCreation("foo.zim");
     for (unsigned n = 0; n < max; ++n)
     {

diff --git a/include/zim/writer/creator.h b/include/zim/writer/creator.h
@@ -72,7 +72,7 @@ namespace zim
          * @param comptype the compression algorithm to use.
          * @return a reference to itself.
          */
-        Creator& configCompression(Compression compression);
+        Creator& configCompression(Compression compression, int compression_level);
 
         /**
          * Set the size of the created clusters.
@@ -205,6 +205,7 @@ namespace zim
         // configuration
         bool m_verbose = false;
         Compression m_compression = Compression::Zstd;
+        int m_compressionLevel = static_cast<int>(ZSTDCompressionLevel::DEFAULT); // Pairs with the Zstd default above; keeps the level 19 that was hard-coded before (0 would select zstd's own default level).
         bool m_withIndex = false;
         size_t m_clusterSize;
         std::string m_indexingLanguage;

diff --git a/include/zim/zim.h b/include/zim/zim.h
@@ -60,6 +60,18 @@ namespace zim
     Zstd = 5
   };
 
+  enum class LZMACompressionLevel: int { // Named LZMA compression levels; cast to int when passed as a compression level.
+    MINIMUM = 0,
+    MAXIMUM = 9, // LZMA_INFO::init_stream_encoder additionally ORs in LZMA_PRESET_EXTREME at exactly this level.
+    DEFAULT = MAXIMUM // Same setting as the previously hard-coded `9 | LZMA_PRESET_EXTREME`.
+  };
+
+  enum class ZSTDCompressionLevel: int { // Named Zstd compression levels; cast to int when passed as a compression level.
+    MINIMUM = -21, // NOTE(review): confirm this lower bound against the zstd library in use.
+    MAXIMUM = 19,
+    DEFAULT = MAXIMUM // Same level (19) that ZSTD_INFO::init_stream_encoder hard-coded before this change.
+  };
+
   static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate";
 
   enum class IntegrityCheck

diff --git a/src/compression.cpp b/src/compression.cpp
@@ -26,22 +26,29 @@
 #include <stdexcept>
 
 const std::string LZMA_INFO::name = "lzma";
-void LZMA_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
+
+void LZMA_INFO::init_stream_encoder(stream_t* stream, int compression_level, char* raw_data)
 {
   *stream = LZMA_STREAM_INIT;
-  unsigned memsize = zim::envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024);
-  auto errcode = lzma_stream_decoder(stream, memsize, 0);
+  int cl = compression_level;
+
+  if (cl == static_cast<int>(zim::LZMACompressionLevel::MAXIMUM)) {
+    cl |= LZMA_PRESET_EXTREME;
+  }
+
+  auto errcode = lzma_easy_encoder(stream, cl, LZMA_CHECK_CRC32);
   if (errcode != LZMA_OK) {
-    throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream");
+    throw std::runtime_error("Cannot initialize lzma_easy_encoder");
   }
 }
 
-void LZMA_INFO::init_stream_encoder(stream_t* stream, char* raw_data)
+void LZMA_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
 {
   *stream = LZMA_STREAM_INIT;
-  auto errcode = lzma_easy_encoder(stream, 9 | LZMA_PRESET_EXTREME, LZMA_CHECK_CRC32);
+  unsigned memsize = zim::envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024);
+  auto errcode = lzma_stream_decoder(stream, memsize, 0);
   if (errcode != LZMA_OK) {
-    throw std::runtime_error("Cannot initialize lzma_easy_encoder");
+    throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream");
   }
 }
 
@@ -103,21 +110,21 @@ ZSTD_INFO::stream_t::~stream_t()
     ::ZSTD_freeDStream(decoder_stream);
 }
 
-void ZSTD_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
+void ZSTD_INFO::init_stream_encoder(stream_t* stream, int compression_level, char* raw_data)
 {
-  stream->decoder_stream = ::ZSTD_createDStream();
-  auto ret = ::ZSTD_initDStream(stream->decoder_stream);
+  stream->encoder_stream = ::ZSTD_createCStream();
+  auto ret = ::ZSTD_initCStream(stream->encoder_stream, compression_level);
   if (::ZSTD_isError(ret)) {
-    throw std::runtime_error("Failed to initialize Zstd decompression");
+    throw std::runtime_error("Failed to initialize Zstd compression");
   }
 }
 
-void ZSTD_INFO::init_stream_encoder(stream_t* stream, char* raw_data)
+void ZSTD_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
 {
-  stream->encoder_stream = ::ZSTD_createCStream();
-  auto ret = ::ZSTD_initCStream(stream->encoder_stream, 19);
+  stream->decoder_stream = ::ZSTD_createDStream();
+  auto ret = ::ZSTD_initDStream(stream->decoder_stream);
   if (::ZSTD_isError(ret)) {
-    throw std::runtime_error("Failed to initialize Zstd compression");
+    throw std::runtime_error("Failed to initialize Zstd decompression");
   }
 }
 

diff --git a/src/compression.h b/src/compression.h
@@ -59,8 +59,8 @@ enum class RunnerStatus {
 struct LZMA_INFO {
   typedef lzma_stream stream_t;
   static const std::string name;
+  static void init_stream_encoder(stream_t* stream, int compression_level, char* raw_data);
   static void init_stream_decoder(stream_t* stream, char* raw_data);
-  static void init_stream_encoder(stream_t* stream, char* raw_data);
   static CompStatus stream_run_encode(stream_t* stream, CompStep step);
   static CompStatus stream_run_decode(stream_t* stream, CompStep step);
   static CompStatus stream_run(stream_t* stream, CompStep step);
@@ -89,8 +89,8 @@ struct ZSTD_INFO {
   };
 
   static const std::string name;
+  static void init_stream_encoder(stream_t* stream, int compression_level, char* raw_data);
   static void init_stream_decoder(stream_t* stream, char* raw_data);
-  static void init_stream_encoder(stream_t* stream, char* raw_data);
   static CompStatus stream_run_encode(stream_t* stream, CompStep step);
   static CompStatus stream_run_decode(stream_t* stream, CompStep step);
   static void stream_end_encode(stream_t* stream);
@@ -233,8 +233,8 @@ class Compressor
 
     ~Compressor() = default;
 
-    void init(char* data) {
-      INFO::init_stream_encoder(&stream, data);
+    void init(int compression_level, char * data) {
+      INFO::init_stream_encoder(&stream, compression_level, data);
       stream.next_out = (uint8_t*)ret_data.get();
       stream.avail_out = ret_size;
     }

diff --git a/src/writer/cluster.cpp b/src/writer/cluster.cpp
@@ -25,6 +25,7 @@
 #include "../debug.h"
 #include "../compression.h"
 
+#include <zim/zim.h>
 #include <zim/writer/contentProvider.h>
 
 #include <sstream>
@@ -45,8 +46,9 @@ const zim::size_type MAX_WRITE_SIZE(4UL*1024*1024*1024-1);
 namespace zim {
 namespace writer {
 
-Cluster::Cluster(Compression compression)
+Cluster::Cluster(Compression compression, int compression_level)
   : compression(compression),
+    compressionLevel(compression_level),
     isExtended(false),
     _size(0)
 {
@@ -152,7 +154,7 @@ void Cluster::_compress()
   bool first = true;
   auto writer = [&](const Blob& data) -> void {
     if (first) {
-      runner.init((char*)data.data());
+      runner.init(compressionLevel, (char*)data.data());
       first = false;
     }
     runner.feed(data.data(), data.size());

diff --git a/src/writer/cluster.h b/src/writer/cluster.h
@@ -44,10 +44,11 @@ class Cluster {
 
 
   public:
-    Cluster(Compression compression);
+    Cluster(Compression compression, int compression_level);
     virtual ~Cluster();
 
     void setCompression(Compression c) { compression = c; }
+    void setCompressionLevel(int cl) { compressionLevel = cl; }
     Compression getCompression() const { return compression; }
 
     void addContent(std::unique_ptr<ContentProvider> provider);
@@ -78,6 +79,7 @@ class Cluster {
 
   protected:
     Compression compression;
+    int compressionLevel;
     cluster_index_t index;
     bool isExtended;
     Offsets blobOffsets;

diff --git a/src/writer/creator.cpp b/src/writer/creator.cpp
@@ -110,14 +110,15 @@ namespace zim
       return *this;
     }
 
-    Creator& Creator::configCompression(Compression compression)
+    Creator& Creator::configCompression(Compression compression, int compressionLevel)
     {
       if(compression == Compression::Lzma) {
         std::cerr << "WARNING: LZMA compression method is deprecated."
                   << " Support for it will be dropped from libzim soon."
                   << std::endl;
       }
       m_compression = compression;
+      m_compressionLevel = compressionLevel;
       return *this;
     }
 
@@ -143,7 +144,7 @@ namespace zim
     void Creator::startZimCreation(const std::string& filepath)
     {
       data = std::unique_ptr<CreatorData>(
-        new CreatorData(filepath, m_verbose, m_withIndex, m_indexingLanguage, m_compression, m_clusterSize)
+        new CreatorData(filepath, m_verbose, m_withIndex, m_indexingLanguage, m_compression, m_compressionLevel, m_clusterSize)
       );
 
       for(unsigned i=0; i<m_nbWorkers; i++)
@@ -394,9 +395,11 @@ namespace zim
                                    bool withIndex,
                                    std::string language,
                                    Compression c,
+                                   int compression_level,
                                    size_t clusterSize)
       : mainPageDirent(nullptr),
         compression(c),
+        compressionLevel(compression_level),
         zimName(fname),
         tmpFileName(fname + ".tmp"),
         clusterSize(clusterSize),
@@ -435,8 +438,8 @@ namespace zim
       // because we don't know which one will fill up first.  We also need
       // to track the dirents currently in each, so we can fix up the
       // cluster index if the other one ends up written first.
-      compCluster = new Cluster(compression);
-      uncompCluster = new Cluster(Compression::None);
+      compCluster = new Cluster(compression, compression_level);
+      uncompCluster = new Cluster(Compression::None, 0);
 
 #if defined(ENABLE_XAPIAN)
       auto xapianIndexer = std::make_shared<XapianHandler>(this, withIndex);
@@ -591,9 +594,9 @@ namespace zim
 
       if (compressed)
       {
-        cluster = compCluster = new Cluster(compression);
+        cluster = compCluster = new Cluster(compression, compressionLevel);
       } else {
-        cluster = uncompCluster = new Cluster(Compression::None);
+        cluster = uncompCluster = new Cluster(Compression::None, 0);
       }
       return cluster;
     }

diff --git a/src/writer/creatordata.h b/src/writer/creatordata.h
@@ -65,6 +65,7 @@ namespace zim
         CreatorData(const std::string& fname, bool verbose,
                        bool withIndex, std::string language,
                        Compression compression,
+                       int compressionLevel,
                        size_t clusterSize);
         virtual ~CreatorData();
 
@@ -102,6 +103,7 @@ namespace zim
         ThreadList workerThreads;
         std::thread  writerThread;
         const Compression compression;
+        int compressionLevel;
         std::string zimName;
         std::string tmpFileName;
         bool isEmpty = true;

diff --git a/test/cluster.cpp b/test/cluster.cpp
@@ -66,7 +66,7 @@ using zim::unittests::write_to_buffer;
 
 TEST(ClusterTest, create_cluster)
 {
-  zim::writer::Cluster cluster(zim::Compression::None);
+  zim::writer::Cluster cluster(zim::Compression::None, 0);
 
   ASSERT_EQ(cluster.count().v, 0U);
 
@@ -86,7 +86,7 @@ TEST(ClusterTest, create_cluster)
 
 TEST(ClusterTest, read_write_cluster)
 {
-  zim::writer::Cluster cluster(zim::Compression::None);
+  zim::writer::Cluster cluster(zim::Compression::None, 0);
 
   std::string blob0("123456789012345678901234567890");
   std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
@@ -110,7 +110,7 @@ TEST(ClusterTest, read_write_cluster)
 
 TEST(ClusterTest, read_write_no_content)
 {
-  zim::writer::Cluster cluster(zim::Compression::None);
+  zim::writer::Cluster cluster(zim::Compression::None, 0);
 
   cluster.close();
   auto buffer = write_to_buffer(cluster, "\3garbage");
@@ -123,7 +123,7 @@ TEST(ClusterTest, read_write_no_content)
 
 TEST(ClusterTest, read_write_empty)
 {
-  zim::writer::Cluster cluster(zim::Compression::None);
+  zim::writer::Cluster cluster(zim::Compression::None, 0);
 
   std::string emptyString;
 
@@ -145,7 +145,7 @@ TEST(ClusterTest, read_write_empty)
 
 TEST(ClusterTest, read_write_clusterLzma)
 {
-  zim::writer::Cluster cluster(zim::Compression::Lzma);
+  zim::writer::Cluster cluster(zim::Compression::Lzma, static_cast<int>(zim::LZMACompressionLevel::DEFAULT));
 
   std::string blob0("123456789012345678901234567890");
   std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
@@ -172,7 +172,7 @@ TEST(ClusterTest, read_write_clusterLzma)
 
 TEST(ClusterTest, read_write_clusterZstd)
 {
-  zim::writer::Cluster cluster(zim::Compression::Zstd);
+  zim::writer::Cluster cluster(zim::Compression::Zstd, static_cast<int>(zim::ZSTDCompressionLevel::DEFAULT));
 
   std::string blob0("123456789012345678901234567890");
   std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
@@ -244,7 +244,7 @@ TEST(ClusterTest, read_write_extended_cluster)
   auto bigProvider = std::unique_ptr<zim::writer::ContentProvider>(new FakeProvider(almost_4g));
   std::string blob4("zyxwvutsrqponmlkjihgfedcba");
 
-  zim::writer::Cluster cluster(zim::Compression::None);
+  zim::writer::Cluster cluster(zim::Compression::None, 0);
   cluster.addContent(blob0);
   cluster.addContent(blob1);
   cluster.addContent(blob2);

diff --git a/test/compression.cpp b/test/compression.cpp
@@ -61,7 +61,7 @@ TYPED_TEST(CompressionTest, compress) {
         size_t offset = 0;
         while (size) {
           if (first) {
-            compressor.init(const_cast<char*>(data.c_str()));
+            compressor.init(9, const_cast<char*>(data.c_str()));
             first = false;
           }
           auto adjustedChunkSize = std::min(size, chunkSize);

diff --git a/test/decoderstreamreader.cpp b/test/decoderstreamreader.cpp
@@ -30,7 +30,7 @@ std::string
 compress(const std::string& data)
 {
   zim::Compressor<CompressionInfo> compressor(data.size());
-  compressor.init(const_cast<char*>(data.c_str()));
+  compressor.init(1, const_cast<char*>(data.c_str()));
   compressor.feed(data.c_str(), data.size());
   zim::zsize_t comp_size;
   const auto comp_data = compressor.get_data(&comp_size);

diff --git a/test/dirent.cpp b/test/dirent.cpp
@@ -106,7 +106,7 @@ TEST(DirentTest, set_get_data_dirent)
 TEST(DirentTest, read_write_article_dirent)
 {
   zim::writer::Dirent dirent(NS::C, "Bar", "Foo", 17);
-  zim::writer::Cluster cluster(zim::Compression::None);
+  zim::writer::Cluster cluster(zim::Compression::None, 0);
   cluster.addContent(""); // Add a dummy content
   cluster.setClusterIndex(zim::cluster_index_t(45));
   dirent.setCluster(&cluster);
@@ -134,7 +134,7 @@ TEST(DirentTest, read_write_article_dirent)
 TEST(DirentTest, read_write_article_dirent_unicode)
 {
   zim::writer::Dirent dirent(NS::C, "L\xc3\xbcliang", "", 17);
-  zim::writer::Cluster cluster(zim::Compression::None);
+  zim::writer::Cluster cluster(zim::Compression::None, 0);
   cluster.addContent(""); // Add a dummy content
   cluster.setClusterIndex(zim::cluster_index_t(45));
   dirent.setCluster(&cluster);