set default numchunks to 7 + fix description of ringbuffer

Dieterbe · Dieterbe · commit f0794cf122c5 · 2017-02-01T17:04:07.000+01:00
this leaves 60min of data for all series.
+ make the description of the ringbuffer and chunk cache more nuanced.
diff --git a/docker/docker-cluster/metrictank.ini b/docker/docker-cluster/metrictank.ini
@@ -9,12 +9,14 @@ accounting-period = 5min
 ## data ##
 
 # see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for more details
+
 # duration of raw chunks. e.g. 10min, 30min, 1h, 90min...
+# must be valid value as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
 chunkspan = 10min
 # number of raw chunks to keep in in-memory ring buffer
-# note that the chunk-cache (settings further down) is a more effective method to cache data and alleviate workload for cassandra.
-# but this allows secondary nodes to keep serving data in case the primary is not able to save data upto chunkspan*numchunks")
-numchunks = 5
+# See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache
+# (settings further down) which may be a more effective method to cache data and alleviate workload for cassandra.
+numchunks = 7
 # minimum wait before raw metrics are removed from storage
 ttl = 35d
 
@@ -33,11 +35,13 @@ warm-up-period = 1h
 # settings for rollups (aggregation for archives)
 # comma-separated list of archive specifications.
 # archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true]
-# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false
-# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
-# 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
+# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false you get:
+# - 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
+# - 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
 # When running a cluster of metrictank instances, all instances should have the same agg-settings.
-# chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# Note:
+# * chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# * numchunks -like the global setting- has nuanced use compared to chunk cache. see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md
 agg-settings =
 
 ## metric data storage in cassandra ##
diff --git a/docker/docker-dev-custom-cfg-kafka/metrictank.ini b/docker/docker-dev-custom-cfg-kafka/metrictank.ini
@@ -9,11 +9,13 @@ accounting-period = 5min
 ## data ##
 
 # see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for more details
+
 # duration of raw chunks. e.g. 10min, 30min, 1h, 90min...
+# must be valid value as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
 chunkspan = 2min
 # number of raw chunks to keep in in-memory ring buffer
-# note that the chunk-cache (settings further down) is a more effective method to cache data and alleviate workload for cassandra.
-# but this allows secondary nodes to keep serving data in case the primary is not able to save data upto chunkspan*numchunks")
+# See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache
+# (settings further down) which may be a more effective method to cache data and alleviate workload for cassandra.
 numchunks = 2
 # minimum wait before raw metrics are removed from storage
 ttl = 35d
@@ -33,11 +35,13 @@ warm-up-period = 1h
 # settings for rollups (aggregation for archives)
 # comma-separated list of archive specifications.
 # archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true]
-# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false
-# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
-# 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
+# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false you get:
+# - 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
+# - 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
 # When running a cluster of metrictank instances, all instances should have the same agg-settings.
-# chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# Note:
+# * chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# * numchunks -like the global setting- has nuanced use compared to chunk cache. see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md
 agg-settings =
 
 ## metric data storage in cassandra ##
diff --git a/docs/config.md b/docs/config.md
@@ -36,11 +36,12 @@ accounting-period = 5min
 ```
 # see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for more details
 # duration of raw chunks. e.g. 10min, 30min, 1h, 90min...
+# must be valid value as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
 chunkspan = 10min
 # number of raw chunks to keep in in-memory ring buffer
-# note that the chunk-cache (settings further down) is a more effective method to cache data and alleviate workload for cassandra.
-# but this allows secondary nodes to keep serving data in case the primary is not able to save data upto chunkspan*numchunks")
-numchunks = 5
+# See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache
+# (settings further down) which may be a more effective method to cache data and alleviate workload for cassandra.
+numchunks = 7
 # minimum wait before raw metrics are removed from storage
 ttl = 35d
 # max age for a chunk before to be considered stale and to be persisted to Cassandra
@@ -56,11 +57,13 @@ warm-up-period = 1h
 # settings for rollups (aggregation for archives)
 # comma-separated list of archive specifications.
 # archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true]
-# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false
-# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
-# 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
+# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false you get:
+# - 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
+# - 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
 # When running a cluster of metrictank instances, all instances should have the same agg-settings.
-# chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# Note:
+# * chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# * numchunks -like the global setting- has nuanced use compared to chunk cache. see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md
 agg-settings =
 ```
 
diff --git a/docs/memory-server.md b/docs/memory-server.md
@@ -9,11 +9,9 @@ It has two mechanisms to support this: the ring buffers, and the chunk-cache.  T
 
 The ring buffer is simply a list of chunks - one for each series - that holds the latest data for each series that has been ingested (or generated, for rollup series).
 You can configure how many chunks to retain (`numchunks`).
-* The main function of the ring buffers is to keep secondaries able to satisfy queries from RAM, even if the primary is not able to save its chunks instantly, or if the primary
- crashed and needs to be restarted.  Effectively, the more data in your ring buffer, the longer outages of a primary you can sustain. (up to `(numchunks-1) * chunkspan` in duration)
-* For keeping a "hot cache" of frequently accessed data, this is not necessarily an effective solution, since the same `numchunks` is applied to all raw series
-(and aggregation settings are applied to all series in the same fashion, so a given rollup frequency will have the same `numchunks` for all series)
-So unless you're confident your metrics are all subject to queries of the same timeranges, and that they are predictable, you should look at the chunk cache below.
+The ring buffer can be useful to assure data that may be needed is in memory, in these cases:
+* you know a majority of your queries hits the most recent data of a given time window (e.g. last 2 hours, last day), you know this is unlikely to change and true for the vast majority of your metrics. 
+* keep secondaries able to satisfy queries from RAM for the most recent data of cold (infrequently queried) series, even if the primary is not able to save its chunks instantly, if it crashed and needs to be restarted or if you're having a cassandra outage so that chunks can't be loaded or saved.  Note that this does not apply for hot data: data queried frequently enough (at least as frequent as their chunkspan) will be added to the chunk cache automatically (see below) and not require cassandra lookups.
 
 Note:
 * the last (current) chunk is always a "work in progress", so depending on what time it is, it may be anywhere between empty and full.
@@ -22,10 +20,16 @@ Note:
 
 Both of these make it tricky to articulate how much data is in the ringbuffer for a given series.  But `(numchunks-1) * chunkspan` is the conservative approximation which is valid in the typical case (a warmed up metrictank that's ingesting fresh data).
 
+For keeping a "hot cache" of frequently accessed data in a more flexible way, this is not an effective solution, since the same `numchunks` is applied to all raw series
+(and aggregation settings are applied to all series in the same fashion, so a given rollup frequency will have the same `numchunks` for all series)
+So unless you're confident your metrics are all subject to queries of the same timeranges, and that they are predictable, you should look at the chunk cache below.
+
 ### Chunk Cache
 
 The goal of the chunk cache is to offload as much read workload from cassandra as possible.
 Any data chunks fetched from Cassandra are added to the chunk cache.
+But also, more interestingly, chunks expired out of the ring buffers will automatically be added to the chunk cache if the chunk before it is also in the cache.
+In other words, for series we know to be "hot" (queried frequently enough so that their data is kept in the chunk cache) we will try to avoid a roundtrip to Cassandra before adding the chunks to the cache.  This can be especially useful when it takes long for the primary to save data to cassandra, or when there is a cassandra outage.
 The chunk cache has a configurable [maximum size](https://github.com/raintank/metrictank/blob/master/docs/config.md#chunk-cache),
 within that size it tries to always keep the most often queried data by using an LRU mechanism that evicts the Least Recently Used chunks.
 
@@ -92,8 +96,8 @@ We plan to keep working on performance and memory management and hope to make th
 
 In principle, you need just 1 chunk for each series.
 However:
-* when the data stream moves into a new chunk, secondary nodes would drop the previous chunk and query Cassandra. But the primary needs some time to save the chunk to Cassandra.  Based on your deployment this could take anywhere between milliseconds or many minutes. As you don't want to slam Cassandra with requests at each chunk clear, you should probably use a numchunks of 2, or a numchunks that lets you retain data in memory for however long it takes to flush data to cassandra.
-* The ringbuffers are a great tool to let you deal with crashes or outages of your primary node.  If your primary went down, or for whatever reason cannot save data to Cassandra, then you won't even feel it if the ringbuffers can "clear the gap" between in memory data and older data in cassandra. So we advise to think about how fast your organisation could resolve a potential primary outage, and then set your parameters such that `(numchunks-1) * chunkspan` is more than that.
+* when the data stream moves into a new chunk, secondary nodes would drop the previous chunk and query Cassandra. But the primary needs some time to save the chunk to Cassandra.  Based on your deployment this could take anywhere between milliseconds or many minutes. Possibly even an hour or more.  As you don't want to slam Cassandra with requests at each chunk clear, you should probably use a numchunks of 2, or a numchunks that lets you retain data in memory for however long it takes to flush data to cassandra. (though the chunk cache alleviates this concern for hot data, see above).
+* The ringbuffers can be useful to let you deal with crashes or outages of your primary node.  If your primary went down, or for whatever reason cannot save data to Cassandra, then you won't even feel it if the ringbuffers can "clear the gap" between in memory data and older data in cassandra. So we advise to think about how fast your organisation could resolve a potential primary outage, and then set your parameters such that `(numchunks-1) * chunkspan` is more than that. (again, with a sufficiently large cache, this is only a concern for cold data)
 
 #### Rollups remove the need to keep large number of higher resolution chunks
 
diff --git a/metrictank-sample.ini b/metrictank-sample.ini
@@ -12,12 +12,14 @@ accounting-period = 5min
 ## data ##
 
 # see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for more details
+
 # duration of raw chunks. e.g. 10min, 30min, 1h, 90min...
+# must be valid value as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
 chunkspan = 10min
 # number of raw chunks to keep in in-memory ring buffer
-# note that the chunk-cache (settings further down) is a more effective method to cache data and alleviate workload for cassandra.
-# but this allows secondary nodes to keep serving data in case the primary is not able to save data upto chunkspan*numchunks")
-numchunks = 5
+# See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache
+# (settings further down) which may be a more effective method to cache data and alleviate workload for cassandra.
+numchunks = 7
 # minimum wait before raw metrics are removed from storage
 ttl = 35d
 
@@ -36,11 +38,13 @@ warm-up-period = 1h
 # settings for rollups (aggregation for archives)
 # comma-separated list of archive specifications.
 # archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true]
-# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false
-# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
-# 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
+# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false you get:
+# - 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
+# - 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
 # When running a cluster of metrictank instances, all instances should have the same agg-settings.
-# chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# Note:
+# * chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# * numchunks -like the global setting- has nuanced use compared to chunk cache. see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md
 agg-settings =
 
 ## metric data storage in cassandra ##
diff --git a/metrictank.go b/metrictank.go
@@ -58,7 +58,7 @@ var (
 
 	// Data:
 	chunkSpanStr = flag.String("chunkspan", "10min", "duration of raw chunks")
-	numChunksInt = flag.Int("numchunks", 5, "number of raw chunks to keep in in-memory ring buffer. note that the chunk-cache is a more effective method to cache data and alleviate workload for cassandra. but this allows secondary nodes to keep serving data in case the primary is not able to save data upto chunkspan*numchunks")
+	numChunksInt = flag.Int("numchunks", 7, "number of raw chunks to keep in in-memory ring buffer. See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache")
 	ttlStr       = flag.String("ttl", "35d", "minimum wait before metrics are removed from storage")
 
 	chunkMaxStaleStr  = flag.String("chunk-max-stale", "1h", "max age for a chunk before to be considered stale and to be persisted to Cassandra.")
diff --git a/scripts/config/metrictank-docker.ini b/scripts/config/metrictank-docker.ini
@@ -9,12 +9,14 @@ accounting-period = 5min
 ## data ##
 
 # see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for more details
+
 # duration of raw chunks. e.g. 10min, 30min, 1h, 90min...
+# must be valid value as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
 chunkspan = 10min
 # number of raw chunks to keep in in-memory ring buffer
-# note that the chunk-cache (settings further down) is a more effective method to cache data and alleviate workload for cassandra.
-# but this allows secondary nodes to keep serving data in case the primary is not able to save data upto chunkspan*numchunks")
-numchunks = 5
+# See https://github.com/raintank/metrictank/blob/master/docs/memory-server.md for details and trade-offs, especially when compared to chunk-cache
+# (settings further down) which may be a more effective method to cache data and alleviate workload for cassandra.
+numchunks = 7
 # minimum wait before raw metrics are removed from storage
 ttl = 35d
 
@@ -33,11 +35,13 @@ warm-up-period = 1h
 # settings for rollups (aggregation for archives)
 # comma-separated list of archive specifications.
 # archive specification is of the form: aggSpan:chunkSpan:numChunks:TTL[:ready as bool. default true]
-# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false
-# 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
-# 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
+# with these aggregation rules: 5min:1h:2:3mon,1h:6h:2:1y:false you get:
+# - 5 min of data, store in a chunk that lasts 1hour, keep 2 chunks in in-memory ring buffer, keep for 3months in cassandra
+# - 1hr worth of data, in chunks of 6 hours, 2 chunks in in-memory ring buffer, keep for 1 year, but this series is not ready yet for querying.
 # When running a cluster of metrictank instances, all instances should have the same agg-settings.
-# chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# Note:
+# * chunk spans must be valid values as described here https://github.com/raintank/metrictank/blob/master/docs/memory-server.md#valid-chunk-spans
+# * numchunks -like the global setting- has nuanced use compared to chunk cache. see https://github.com/raintank/metrictank/blob/master/docs/memory-server.md
 agg-settings =
 
 ## metric data storage in cassandra ##
diff --git a/scripts/config/metrictank-package.ini b/scripts/config/metrictank-package.ini