Skip to content
This repository was archived by the owner on Aug 23, 2023. It is now read-only.

Commit f4f1746

Browse files
authored
Merge pull request #343 from raintank/cassandra-instrumentation
instrument cassandra errors
2 parents 0b00b1f + b68b58a commit f4f1746

File tree

5 files changed

+239
-16
lines changed

5 files changed

+239
-16
lines changed

cassandra/metrics.go

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package cassandra
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/gocql/gocql"
7+
"github.com/raintank/met"
8+
)
9+
10+
type Metrics struct {
11+
cassErrTimeout met.Count
12+
cassErrTooManyTimeouts met.Count
13+
cassErrConnClosed met.Count
14+
cassErrNoConns met.Count
15+
cassErrUnavailable met.Count
16+
cassErrOther met.Count
17+
}
18+
19+
func NewMetrics(component string, stats met.Backend) Metrics {
20+
return Metrics{
21+
cassErrTimeout: stats.NewCount(fmt.Sprintf("%s.error.timeout", component)),
22+
cassErrTooManyTimeouts: stats.NewCount(fmt.Sprintf("%s.error.too-many-timeouts", component)),
23+
cassErrConnClosed: stats.NewCount(fmt.Sprintf("%s.error.conn-closed", component)),
24+
cassErrNoConns: stats.NewCount(fmt.Sprintf("%s.error.no-connections", component)),
25+
cassErrUnavailable: stats.NewCount(fmt.Sprintf("%s.error.unavailable", component)),
26+
cassErrOther: stats.NewCount(fmt.Sprintf("%s.error.other", component)),
27+
}
28+
}
29+
30+
func (m *Metrics) Inc(err error) {
31+
if err == gocql.ErrTimeoutNoResponse {
32+
m.cassErrTimeout.Inc(1)
33+
} else if err == gocql.ErrTooManyTimeouts {
34+
m.cassErrTooManyTimeouts.Inc(1)
35+
} else if err == gocql.ErrConnectionClosed {
36+
m.cassErrConnClosed.Inc(1)
37+
} else if err == gocql.ErrNoConnections {
38+
m.cassErrNoConns.Inc(1)
39+
} else if err == gocql.ErrUnavailable {
40+
m.cassErrUnavailable.Inc(1)
41+
} else {
42+
m.cassErrOther.Inc(1)
43+
}
44+
}

dashboard.json

+175-13
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"type": "grafana",
2121
"id": "grafana",
2222
"name": "Grafana",
23-
"version": "3.1.1"
23+
"version": "4.0.0-pre1"
2424
},
2525
{
2626
"type": "datasource",
@@ -1700,7 +1700,7 @@
17001700
"lines": false
17011701
}
17021702
],
1703-
"span": 6,
1703+
"span": 4,
17041704
"stack": false,
17051705
"steppedLine": false,
17061706
"targets": [
@@ -1759,6 +1759,87 @@
17591759
}
17601760
]
17611761
},
1762+
{
1763+
"title": "cassandra store errors",
1764+
"error": false,
1765+
"span": 4,
1766+
"editable": true,
1767+
"type": "graph",
1768+
"isNew": true,
1769+
"id": 23,
1770+
"targets": [
1771+
{
1772+
"refId": "A",
1773+
"target": "aliasByNode(stats.$environment.metrictank.$host.cassandra.error.*, 6)"
1774+
}
1775+
],
1776+
"datasource": "${DS_GRAPHITE}",
1777+
"renderer": "flot",
1778+
"yaxes": [
1779+
{
1780+
"label": null,
1781+
"show": true,
1782+
"logBase": 1,
1783+
"min": null,
1784+
"max": null,
1785+
"format": "short"
1786+
},
1787+
{
1788+
"label": null,
1789+
"show": true,
1790+
"logBase": 1,
1791+
"min": null,
1792+
"max": null,
1793+
"format": "short"
1794+
}
1795+
],
1796+
"xaxis": {
1797+
"show": true
1798+
},
1799+
"grid": {
1800+
"threshold1": null,
1801+
"threshold2": null,
1802+
"threshold1Color": "rgba(216, 200, 27, 0.27)",
1803+
"threshold2Color": "rgba(234, 112, 112, 0.22)"
1804+
},
1805+
"lines": false,
1806+
"fill": 1,
1807+
"linewidth": 2,
1808+
"points": true,
1809+
"pointradius": 1,
1810+
"bars": false,
1811+
"stack": false,
1812+
"percentage": false,
1813+
"legend": {
1814+
"show": true,
1815+
"values": false,
1816+
"min": false,
1817+
"max": false,
1818+
"current": false,
1819+
"total": false,
1820+
"avg": false
1821+
},
1822+
"nullPointMode": "connected",
1823+
"steppedLine": false,
1824+
"tooltip": {
1825+
"value_type": "cumulative",
1826+
"shared": true,
1827+
"sort": 0,
1828+
"msResolution": false
1829+
},
1830+
"timeFrom": null,
1831+
"timeShift": null,
1832+
"aliasColors": {
1833+
"unavailable": "#890F02",
1834+
"too-many-timeouts": "#BA43A9",
1835+
"timeout": "#E24D42",
1836+
"no-connections": "#99440A",
1837+
"conn-closed": "#58140C",
1838+
"other": "#F2C96D"
1839+
},
1840+
"seriesOverrides": [],
1841+
"links": []
1842+
},
17621843
{
17631844
"aliasColors": {
17641845
"create/s": "#70DBED",
@@ -1795,7 +1876,7 @@
17951876
"points": true,
17961877
"renderer": "flot",
17971878
"seriesOverrides": [],
1798-
"span": 6,
1879+
"span": 4,
17991880
"stack": false,
18001881
"steppedLine": false,
18011882
"targets": [
@@ -1858,23 +1939,24 @@
18581939
"aliasColors": {
18591940
"add-to-bulk-index-fail": "#BF1B00",
18601941
"add-to-bulk-index-ok": "#3F6833",
1942+
"cassandra.fail": "#E24D42",
1943+
"cassandra.ok": "#3F6833",
1944+
"elasticsearch.fail": "#E0752D",
1945+
"elasticsearch.ok": "#3F6833",
18611946
"indexed-fail": "#E0752D",
18621947
"indexed-ok": "#629E51",
18631948
"mean": "#052B51",
1949+
"memory.fail": "#BF1B00",
1950+
"memory.ok": "#7EB26D",
18641951
"metrics_to_es.fail": "#BF1B00",
18651952
"metrics_to_es.ok": "#629E51",
18661953
"upper": "#DEDAF7",
18671954
"upper_75": "#0A50A1",
1868-
"upper_90": "#806EB7",
1869-
"cassandra.fail": "#E24D42",
1870-
"cassandra.ok": "#3F6833",
1871-
"memory.fail": "#BF1B00",
1872-
"memory.ok": "#7EB26D",
1873-
"elasticsearch.fail": "#E0752D",
1874-
"elasticsearch.ok": "#3F6833"
1955+
"upper_90": "#806EB7"
18751956
},
18761957
"bars": false,
18771958
"datasource": "${DS_GRAPHITE}",
1959+
"decimals": 0,
18781960
"editable": true,
18791961
"error": false,
18801962
"fill": 0,
@@ -1919,7 +2001,7 @@
19192001
"points": false
19202002
}
19212003
],
1922-
"span": 6,
2004+
"span": 4,
19232005
"stack": false,
19242006
"steppedLine": false,
19252007
"targets": [
@@ -1964,8 +2046,88 @@
19642046
"min": 0,
19652047
"show": true
19662048
}
2049+
]
2050+
},
2051+
{
2052+
"title": "cassandra idx errors",
2053+
"error": false,
2054+
"span": 4,
2055+
"editable": true,
2056+
"type": "graph",
2057+
"isNew": true,
2058+
"id": 22,
2059+
"targets": [
2060+
{
2061+
"refId": "A",
2062+
"target": "aliasByNode(stats.$environment.metrictank.$host.idx.cassandra.error.*, 7)"
2063+
}
19672064
],
1968-
"decimals": 0
2065+
"datasource": "${DS_GRAPHITE}",
2066+
"renderer": "flot",
2067+
"yaxes": [
2068+
{
2069+
"label": null,
2070+
"show": true,
2071+
"logBase": 1,
2072+
"min": null,
2073+
"max": null,
2074+
"format": "short"
2075+
},
2076+
{
2077+
"label": null,
2078+
"show": true,
2079+
"logBase": 1,
2080+
"min": null,
2081+
"max": null,
2082+
"format": "short"
2083+
}
2084+
],
2085+
"xaxis": {
2086+
"show": true
2087+
},
2088+
"grid": {
2089+
"threshold1": null,
2090+
"threshold2": null,
2091+
"threshold1Color": "rgba(216, 200, 27, 0.27)",
2092+
"threshold2Color": "rgba(234, 112, 112, 0.22)"
2093+
},
2094+
"lines": false,
2095+
"fill": 1,
2096+
"linewidth": 2,
2097+
"points": true,
2098+
"pointradius": 1,
2099+
"bars": false,
2100+
"stack": false,
2101+
"percentage": false,
2102+
"legend": {
2103+
"show": true,
2104+
"values": false,
2105+
"min": false,
2106+
"max": false,
2107+
"current": false,
2108+
"total": false,
2109+
"avg": false
2110+
},
2111+
"nullPointMode": "connected",
2112+
"steppedLine": false,
2113+
"tooltip": {
2114+
"value_type": "cumulative",
2115+
"shared": true,
2116+
"sort": 0,
2117+
"msResolution": false
2118+
},
2119+
"timeFrom": null,
2120+
"timeShift": null,
2121+
"aliasColors": {
2122+
"unavailable": "#890F02",
2123+
"too-many-timeouts": "#BA43A9",
2124+
"timeout": "#E24D42",
2125+
"no-connections": "#99440A",
2126+
"conn-closed": "#58140C",
2127+
"other": "#F2C96D"
2128+
},
2129+
"seriesOverrides": [],
2130+
"links": []
19692131
},
19702132
{
19712133
"aliasColors": {
@@ -2016,7 +2178,7 @@
20162178
"points": false
20172179
}
20182180
],
2019-
"span": 6,
2181+
"span": 4,
20202182
"stack": false,
20212183
"steppedLine": false,
20222184
"targets": [

docs/metrics.md

+4
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ and it was able to save its complete chunks, this node will be able to take over
2424
You can upgrade a candidate to primary while the timer is not 0 yet, it just means it may have missing data in the chunks that it will save.
2525
* `gc_metric`:
2626
the amount of times the metrics GC is about to inspect a metric (series)
27+
* `idx.cassandra.ok`:
28+
how many metrics are successfully being indexed
29+
* `idx.cassandra.fail`:
30+
how many failures were encountered while trying to index metrics
2731
* `metrics_active`:
2832
the amount of currently known metrics (excl rollup series), measured every second
2933
* `metrics_too_old`:

idx/cassandra/cassandra.go

+8-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010

1111
"github.com/gocql/gocql"
1212
"github.com/raintank/met"
13+
"github.com/raintank/metrictank/cassandra"
1314
"github.com/raintank/metrictank/idx"
1415
"github.com/raintank/metrictank/idx/memory"
1516
"github.com/raintank/worldping-api/pkg/log"
@@ -25,10 +26,11 @@ const table_schema = `CREATE TABLE IF NOT EXISTS %s.metric_def_idx (
2526
AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}`
2627

2728
var (
28-
idxCasOk met.Count
29-
idxCasFail met.Count
29+
idxCasOk met.Count // metric idx.cassandra.ok is how many metrics are successfully being indexed
30+
idxCasFail met.Count // metric idx.cassandra.fail is how many failures were encountered while trying to index metrics
3031
idxCasAddDuration met.Timer
3132
idxCasDeleteDuration met.Timer
33+
metrics cassandra.Metrics
3234

3335
Enabled bool
3436
keyspace string
@@ -129,6 +131,7 @@ func (c *CasIdx) Init(stats met.Backend) error {
129131
idxCasFail = stats.NewCount("idx.cassandra.fail")
130132
idxCasAddDuration = stats.NewTimer("idx.cassandra.add_duration", 0)
131133
idxCasDeleteDuration = stats.NewTimer("idx.cassandra.delete_duration", 0)
134+
metrics = cassandra.NewMetrics("idx.cassandra", stats)
132135

133136
for i := 0; i < numConns; i++ {
134137
c.wg.Add(1)
@@ -219,6 +222,7 @@ func (c *CasIdx) processWriteQueue() {
219222
for !success {
220223
if err := c.session.Query(`INSERT INTO metric_def_idx (id, def) VALUES (?, ?)`, req.def.Id, data).Exec(); err != nil {
221224
idxCasFail.Inc(1)
225+
metrics.Inc(err)
222226
if (attempts % 20) == 0 {
223227
log.Warn("cassandra-idx Failed to write def to cassandra. it will be retried. %s", err)
224228
}
@@ -252,6 +256,7 @@ func (c *CasIdx) Delete(orgId int, pattern string) ([]schema.MetricDefinition, e
252256
attempts++
253257
cErr := c.session.Query("DELETE FROM metric_def_idx where id=?", def.Id).Exec()
254258
if cErr != nil {
259+
metrics.Inc(err)
255260
log.Error(3, "cassandra-idx Failed to delete metricDef %s from cassandra. %s", def.Id, err)
256261
time.Sleep(time.Second)
257262
} else {
@@ -274,6 +279,7 @@ func (c *CasIdx) Prune(orgId int, oldest time.Time) ([]schema.MetricDefinition,
274279
attempts++
275280
cErr := c.session.Query("DELETE FROM metric_def_idx where id=?", def.Id).Exec()
276281
if cErr != nil {
282+
metrics.Inc(err)
277283
log.Error(3, "cassandra-idx Failed to delete metricDef %s from cassandra. %s", def.Id, err)
278284
time.Sleep(time.Second)
279285
} else {

0 commit comments

Comments
 (0)