Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metrics Tags and Redis #81

Merged
merged 10 commits into from
Jun 19, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ install:
script: nosetests tests
before_script:
- sudo apt-get install sysstat

env:
- DB=redis

106 changes: 69 additions & 37 deletions checks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""
Datadog agent
"""Base class for Checks.

If you are writing your own checks you should subclass the Check class.

The typical workflow works like this:
1. Create your Check class
2. Declare your metrics as gauges or counters
3. Call save_sample for each metric
4. Call get_metrics() to get results
5. Plug the results into checks/common.py

Licensed under Simplified BSD License (see LICENSE)
(C) Boxed Ice 2010 all rights reserved
(C) Datadog, Inc 2011 All Rights Reserved
"""

import logging
Expand Down Expand Up @@ -53,12 +58,17 @@ def filter(self, record):
class Check(object):
"""
(Abstract) class for all checks with the ability to:
* store 1 (and only 1) sample for gauges per metric/tag combination
* compute rates for counters
* only log error messages once (instead of each time they occur)

"""
def __init__(self, logger):
# where to store samples, indexed by metric_name
# metric_name: [(ts, value), (ts, value)]
# metric_name: {("sorted", "tags"): [(ts, value), (ts, value)],
# tuple(tags) is stored as the key since lists are not hashable
# None: [(ts, value), (ts, value)]}
# untagged values are indexed by None
self._sample_store = {}
self._counters = {} # metric_name: bool
self.logger = logger
Expand Down Expand Up @@ -92,7 +102,7 @@ def counter(self, metric):
ACHTUNG: Resets previous values associated with this metric.
"""
self._counters[metric] = True
self._sample_store[metric] = []
self._sample_store[metric] = {}

def is_counter(self, metric):
"Is this metric a counter?"
Expand All @@ -103,7 +113,7 @@ def gauge(self, metric):
Treats the metric as a gauge, i.e. keep the data as is
ACHTUNG: Resets previous values associated with this metric.
"""
self._sample_store[metric] = []
self._sample_store[metric] = {}

def is_metric(self, metric):
return metric in self._sample_store
Expand All @@ -116,32 +126,47 @@ def get_metric_names(self):
"Get all metric names"
return self._sample_store.keys()

def save_sample(self, metric, value, timestamp=None):
"""Save a simple sample, evict old values if needed"""
def save_gauge(self, metric, value, timestamp=None, tags=None):
    """Record one gauge data point.

    Declares *metric* as a gauge on first use, then delegates the
    actual storage to save_sample().
    """
    if not self.is_gauge(metric):
        # First time we see this metric: register it as a gauge so
        # save_sample() will accept the value.
        self.gauge(metric)
    self.save_sample(metric, value, timestamp, tags)

def save_sample(self, metric, value, timestamp=None, tags=None):
    """Save a single sample for a declared metric, evicting old values.

    Gauges keep only the latest (timestamp, value) pair per tag
    combination; counters keep the last two so that a rate can be
    computed later.

    @param metric: name of a metric previously declared via gauge() or counter()
    @param value: sample value, coerced to float
    @param timestamp: epoch seconds; defaults to now
    @param tags: optional list or tuple of tag strings
    @raise CheckException: if the metric is undeclared, tags is not a
        list/tuple, or the metric is neither gauge nor counter
    @raise NaN: if value cannot be converted to float
    """
    if timestamp is None:
        timestamp = time.time()
    if metric not in self._sample_store:
        raise CheckException("Saving a sample for an undefined metric: %s" % metric)
    try:
        value = float(value)
    except ValueError as ve:
        # "except ... as" works on Python 2.6+ and 3.x alike
        raise NaN(ve)

    # Sort and validate tags. Sorting makes the tuple key canonical so
    # ["a", "b"] and ["b", "a"] land in the same bucket.
    if tags is not None:
        if not isinstance(tags, (list, tuple)):
            raise CheckException("Tags must be a list or tuple of strings")
        tags = tuple(sorted(tags))

    # Data eviction rules
    if self.is_gauge(metric):
        self._sample_store[metric][tags] = ((timestamp, value), )
    elif self.is_counter(metric):
        if self._sample_store[metric].get(tags) is None:
            self._sample_store[metric][tags] = [(timestamp, value)]
        else:
            # Keep at most the previous sample plus the new one
            self._sample_store[metric][tags] = self._sample_store[metric][tags][-1:] + [(timestamp, value)]
    else:
        raise CheckException("%s must be either gauge or counter, skipping sample at %s" % (metric, time.ctime(timestamp)))

    if self.is_gauge(metric):
        # store[metric][tags] = (ts, val) - only 1 value allowed
        assert len(self._sample_store[metric][tags]) == 1, self._sample_store[metric]
    elif self.is_counter(metric):
        assert len(self._sample_store[metric][tags]) in (1, 2), self._sample_store[metric]

@classmethod
def _rate(cls, sample1, sample2):
Expand All @@ -163,33 +188,39 @@ def _rate(cls, sample1, sample2):
except Exception, e:
raise NaN(e)

def get_sample_with_timestamp(self, metric, tags=None):
    """Return (timestamp-epoch-style, value) for a metric/tags combination.

    For counters this is the rate computed from the last two samples;
    for gauges it is the most recent sample.

    @raise UnknownValue: if the metric or its tags combination has never
        been sampled, or a counter does not yet have two samples
    """
    # Canonicalize tags to the same sorted-tuple key used by
    # save_sample, without mutating the caller's list.
    if tags is not None and isinstance(tags, (list, tuple)):
        tags = tuple(sorted(tags))

    # Never seen this metric
    if metric not in self._sample_store:
        raise UnknownValue()

    samples = self._sample_store[metric].get(tags)
    # Never seen this tags combination for the metric: report it as an
    # unknown value instead of leaking a KeyError to the caller.
    if samples is None:
        raise UnknownValue()

    if self.is_counter(metric):
        # Not enough values to compute a rate
        if len(samples) < 2:
            raise UnknownValue()
        return self._rate(samples[-2], samples[-1])
    elif self.is_gauge(metric) and len(samples) >= 1:
        return samples[-1]
    else:
        raise UnknownValue()

def get_sample(self, metric, tags=None):
    """Return only the value (no timestamp) of the latest sample.

    @raise UnknownValue: propagated from get_sample_with_timestamp
    """
    x = self.get_sample_with_timestamp(metric, tags)
    # types.TupleType is just an alias of tuple (and is gone in
    # Python 3); compare against the builtin directly.
    assert type(x) == tuple and len(x) == 2, x
    return x[1]

def get_samples_with_timestamps(self):
"Return all values {metric: (ts, value)}"
"Return all values {metric: (ts, value)} for non-tagged metrics"
values = {}
for m in self._sample_store:
try:
Expand All @@ -199,7 +230,7 @@ def get_samples_with_timestamps(self):
return values

def get_samples(self):
"Return all values {metric: value}"
"Return all values {metric: value} for non-tagged metrics"
values = {}
for m in self._sample_store:
try:
Expand All @@ -209,21 +240,22 @@ def get_samples(self):
pass
return values

def get_metadata(self):
    """Return metadata as a dict of key/value pairs.

    The base implementation has none; subclasses may override.  How the
    metadata is interpreted and processed is not defined here.
    """
    metadata = {}
    return metadata

def get_metrics(self):
    """Get all metrics, including the ones that are tagged.
    This is the preferred method to retrieve metrics.

    @return the list of samples
    @rtype [(metric_name, timestamp, value, {"tags": ["tag1", "tag2"]}), ...]
    """
    metrics = []
    for m in self._sample_store:
        for t in self._sample_store[m]:
            # try/except sits inside the loop so one unresolved
            # metric/tags pair (e.g. a counter with a single sample)
            # does not silently drop the remaining tag combinations.
            try:
                ts, val = self.get_sample_with_timestamp(m, t)
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt and
                # SystemExit still propagate.
                continue
            if t is None:
                metrics.append((m, int(ts), val, {}))
            else:
                metrics.append((m, int(ts), val, {"tags": list(t)}))
    return metrics
Expand Down
15 changes: 6 additions & 9 deletions checks/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,12 @@ def __init__(self, agentConfig, emitter):
self._dogstream = Dogstreams.init(self.checksLogger, self.agentConfig)
self._ddforwarder = DdForwarder(self.checksLogger, self.agentConfig)

self._metrics_checks = [Cacti(self.checksLogger), Varnish(self.checksLogger)]
# All new checks should be metrics checks:
self._metrics_checks = [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, I didn't know we were starting to move to this pattern. That means we're in a position to implement conf.d-style custom checks.

Cacti(self.checksLogger),
Redis(self.checksLogger),
Varnish(self.checksLogger)
]
self._event_checks = [Hudson(), Nagios(socket.gethostname())]
self._resources_checks = [ResProcesses(self.checksLogger,self.agentConfig)]

Expand Down Expand Up @@ -177,10 +182,6 @@ def getGangliaData(self):
def getCassandraData(self):
return self._cassandra.check(self.checksLogger, self.agentConfig)

@recordsize
def getRedisData(self):
return self._redis.check(self.agentConfig)

@recordsize
def getJvmData(self):
return self._jvm.check(self.agentConfig)
Expand Down Expand Up @@ -234,7 +235,6 @@ def doChecks(self, firstRun=False, systemStats=False):
cpuStats = self.getCPUStats()
gangliaData = self.getGangliaData()
cassandraData = self.getCassandraData()
redisData = self.getRedisData()
jvmData = self.getJvmData()
tomcatData = self.getTomcatData()
activeMQData = self.getActiveMQData()
Expand Down Expand Up @@ -315,9 +315,6 @@ def doChecks(self, firstRun=False, systemStats=False):
if ioStats:
checksData['ioStats'] = ioStats

if redisData:
checksData['redis'] = redisData

if jvmData:
checksData['jvm'] = jvmData

Expand Down
Loading