flyte
11/25/2013 - 2:58 PM

Nagios plugin to check CouchDB replication status

Nagios plugin to check CouchDB replication status

#!/usr/bin/python

import json, urllib2, sys, argparse

# Exit statuses handed to sys.exit() so Nagios can interpret the result.
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

# Command-line interface: the only option is how many replication jobs
# are expected to be present (defaults to 2).
p = argparse.ArgumentParser()
p.add_argument(
  "-n", "--number",
  type=int,
  default=2,
  help="Number of replication jobs there should be"
)

def get_json_from_url(url, timeout=10):
  """Request `url` and return its response body decoded from JSON.

  Args:
    url: Address to fetch.
    timeout: Seconds to wait on the connection before giving up, so an
      unresponsive server cannot stall the check forever.

  Returns:
    The object produced by decoding the response body as JSON.

  Raises:
    urllib2.URLError: (or its subclass HTTPError) if the request fails.
    ValueError: if the response body is not valid JSON.
  """
  req = urllib2.Request(url)
  res = urllib2.urlopen(req, timeout=timeout)
  try:
    return json.loads(res.read())
  finally:
    # Release the connection even when reading or decoding fails.
    res.close()


def _exit_with(status, message):
  """Print the single-line plugin output and terminate with `status`."""
  print(message)
  sys.exit(status)


def main():
  """Check the local CouchDB replication jobs and exit with a Nagios status.

  Reads the expected job count from the command line, lists the active
  tasks of the CouchDB at localhost:5984, and looks up the `_replicator`
  document of every replication task.

  Exit statuses:
    CRITICAL: CouchDB cannot be queried, no replication job is running,
      a job document cannot be fetched, a job is not in the "triggered"
      state, or the job count differs from the expected number.
    WARNING: two running jobs share the same document id.
    OK: the expected number of jobs are all in the "triggered" state.
  """
  args = p.parse_args()

  # URLError/HTTPError and socket errors all derive from IOError; a body
  # that is not JSON raises ValueError. Report either instead of dying
  # with a traceback, which Nagios would misread as a WARNING (exit 1).
  try:
    tasks = get_json_from_url("http://localhost:5984/_active_tasks")
  except (IOError, ValueError) as e:
    _exit_with(CRITICAL, "Could not get active tasks from CouchDB: %s" % e)

  # _active_tasks also reports indexer and compaction tasks, which carry
  # no "doc_id"; only replication tasks are relevant to this check.
  jobs = [task for task in tasks
          if isinstance(task, dict) and task.get("type") == "replication"]

  # Check that the replication jobs exist
  if not jobs:
    _exit_with(CRITICAL, "There are no replication jobs running")

  seen_ids = set()
  for job in jobs:
    doc_id = job.get("doc_id")
    try:
      job_status = get_json_from_url(
        "http://localhost:5984/_replicator/%s" % doc_id)
    except urllib2.HTTPError:
      _exit_with(
        CRITICAL,
        "HTTP Error when getting info for job '%s'. "
        "Maybe the job doesn't exist?" % doc_id)
    except (IOError, ValueError) as e:
      _exit_with(CRITICAL, "Could not get info for job '%s': %s" % (doc_id, e))

    # Check if job has errored. A document without the field is reported
    # as state None rather than crashing on the lookup.
    # NOTE(review): newer CouchDB releases may not write "triggered" into
    # the document any more - confirm against the target server version.
    state = job_status.get("_replication_state")
    if state != "triggered":
      _exit_with(
        CRITICAL,
        "Replication job '%s' is in state: %s" % (doc_id, state))

    # Check for duplicate jobs
    if doc_id in seen_ids:
      _exit_with(
        WARNING,
        "More than one replication job with the name '%s'" % doc_id)
    seen_ids.add(doc_id)

  amt_jobs = len(jobs)
  if amt_jobs != args.number:
    _exit_with(
      CRITICAL,
      "Amount of jobs (%d) different to expected amount (%d)"
      % (amt_jobs, args.number))

  plural = "" if amt_jobs == 1 else "s"
  _exit_with(OK, "%d replication job%s currently running OK" % (amt_jobs, plural))


# Run the check only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
  main()