File: //proc/self/root/lib/fm-agent/plugins/cpu_usage.py
import agent_util
import time
import sys
import platform
import os
import socket
from agent_util import float
try:
import psutil
except:
psutil = None
try:
import distro
except:
distro = None
def search_esxtop(headers, search_string):
    """Locate the first header that contains ``search_string``.

    Args:
        headers: Iterable of column header strings.
        search_string: Substring to look for inside each header.

    Returns:
        The zero-based position of the first matching header, or ``None``
        when no header contains the substring.  Position 0 is a valid match,
        so compare the result with ``is None`` rather than by truthiness.
    """
    matches = (
        position
        for position, header in enumerate(headers)
        if search_string in header
    )
    return next(matches, None)
def get_cpu_metrics(cls):
    """Read ``/proc/stat`` and return the cumulative CPU counters per core.

    Args:
        cls: Object exposing ``log.debug`` (the plugin class passes itself).

    Returns:
        dict: Maps a core name to a dict of integer counters keyed by field
        name.  The aggregate ``cpu`` line is stored under ``"Total"``; the
        per-core lines keep their own name (``cpu0``, ``cpu1``, ...).  Each
        inner dict holds as many of the known fields as the line provides.
        Lines with fewer than seven counters, or with counters that are not
        integers, are skipped.  An empty dict is returned when the command
        produces no ``cpu`` lines.
    """
    # Positional meaning of the counters that follow the core name on a
    # "cpu" line (see proc(5)).  Older kernels emit only a prefix of this
    # list, so every line is mapped onto as many names as it has counters.
    stat_fields = (
        "user",
        "nice",
        "system",
        "idle",
        "iowait",
        "irq",
        "softirq",
        "steal",
        "guest",
        "guest_nice",
    )
    # "user" through "softirq": the smallest field set the original parser
    # intended to accept.
    min_fields = 7

    retcode, output = agent_util.execute_command("cat /proc/stat")
    cls.log.debug("cat /proc/stat output: %s" % str(output))

    cpus = {}
    # Guard against a failed command handing back None instead of text.
    for line in (output or "").splitlines():
        if not line.startswith("cpu"):
            continue

        # split() without arguments collapses the double space that follows
        # the aggregate "cpu" label, so no empty tokens need filtering.
        parts = line.split()
        counters = parts[1 : 1 + len(stat_fields)]
        if len(counters) < min_fields:
            continue

        try:
            values = [int(counter) for counter in counters]
        except ValueError:
            cls.log.debug("Skipping unparsable /proc/stat line: %s" % line)
            continue

        core = "Total" if parts[0] == "cpu" else parts[0]
        # zip() stops at the shorter sequence, so lines with 7, 8 or 9
        # counters simply omit the trailing fields they do not report.
        cpus[core] = dict(zip(stat_fields, values))
    return cpus
class CPUUsagePlugin(agent_util.Plugin):
    """Report CPU load averages and utilisation percentages.

    ``get_metadata`` advertises the metrics offered on the current platform
    and ``check`` collects a single metric.  Both dispatch on
    ``sys.platform``: AIX, SunOS, FreeBSD/Darwin, HP-UX and VMware have
    dedicated branches; everything else takes the default branch, which
    relies on ``top`` and ``psutil``.

    Note that ``float`` inside this module is the name imported from
    ``agent_util`` at the top of the file, not necessarily the builtin.
    """

    textkey = "cpu_usage"
    label = "CPU"

    # Display labels of the percentage metrics.  The keys are runtime
    # identifiers (including the historical "softirg" spelling) and must be
    # kept byte-for-byte.
    _PERCENT_LABELS = {
        "user_usage_percentage": "User usage percentage",
        "system_usage_percentage": "System usage percentage",
        "idle_usage_percentage": "Idle usage percentage",
        "iowait_usage_percentage": "I/O Wait usage percentage",
        "cpu_entitlement_percentage": "CPU entitlement percentage",
        "irq_usage_percentage": "Hardware IRQ usage percentage",
        "softirg_usage_percentage": "Software IRQ usage percentage",
        "stealtime_usage_percentage": "Steal Time usage percentage",
        "nice_usage_percentage": "Nice usage percentage",
    }

    # Position of each load average, counted from the end of the
    # whitespace-split summary line printed by "uptime" and "top".
    _LOAD_AVERAGE_FIELD = {
        "load_average.1": -3,
        "load_average.5": -2,
        "load_average.15": -1,
    }

    # ------------------------------------------------------------------
    # Metadata
    # ------------------------------------------------------------------

    @staticmethod
    def _metric_entry(label, status, msg, unit, options=None):
        """Build the description dict of one metric."""
        return {
            "label": label,
            "options": options,
            "status": status,
            "error_message": msg,
            "unit": unit,
        }

    @classmethod
    def _build_metadata(
        cls, status, msg, usage_options, percent_keys, usage_label="Usage percentage"
    ):
        """Assemble the metadata shared by every platform.

        The result always contains the three load averages and
        ``usage_percentage`` (with ``usage_options`` as its options),
        followed by one option-less percentage entry per key in
        ``percent_keys``.
        """
        data = {}
        for window in ("1", "5", "15"):
            data["load_average.%s" % window] = cls._metric_entry(
                "%s minute CPU load average" % window, status, msg, "avg"
            )
        data["usage_percentage"] = cls._metric_entry(
            usage_label, status, msg, "percent", usage_options
        )
        for key in percent_keys:
            data[key] = cls._metric_entry(
                cls._PERCENT_LABELS[key], status, msg, "percent"
            )
        return data

    @classmethod
    def get_metadata(cls, config):
        """Describe the CPU metrics available on the current platform.

        Args:
            config: Plugin configuration; not read by this method.

        Returns:
            dict: Maps each metric textkey to a dict with ``label``,
            ``options``, ``status``, ``error_message`` and ``unit`` (the
            VMware percentage entries also carry ``min_value`` and
            ``max_value``).

        Raises:
            ValueError: In the default branch, when ``platform.dist`` is
                unavailable and the ``distro`` package could not be imported.
        """
        status = agent_util.SUPPORTED
        msg = None
        current = sys.platform

        if "aix" in current:
            return cls._build_metadata(
                status,
                msg,
                sorted(get_cpu_metrics(cls).keys()),
                (
                    "user_usage_percentage",
                    "system_usage_percentage",
                    "idle_usage_percentage",
                    "iowait_usage_percentage",
                    "cpu_entitlement_percentage",
                ),
            )

        if "sunos" in current:
            return cls._build_metadata(
                status,
                msg,
                sorted(get_cpu_metrics(cls).keys()),
                (
                    "user_usage_percentage",
                    "system_usage_percentage",
                    "idle_usage_percentage",
                    "iowait_usage_percentage",
                ),
            )

        if "freebsd" in current or "darwin" in current:
            return cls._build_metadata(
                status,
                msg,
                ["Total"],
                (
                    "user_usage_percentage",
                    "system_usage_percentage",
                    "idle_usage_percentage",
                ),
            )

        if "hp-ux" in current:
            return cls._build_metadata(
                status,
                msg,
                ["Total"],
                (
                    "user_usage_percentage",
                    "system_usage_percentage",
                    "idle_usage_percentage",
                ),
                usage_label="Total Usage percentage",
            )

        if "vmware" in current:
            # Gather the CPU cores that can be monitored and add a "Total"
            # aggregation as the last option.
            ret, out = agent_util.execute_command(
                'esxcli hardware cpu list | grep "CPU:"'
            )
            cpus = ["Cpu (%s)" % row.split(":")[1] for row in out.split("\n") if row != ""]
            cpus.append("Total")

            data = cls._build_metadata(status, msg, cpus, ())
            data["idle_usage_percentage"] = cls._metric_entry(
                cls._PERCENT_LABELS["idle_usage_percentage"],
                status,
                msg,
                "percent",
                cpus,
            )
            for key in ("usage_percentage", "idle_usage_percentage"):
                data[key]["min_value"] = 0
                data[key]["max_value"] = 100
            return data

        # Default branch (core Linux).
        if psutil is None:
            cls.log.info(
                "Unable to import psutil library, no process metrics available"
            )
            status = agent_util.UNSUPPORTED
            msg = "Unable to import psutil library, please install and rebuild metadata"

        if not agent_util.which("top", exc=False):
            cls.log.info("top binary not found")
            status = agent_util.UNSUPPORTED
            msg = "top binary not found"

        try:
            distro_info = platform.dist()
        except AttributeError:
            # platform.dist is missing on newer interpreters; fall back to
            # the optional "distro" package.
            if not distro:
                raise ValueError(
                    "Unable to grab distribution information. Please verify dependencies. Distro for Python3.8"
                )
            distro_info = ". ".join(distro.linux_distribution())

        # NOTE(review): this test is case-sensitive and is a tuple membership
        # test for platform.dist() but a substring test for the joined
        # distro string - confirm it matches the distributions intended.
        needs_iostat = (
            "centos" in distro_info
            or "redhat" in distro_info
            or "oracle" in distro_info
        )
        if needs_iostat and not agent_util.which("iostat", exc=False):
            cls.log.info("Missing sysstat package.")
            status = agent_util.UNSUPPORTED
            msg = "iostat/sysstat binary not found. Please install"

        return cls._build_metadata(
            status,
            msg,
            sorted(get_cpu_metrics(cls).keys()),
            (
                "user_usage_percentage",
                "system_usage_percentage",
                "idle_usage_percentage",
                "iowait_usage_percentage",
                "irq_usage_percentage",
                "softirg_usage_percentage",
                "stealtime_usage_percentage",
                "nice_usage_percentage",
            ),
        )

    # ------------------------------------------------------------------
    # Collection
    # ------------------------------------------------------------------

    def check(self, textkey, data, config=None):
        """Collect one CPU metric on the current platform.

        Args:
            textkey: Metric identifier, one of the keys advertised by
                ``get_metadata``.
            data: Option selected for the metric (a core name or
                ``"Total"``); only read by the VMware and default branches.
            config: Optional plugin configuration dict.  Only its ``debug``
                flag is read, and only by the default branch.

        Returns:
            The metric value as a number, or ``None`` when it cannot be
            determined.  In the default branch a textkey that matches no
            known metric family yields ``0``.
        """
        if config is None:
            config = {}

        current = sys.platform
        if "aix" in current or "darwin" in current or "freebsd" in current:
            return self._check_iostat(textkey)
        if "sunos" in current:
            return self._check_sunos(textkey)
        if "vmware" in current:
            return self._check_vmware(textkey, data)
        if "hp-ux" in current:
            return self._check_hpux(textkey)
        return self._check_default(textkey, data, config)

    @classmethod
    def _uptime_load_average(cls, textkey):
        """Return the requested load average parsed from ``uptime``.

        Returns ``None`` for a textkey that is not one of the three known
        load averages.
        """
        retcode, load = agent_util.execute_command("uptime")
        fields = load.strip().split()
        position = cls._LOAD_AVERAGE_FIELD.get(textkey)
        if position is None:
            return None
        # The 1 and 5 minute values carry a trailing comma separator.
        return float(fields[position].strip(","))

    def _check_iostat(self, textkey):
        """Collect a metric on AIX, Darwin or FreeBSD via ``uptime``/``iostat``."""
        if textkey.startswith("load_average"):
            return self._uptime_load_average(textkey)

        is_darwin = "darwin" in sys.platform
        is_freebsd = "freebsd" in sys.platform

        iostat = str(agent_util.which("iostat"))
        if "aix" in sys.platform:
            retcode, output = agent_util.execute_command(iostat + " | grep -p tty")
        if is_darwin or is_freebsd:
            retcode, output = agent_util.execute_command(
                iostat + " -C -c 2 | tail -1"
            )
        output = output.strip().split("\n")
        self.log.debug("iostat output: %s" % output)

        # Set when the "tty" header line ends with an "entc" column.
        has_entitlement = False
        for line in output:
            if not line.strip():
                continue
            if line.startswith("tty"):
                if "entc" in line.split()[-1]:
                    has_entitlement = True
                continue

            fields = line.split()
            iowait = 0
            entc = 0
            if is_darwin:
                user = float(fields[-6])
                system = float(fields[-5])
                idle = float(fields[-4])
            elif is_freebsd:
                # The trailing columns are read as: user (-5), system (-3)
                # and idle (-1).  The original passed the literal -5 to
                # float() instead of indexing the fields.
                user = float(fields[-5])
                system = float(fields[-3])
                idle = float(fields[-1])
            else:
                user = float(fields[2])
                system = float(fields[3])
                idle = float(fields[4])
                iowait = float(fields[5])
                if has_entitlement:
                    entc = float(fields[-1])

            values = {
                "usage_percentage": 100.0 - idle,
                "user_usage_percentage": user,
                "system_usage_percentage": system,
                "idle_usage_percentage": idle,
                "iowait_usage_percentage": iowait,
            }
            if textkey in values:
                return values[textkey]
            if textkey == "cpu_entitlement_percentage" and has_entitlement:
                return entc

        # If we got here, we don't know how to gather this metric.
        return None

    def _check_sunos(self, textkey):
        """Collect a metric on SunOS via ``uptime``/``mpstat``.

        Percentages are taken from the first ``mpstat`` line that is neither
        blank nor a header containing ``CPU``.
        """
        if textkey.startswith("load_average"):
            return self._uptime_load_average(textkey)

        # Column of each metric counted from the end of an mpstat row.
        column_by_textkey = {
            "user_usage_percentage": -4,
            "system_usage_percentage": -3,
            "iowait_usage_percentage": -2,
            "idle_usage_percentage": -1,
        }

        retcode, output = agent_util.execute_command("mpstat")
        for line in output.split("\n"):
            if "CPU" in line or not line.strip():
                continue
            fields = line.split()
            if textkey == "usage_percentage":
                return 100.0 - float(fields[-1])
            if textkey in column_by_textkey:
                return float(fields[column_by_textkey[textkey]])

        # If we got here we don't know how to gather this metric for Solaris.
        return None

    def _check_vmware(self, textkey, data):
        """Collect a metric on VMware from the batch output of ``esxtop``."""
        hostname = socket.gethostname()
        search_string = "\\\\%s\\Physical " % hostname

        # Actually gather the data to parse.
        ret, out = agent_util.execute_command(
            "esxtop -b -n 2 -d 2", cache_timeout=agent_util.DEFAULT_CACHE_TIMEOUT
        )
        out_list = out.split("\n")
        headers = out_list[0].replace('"', "").split(",")

        # The values come from the last non-empty line of the output.
        rows = [row for row in out_list if row]
        esxtop_data = rows[-1].replace('"', "").split(",") if rows else []

        # Finish building the search string.
        if textkey.startswith("load_average"):
            search_string += (
                "Cpu Load\\Cpu Load (%s Minute Avg)" % textkey.split(".")[-1]
            )
        elif data and textkey in ("usage_percentage", "idle_usage_percentage"):
            if data == "Total":
                search_string += "Cpu(_Total)"
            else:
                search_string += data
            search_string += "\\% Processor Time"
        else:
            # Without a complete counter name the bare prefix would match
            # an unrelated column, so refuse instead of guessing.
            self.log.error("Unable to parse ESXTOP output for %s" % search_string)
            return None

        # Find the index in the headers and match it to the collected data.
        # Index 0 is a legitimate match, hence the explicit None comparison.
        search_idx = search_esxtop(headers, search_string)
        if search_idx is None:
            self.log.error("Unable to parse ESXTOP output for %s" % search_string)
            return None

        if textkey == "idle_usage_percentage":
            return 100 - float(esxtop_data[search_idx])
        return float(esxtop_data[search_idx])

    def _check_hpux(self, textkey):
        """Collect a metric on HP-UX by parsing ``top``."""
        # Add terminal specification for hpux.
        os.environ["TERM"] = "xterm"
        # !!! applicable to HP-UX 11.31 !!!
        ret, out = agent_util.execute_command("top -s2 -d2", env=os.environ)
        top = out.strip().splitlines()
        self.log.debug(top)

        cpu_str = ""
        load_str = ""
        for line in top:
            lowered = line.lower()
            if lowered.startswith("avg"):
                cpu_str = line
            elif lowered.startswith("load averages"):
                load_str = line

        metric_mapping = {}

        cpu = cpu_str.replace("%", "").split()
        self.log.debug(cpu)
        metric_mapping["user_usage_percentage"] = float(cpu[2])
        metric_mapping["system_usage_percentage"] = float(cpu[4])
        metric_mapping["idle_usage_percentage"] = float(cpu[5])
        metric_mapping["usage_percentage"] = (
            100.0 - metric_mapping["idle_usage_percentage"]
        )

        load = load_str.strip().replace(",", "").split()
        self.log.debug(load)
        self.log.debug("'%s'" % load[4][:4])
        metric_mapping["load_average.1"] = float(load[2])
        metric_mapping["load_average.5"] = float(load[3])
        metric_mapping["load_average.15"] = float(load[4][:4])

        # An unknown textkey has no entry; report None rather than passing
        # None to float().
        value = metric_mapping.get(textkey)
        if value is None:
            return None
        return float(value)

    def _check_default(self, textkey, data, config):
        """Collect a metric in the default branch using ``top`` and ``psutil``."""
        if psutil is None:
            self.log.error("PSUTIL PACKAGE MISSING! UNABLE TO COLLECT CPU METRICS")
            return None

        if textkey.startswith("load_average"):
            return self._top_load_average(textkey, config)
        if textkey.endswith("usage_percentage") and textkey != "usage_percentage":
            return self._state_usage(textkey)
        if textkey == "usage_percentage":
            # str() keeps a missing option (None) from raising; it then
            # fails the per-core lookup and is reported as not found.
            if str(data).lower() == "total":
                return self._total_usage()
            return self._core_usage(data)
        return 0

    def _top_load_average(self, textkey, config):
        """Return a load average from the second iteration of ``top``."""
        position = self._LOAD_AVERAGE_FIELD.get(textkey)
        if position is None:
            self.log.error("Unknown resource textkey '%s'!" % textkey)
            return None

        retcode, output = agent_util.execute_command("top -b -n 2 -d 0.5")
        if config.get("debug", False):
            self.log.debug("#####################################################")
            self.log.debug("CPU usage command 'top -b -n 2 -d 0.5:")
            self.log.debug(str(output))
            self.log.debug("#####################################################")
        self.log.debug("top -b -n 2 -d 0.5: %s" % str(output))

        output = output.splitlines()
        # Line numbers of the empty lines, seeded with 0.  Parsing starts at
        # the second empty line, which is where the second iteration's
        # summary is expected to begin.
        space_index = [0]
        for line_number, item in enumerate(output):
            if item == "":
                space_index.append(line_number)
        second_pass = [line for line in output[space_index[2] :] if line.strip()]

        fields = second_pass[0].split()
        return float(fields[position].strip(","))

    @staticmethod
    def _previous_sample(cached):
        """Return the sample stored in the first cache entry, if any."""
        if cached and cached[0][1]:
            return cached[0][1]
        return None

    def _state_usage(self, textkey):
        """Return the percentage of time spent in one CPU state."""
        num_cores = psutil.cpu_count()
        usage_textkey_map = {
            "user_usage_percentage": "user",
            "system_usage_percentage": "system",
            "idle_usage_percentage": "idle",
            "iowait_usage_percentage": "iowait",
            "irq_usage_percentage": "irq",
            "softirg_usage_percentage": "softirq",
            "stealtime_usage_percentage": "steal",
            "nice_usage_percentage": "nice",
        }
        key_name = usage_textkey_map.get(textkey, None)
        if key_name is None:
            self.log.error("Unknown resource textkey '%s'!" % textkey)
            return None

        # NOTE(review): every state shares the "detailed_cpu_usage" cache
        # entry, so checking two states back to back compares against a
        # sample taken moments earlier - confirm this is intended.
        c = self.get_cache_results("psutil", "detailed_cpu_usage")
        self.log.debug("Retrieved cached value:\n%s" % c)
        cur_cpu = psutil.cpu_times()
        self.log.debug("Retrieved instant value:\n%s" % getattr(cur_cpu, key_name))
        last_cpu = self._previous_sample(c)
        self.cache_result("psutil", "detailed_cpu_usage", cur_cpu, replace=True)
        if last_cpu is None:
            return None

        use_diff = (getattr(cur_cpu, key_name) - getattr(last_cpu, key_name)) / num_cores
        if use_diff < 0:
            # The system was likely rebooted, and the cached CPU stats are
            # no longer relevant.  The new values were cached above; exit
            # without reporting a value.
            return None

        # The first element of the cache entry is used as the elapsed time
        # (presumably seconds since the sample was cached - TODO confirm).
        elapsed = c[0][0]
        if not elapsed or elapsed < 0:
            # No measurable interval: avoid dividing by zero.
            return None
        return (use_diff / elapsed) * 100.0

    def _busy_percentage(self, cur_cpu, last_cpu, num_cores=None):
        """Return the non-idle, non-steal share of the interval in percent.

        When ``num_cores`` is given, the idle, steal and elapsed deltas are
        all divided by it, as the aggregate computation always did.  Returns
        ``None`` if a counter went backwards or no time elapsed.
        """
        idle_diff = cur_cpu.idle - last_cpu.idle
        steal_diff = cur_cpu.steal - last_cpu.steal
        # Instead of using the time between cache writes, derive the exact
        # interval from the difference of the summed counters.  This avoids
        # negative results when the usage is very small.
        elapsed = sum(cur_cpu) - sum(last_cpu)
        if num_cores is not None:
            idle_diff = idle_diff / num_cores
            steal_diff = steal_diff / num_cores
            elapsed = elapsed / float(num_cores)

        if idle_diff < 0 or steal_diff < 0:
            # The system was likely rebooted, and the cached CPU stats are
            # no longer relevant.
            return None
        if elapsed <= 0:
            # Two samples with identical counters: nothing to divide by.
            return None
        return 100 - (((idle_diff + steal_diff) / elapsed) * 100.0)

    def _total_usage(self):
        """Return the aggregate CPU usage percentage across all cores."""
        num_cores = psutil.cpu_count()
        c = self.get_cache_results("psutil", "total_cpu_usage")
        self.log.debug("Retrieved cached value:\n%s" % c)
        cur_cpu = psutil.cpu_times()
        self.log.debug("Retrieved instant value:\n%s" % cur_cpu.idle)
        last_cpu = self._previous_sample(c)
        self.cache_result("psutil", "total_cpu_usage", cur_cpu, replace=True)
        if last_cpu is None:
            return None
        return self._busy_percentage(cur_cpu, last_cpu, num_cores)

    def _core_usage(self, data):
        """Return the usage percentage of the single core named by ``data``."""
        self.log.debug("Checking for core %s" % data)
        c = self.get_cache_results("psutil", "%s_cpu_usage" % data)
        self.log.debug("Retrieved cached value:\n%s" % c)
        try:
            # Core options look like "cpu3"; strip the letters to get the
            # index into the per-CPU list.
            cur_cpu = psutil.cpu_times(percpu=True)[int(str(data).strip("cpu"))]
        except (IndexError, ValueError):
            self.log.critical("UNABLE TO FIND CPU #%s" % data)
            return None
        self.log.debug("Retrieved instant value:\n%s" % cur_cpu.idle)
        last_cpu = self._previous_sample(c)
        self.cache_result("psutil", "%s_cpu_usage" % data, cur_cpu, replace=True)
        if last_cpu is None:
            return None
        return self._busy_percentage(cur_cpu, last_cpu)