# HEX
#
# File: //proc/self/root/lib/fm-agent/plugins/cpu_usage.py
import agent_util
import time
import sys
import platform
import os
import socket
from agent_util import float

try:
    import psutil
except:
    psutil = None


try:
    import distro
except:
    distro = None


def search_esxtop(headers, search_string):
    """Locate the first header that contains ``search_string``.

    Args:
        headers: Sequence of column-header strings.
        search_string: Substring to look for inside each header.

    Returns:
        The zero-based position of the first matching header, or ``None``
        when no header contains ``search_string``. Note that a match in the
        first column yields ``0``, which is falsy - compare against ``None``.
    """
    matches = (
        position
        for position, header in enumerate(headers)
        if search_string in header
    )
    return next(matches, None)


def get_cpu_metrics(cls):
    """Parse ``cat /proc/stat`` into per-core cumulative CPU counters.

    Args:
        cls: Object exposing a ``log`` logger (the plugin class or an
            instance); only ``cls.log.debug`` is used.

    Returns:
        dict mapping a core name to a dict of integer counters. The aggregate
        ``cpu`` row is stored under ``"Total"``; every other ``cpu*`` row
        keeps its own name. Rows carrying at least ten counters expose all
        ten fields (``user`` .. ``guest_nice``); rows carrying seven to nine
        counters expose only the first seven (``user`` .. ``softirq``); rows
        with fewer counters are skipped.

    Raises:
        ValueError: if a counter that is read is not an integer literal.
    """
    retcode, output = agent_util.execute_command("cat /proc/stat")
    cls.log.debug("cat /proc/stat output: %s" % str(output))

    # Column order of the counters that follow the row name.
    stat_fields = (
        "user",
        "nice",
        "system",
        "idle",
        "iowait",
        "irq",
        "softirq",
        "steal",
        "guest",
        "guest_nice",
    )
    # Rows that lack steal/guest/guest_nice only report the first seven.
    legacy_field_count = 7

    cpus = {}
    for line in output.splitlines():
        if not line.startswith("cpu"):
            continue
        # split() without arguments already drops the empty strings produced
        # by the column padding, on Python 2 and Python 3 alike.
        parts = line.split()
        core = parts[0]
        if core == "cpu":
            core = "Total"

        values = parts[1:]
        if len(values) >= len(stat_fields):
            names = stat_fields
        elif len(values) >= legacy_field_count:
            # The previous test (len(parts) > 8) wrongly rejected rows with
            # exactly seven counters even though only seven are read here.
            names = stat_fields[:legacy_field_count]
        else:
            continue
        cpus[core] = dict(zip(names, map(int, values[: len(names)])))
    return cpus


class CPUUsagePlugin(agent_util.Plugin):
    """Agent plugin reporting CPU load averages and CPU usage percentages.

    ``get_metadata`` advertises the metrics available on the current platform
    (selected by substring match on ``sys.platform``) and ``check`` collects
    the value of one metric per call.
    """

    textkey = "cpu_usage"
    label = "CPU"

    # (textkey, label) of the load-average metrics advertised on every platform.
    _CPU_LOAD_AVERAGE_LABELS = (
        ("load_average.1", "1 minute CPU load average"),
        ("load_average.5", "5 minute CPU load average"),
        ("load_average.15", "15 minute CPU load average"),
    )

    # Labels of the percentage metrics, keyed by textkey.
    # NOTE: "softirg_usage_percentage" is misspelled, but the textkey is the
    # identifier under which the metric is published and looked up in
    # check(), so it is intentionally left untouched.
    _CPU_PERCENT_LABELS = {
        "user_usage_percentage": "User usage percentage",
        "system_usage_percentage": "System usage percentage",
        "idle_usage_percentage": "Idle usage percentage",
        "iowait_usage_percentage": "I/O Wait usage percentage",
        "cpu_entitlement_percentage": "CPU entitlement percentage",
        "irq_usage_percentage": "Hardware IRQ usage percentage",
        "softirg_usage_percentage": "Software IRQ usage percentage",
        "stealtime_usage_percentage": "Steal Time usage percentage",
        "nice_usage_percentage": "Nice usage percentage",
    }

    # psutil.cpu_times() attribute backing each detailed usage textkey.
    _CPU_TIMES_FIELD_BY_TEXTKEY = {
        "user_usage_percentage": "user",
        "system_usage_percentage": "system",
        "idle_usage_percentage": "idle",
        "iowait_usage_percentage": "iowait",
        "irq_usage_percentage": "irq",
        "softirg_usage_percentage": "softirq",
        "stealtime_usage_percentage": "steal",
        "nice_usage_percentage": "nice",
    }

    # Position of each load average, counted from the end of the split line.
    _CPU_LOAD_AVERAGE_COLUMN = {
        "load_average.1": -3,
        "load_average.5": -2,
        "load_average.15": -1,
    }

    # ------------------------------------------------------------------
    # Metadata
    # ------------------------------------------------------------------

    @staticmethod
    def _cpu_metric_entry(label, unit, status, msg, options=None, bounded=False):
        """Build the metadata dict describing a single metric.

        ``bounded`` adds ``min_value``/``max_value`` of 0 and 100.
        """
        entry = {
            "label": label,
            "options": options,
            "status": status,
            "error_message": msg,
        }
        if bounded:
            entry["min_value"] = 0
            entry["max_value"] = 100
        entry["unit"] = unit
        return entry

    @classmethod
    def _cpu_build_metadata(
        cls,
        status,
        msg,
        usage_options,
        percent_keys,
        usage_label="Usage percentage",
        bounded=False,
    ):
        """Assemble load averages, ``usage_percentage`` and ``percent_keys``.

        ``usage_options`` becomes the option list of ``usage_percentage``;
        every metric named in ``percent_keys`` is option-less.
        """
        metadata = {}
        for key, label in cls._CPU_LOAD_AVERAGE_LABELS:
            metadata[key] = cls._cpu_metric_entry(label, "avg", status, msg)
        metadata["usage_percentage"] = cls._cpu_metric_entry(
            usage_label, "percent", status, msg, options=usage_options, bounded=bounded
        )
        for key in percent_keys:
            metadata[key] = cls._cpu_metric_entry(
                cls._CPU_PERCENT_LABELS[key], "percent", status, msg
            )
        return metadata

    @classmethod
    def get_metadata(cls, config):
        """Describe the CPU metrics this plugin can collect on this platform.

        Args:
            config: Plugin configuration; not read by this method.

        Returns:
            dict mapping each metric textkey to a dict with the keys
            ``label``, ``options``, ``status``, ``error_message`` and ``unit``
            (plus ``min_value``/``max_value`` for the VMware metrics).

        Raises:
            ValueError: on the default (Linux) path when ``platform.dist`` is
                unavailable and the ``distro`` package is not installed.
        """
        status = agent_util.SUPPORTED
        msg = None
        platform_name = sys.platform
        base_keys = (
            "user_usage_percentage",
            "system_usage_percentage",
            "idle_usage_percentage",
        )

        if "aix" in platform_name:
            return cls._cpu_build_metadata(
                status,
                msg,
                sorted(get_cpu_metrics(cls).keys()),
                base_keys + ("iowait_usage_percentage", "cpu_entitlement_percentage"),
            )

        if "sunos" in platform_name:
            return cls._cpu_build_metadata(
                status,
                msg,
                sorted(get_cpu_metrics(cls).keys()),
                base_keys + ("iowait_usage_percentage",),
            )

        if "freebsd" in platform_name or "darwin" in platform_name:
            return cls._cpu_build_metadata(status, msg, ["Total"], base_keys)

        if "hp-ux" in platform_name:
            return cls._cpu_build_metadata(
                status,
                msg,
                ["Total"],
                base_keys,
                usage_label="Total Usage percentage",
            )

        if "vmware" in platform_name:
            # Gather the CPU cores that can be monitored and add a Total
            # aggregation as the last option.
            ret, out = agent_util.execute_command(
                'esxcli hardware cpu list | grep "CPU:"'
            )
            cpus = ["Cpu (%s)" % row.split(":")[1] for row in out.split("\n") if row != ""]
            cpus.append("Total")

            metadata = cls._cpu_build_metadata(status, msg, cpus, (), bounded=True)
            # Unlike the other platforms, idle usage is reported per core here.
            metadata["idle_usage_percentage"] = cls._cpu_metric_entry(
                cls._CPU_PERCENT_LABELS["idle_usage_percentage"],
                "percent",
                status,
                msg,
                options=cpus,
                bounded=True,
            )
            return metadata

        # Default: core Linux.
        if psutil is None:
            cls.log.info(
                "Unable to import psutil library, no process metrics available"
            )
            status = agent_util.UNSUPPORTED
            msg = "Unable to import psutil library, please install and rebuild metadata"
        if not agent_util.which("top", exc=False):
            cls.log.info("top binary not found")
            status = agent_util.UNSUPPORTED
            msg = "top binary not found"

        try:
            distro_info = platform.dist()
        except AttributeError:
            # platform.dist() no longer exists on recent Python versions;
            # fall back to the optional third-party "distro" package.
            if not distro:
                raise ValueError(
                    "Unable to grab distribution information. Please verify dependencies. Distro for Python3.8"
                )
            distro_info = distro.linux_distribution()

        # Normalise to one lower-case string so the test behaves the same for
        # both sources: platform.dist() yields short ids such as "centos",
        # while distro yields full names such as "CentOS Linux" or
        # "Red Hat Enterprise Linux", which the former case-sensitive
        # comparison never matched.
        distro_text = " ".join(str(part) for part in distro_info).lower()
        rpm_family = ("centos", "redhat", "red hat", "oracle")
        if any(name in distro_text for name in rpm_family) and not agent_util.which(
            "iostat", exc=False
        ):
            cls.log.info("Missing sysstat package.")
            status = agent_util.UNSUPPORTED
            msg = "iostat/sysstat binary not found. Please install"

        return cls._cpu_build_metadata(
            status,
            msg,
            sorted(get_cpu_metrics(cls).keys()),
            base_keys
            + (
                "iowait_usage_percentage",
                "irq_usage_percentage",
                "softirg_usage_percentage",
                "stealtime_usage_percentage",
                "nice_usage_percentage",
            ),
        )

    # ------------------------------------------------------------------
    # Collection
    # ------------------------------------------------------------------

    # The {} default is kept for interface compatibility; it is never mutated.
    def check(self, textkey, data, config={}):
        """Collect the current value of one metric.

        Args:
            textkey: Metric textkey, e.g. ``"load_average.1"`` or
                ``"usage_percentage"``.
            data: Option selected for the metric (``"Total"`` or a core name
                for ``usage_percentage``); unused by most metrics.
            config: Plugin configuration; only the ``debug`` flag is read, and
                only on the default (Linux) path.

        Returns:
            The metric value as a float, or ``None`` when it cannot be
            determined. On the default path an unrecognised textkey yields
            ``0``.
        """
        platform_name = sys.platform
        if (
            "aix" in platform_name
            or "darwin" in platform_name
            or "freebsd" in platform_name
        ):
            return self._cpu_check_iostat(textkey)
        if "sunos" in platform_name:
            return self._cpu_check_sunos(textkey)
        if "vmware" in platform_name:
            return self._cpu_check_vmware(textkey, data)
        if "hp-ux" in platform_name:
            return self._cpu_check_hpux(textkey)
        return self._cpu_check_default(textkey, data, config or {})

    @classmethod
    def _cpu_uptime_load_average(cls, textkey):
        """Read a load average from the last three fields of ``uptime``."""
        retcode, load = agent_util.execute_command("uptime")
        column = cls._CPU_LOAD_AVERAGE_COLUMN.get(textkey)
        if column is None:
            return None
        fields = load.strip().split()
        # The 1 and 5 minute values carry a trailing comma; stripping it from
        # the last field as well is harmless.
        return float(fields[column].strip(","))

    def _cpu_check_iostat(self, textkey):
        """Collect metrics on AIX, macOS and FreeBSD via ``uptime``/``iostat``."""
        if textkey.startswith("load_average"):
            return self._cpu_uptime_load_average(textkey)

        iostat = str(agent_util.which("iostat"))
        if "aix" in sys.platform:
            command = iostat + " | grep -p tty"
        else:
            # darwin / freebsd: keep only the last line, i.e. the second sample.
            command = iostat + " -C -c 2 | tail -1"
        retcode, output = agent_util.execute_command(command)

        output = output.strip().split("\n")
        self.log.debug("iostat output: %s" % output)
        enti = False
        entc = 0
        inuse = 0
        user = 0
        system = 0
        idle = 0
        iowait = 0
        for line in output:
            if line.startswith("tty"):
                # AIX header row; a trailing "entc" column means the data row
                # ends with the entitlement percentage.
                if "entc" in line.split()[-1]:
                    enti = True
                continue
            fields = line.split()
            if not fields:
                # Blank separator line - nothing to parse.
                continue
            if "darwin" in sys.platform:
                user = float(fields[-6])
                system = float(fields[-5])
                idle = float(fields[-4])
            elif "freebsd" in sys.platform:
                # Was float(-5), which reported a constant -5.0 instead of
                # the column value.
                user = float(fields[-5])
                idle = float(fields[-1])
                system = float(fields[-3])
            else:
                user = float(fields[2])
                system = float(fields[3])
                idle = float(fields[4])
                iowait = float(fields[5])
                if enti:
                    entc = float(fields[-1])
            inuse = 100.0 - idle

        values = {
            "usage_percentage": inuse,
            "user_usage_percentage": user,
            "system_usage_percentage": system,
            "idle_usage_percentage": idle,
            "iowait_usage_percentage": iowait,
        }
        if enti:
            values["cpu_entitlement_percentage"] = entc
        # Anything else is a metric we do not know how to gather here.
        return values.get(textkey)

    def _cpu_check_sunos(self, textkey):
        """Collect metrics on Solaris via ``uptime``/``mpstat``."""
        if textkey.startswith("load_average"):
            return self._cpu_uptime_load_average(textkey)

        column_by_textkey = {
            "user_usage_percentage": -4,
            "system_usage_percentage": -3,
            "idle_usage_percentage": -1,
            "iowait_usage_percentage": -2,
        }
        retcode, output = agent_util.execute_command("mpstat")
        for line in output.split("\n"):
            fields = line.split()
            if "CPU" in line or not fields:
                # Header row or blank line.
                continue
            # The first data row answers the request.
            if textkey == "usage_percentage":
                return 100.0 - float(fields[-1])
            column = column_by_textkey.get(textkey)
            if column is not None:
                return float(fields[column])

        # If we got here we don't know how to gather this metric for Solaris.
        return None

    def _cpu_check_vmware(self, textkey, data):
        """Collect metrics on VMware ESXi by parsing ``esxtop`` batch output."""
        hostname = socket.gethostname()
        search_string = "\\\\%s\\Physical " % hostname

        ret, out = agent_util.execute_command(
            "esxtop -b -n 2 -d 2", cache_timeout=agent_util.DEFAULT_CACHE_TIMEOUT
        )
        out_list = out.split("\n")
        headers = out_list[0].replace('"', "").split(",")
        # The values come from the last non-empty line of the output.
        esxtop_data = []
        for row in reversed(out_list):
            if row:
                esxtop_data = row.replace('"', "").split(",")
                break

        # Finish building the header name to search for.
        if textkey.startswith("load_average"):
            search_string += (
                "Cpu Load\\Cpu Load (%s Minute Avg)" % textkey.split(".")[-1]
            )
        elif data and textkey in ("usage_percentage", "idle_usage_percentage"):
            if data == "Total":
                search_string += "Cpu(_Total)"
            else:
                search_string += data
            search_string += "\\% Processor Time"
        else:
            # Searching for the bare prefix would match the first "Physical"
            # column and report an unrelated value.
            self.log.error(
                "Unsupported ESXTOP metric %s (option %s)" % (textkey, data)
            )
            return None

        search_idx = search_esxtop(headers, search_string)
        # Compare against None: index 0 is a valid (falsy) match.
        if search_idx is None:
            self.log.error("Unable to parse ESXTOP output for %s" % search_string)
            return None

        if textkey == "idle_usage_percentage":
            return 100 - float(esxtop_data[search_idx])
        return float(esxtop_data[search_idx])

    def _cpu_check_hpux(self, textkey):
        """Collect metrics on HP-UX by parsing ``top`` output."""
        # add terminal specification for hpux
        os.environ["TERM"] = "xterm"
        # !!! applicable to HP-UX 11.31 !!!
        ret, out = agent_util.execute_command("top -s2 -d2", env=os.environ)
        top = out.strip().splitlines()
        self.log.debug(top)
        metric_mapping = {}
        cpu_str = ""
        load_str = ""
        for line in top:
            lowered = line.lower()
            if lowered.startswith("avg"):
                cpu_str = line
            elif lowered.startswith("load averages"):
                load_str = line

        cpu = cpu_str.replace("%", "").split()
        self.log.debug(cpu)
        metric_mapping["user_usage_percentage"] = float(cpu[2])
        metric_mapping["system_usage_percentage"] = float(cpu[4])
        metric_mapping["idle_usage_percentage"] = float(cpu[5])
        metric_mapping["usage_percentage"] = (
            100.0 - metric_mapping["idle_usage_percentage"]
        )

        load = load_str.strip().replace(",", "").split()
        self.log.debug(load)
        self.log.debug("'%s'" % load[4][:4])
        metric_mapping["load_average.1"] = float(load[2])
        metric_mapping["load_average.5"] = float(load[3])
        metric_mapping["load_average.15"] = float(load[4][:4])

        value = metric_mapping.get(textkey)
        if value is None:
            # Unknown textkey: float(None) used to raise TypeError here.
            return None
        return float(value)

    def _cpu_check_default(self, textkey, data, config):
        """Collect metrics on the default (Linux) platform via top/psutil."""
        if psutil is None:
            self.log.error("PSUTIL PACKAGE MISSING! UNABLE TO COLLECT CPU METRICS")
            return None

        if textkey.startswith("load_average"):
            return self._cpu_top_load_average(textkey, config)

        if textkey.endswith("usage_percentage") and textkey != "usage_percentage":
            return self._cpu_detailed_usage(textkey)

        if textkey == "usage_percentage":
            # str() keeps a missing or non-string option from raising on
            # .lower(); such a value falls through to the per-core lookup,
            # which rejects it.
            if str(data).lower() == "total":
                return self._cpu_total_usage()
            return self._cpu_core_usage(data)

        return 0

    def _cpu_top_load_average(self, textkey, config):
        """Read a load average from the summary line printed by ``top``."""
        column = self._CPU_LOAD_AVERAGE_COLUMN.get(textkey)
        if column is None:
            # Previously this left the column index unbound and crashed.
            self.log.error("Unknown resource textkey '%s'!" % textkey)
            return None

        retcode, output = agent_util.execute_command("top -b -n 2 -d 0.5")
        if config.get("debug", False):
            self.log.debug("#####################################################")
            self.log.debug("CPU usage command 'top -b -n 2 -d 0.5:")
            self.log.debug(str(output))
            self.log.debug("#####################################################")
        self.log.debug("top -b -n 2 -d 0.5: %s" % str(output))

        lines = output.splitlines()
        blank_positions = [0] + [pos for pos, line in enumerate(lines) if line == ""]
        # Start reading at the second blank line; when the output has fewer
        # blank lines than expected, fall back to the very first line instead
        # of raising IndexError.
        start = blank_positions[2] if len(blank_positions) > 2 else 0
        summary = [line for line in lines[start:] if line.strip()]
        if not summary:
            self.log.error("Unable to parse top output for %s" % textkey)
            return None

        fields = summary[0].split()
        return float(fields[column].strip(","))

    def _cpu_detailed_usage(self, textkey):
        """Return the share of time spent in one CPU state since the last call.

        The previous ``psutil.cpu_times()`` sample is kept in the plugin
        cache; the first call after a (re)start therefore returns ``None``.
        """
        key_name = self._CPU_TIMES_FIELD_BY_TEXTKEY.get(textkey)
        if key_name is None:
            self.log.error("Unknown resource textkey '%s'!" % textkey)
            return None

        num_cores = psutil.cpu_count() or 1
        cached = self.get_cache_results("psutil", "detailed_cpu_usage")
        self.log.debug("Retrieved cached value:\n%s" % cached)
        cur_cpu = psutil.cpu_times()
        self.log.debug("Retrieved instant value:\n%s" % getattr(cur_cpu, key_name))
        last_cpu = (cached[0][1] or None) if cached else None
        self.cache_result("psutil", "detailed_cpu_usage", cur_cpu, replace=True)
        if last_cpu is None:
            return None

        use_diff = (
            getattr(cur_cpu, key_name) - getattr(last_cpu, key_name)
        ) / num_cores
        if use_diff < 0:
            # The system was likely rebooted, and the cached CPU stats are no
            # longer relevant. New values are cached; report nothing.
            return None
        # presumably the age of the cached sample in seconds - TODO confirm
        # against get_cache_results().
        elapsed = cached[0][0]
        if not elapsed or elapsed <= 0:
            # Avoid ZeroDivisionError when no time has passed.
            return None
        return (use_diff / elapsed) * 100.0

    def _cpu_busy_percentage(self, cache_key, cached, cur_cpu, scale):
        """Turn two cpu_times samples into a busy percentage.

        Idle and steal time count as "not busy". ``scale`` divides every
        difference (number of cores for the aggregate sample, 1 per core).
        The current sample replaces the cached one under ``cache_key``.
        """
        self.log.debug("Retrieved instant value:\n%s" % cur_cpu.idle)
        last_cpu = (cached[0][1] or None) if cached else None
        self.cache_result("psutil", cache_key, cur_cpu, replace=True)
        if last_cpu is None:
            return None

        idle_diff = (cur_cpu.idle - last_cpu.idle) / scale
        # "steal" is not reported by psutil on every platform.
        steal_diff = (
            getattr(cur_cpu, "steal", 0.0) - getattr(last_cpu, "steal", 0.0)
        ) / scale
        if idle_diff < 0 or steal_diff < 0:
            # The system was likely rebooted, and the cached CPU stats are no
            # longer relevant. New values are cached; report nothing.
            return None
        use_diff = idle_diff + steal_diff
        # Instead of using the time between cache entries, compute the exact
        # time between the two samples from the sum of all counters. This
        # avoids negative results when the usage is very small.
        elapsed = (sum(cur_cpu) - sum(last_cpu)) / float(scale)
        if elapsed <= 0:
            # Two samples without any clock progress: nothing to report.
            return None
        return 100 - ((use_diff / elapsed) * 100.0)

    def _cpu_total_usage(self):
        """Busy percentage aggregated over all cores."""
        num_cores = psutil.cpu_count() or 1
        cached = self.get_cache_results("psutil", "total_cpu_usage")
        self.log.debug("Retrieved cached value:\n%s" % cached)
        cur_cpu = psutil.cpu_times()
        return self._cpu_busy_percentage("total_cpu_usage", cached, cur_cpu, num_cores)

    def _cpu_core_usage(self, data):
        """Busy percentage of the single core named by ``data`` (e.g. ``cpu0``)."""
        self.log.debug("Checking for core %s" % data)
        cache_key = "%s_cpu_usage" % data
        cached = self.get_cache_results("psutil", cache_key)
        self.log.debug("Retrieved cached value:\n%s" % cached)
        try:
            # Strip the "cpu" characters from both ends to get the core index.
            cur_cpu = psutil.cpu_times(percpu=True)[int(str(data).strip("cpu"))]
        except (IndexError, ValueError):
            # Unknown core number, or an option that is not a core name.
            self.log.critical("UNABLE TO FIND CPU #%s" % data)
            return None
        return self._cpu_busy_percentage(cache_key, cached, cur_cpu, 1)