diff --git a/MonitoringTools/usr/local/bin/slottifier.py b/MonitoringTools/usr/local/bin/slottifier.py
new file mode 100644
index 00000000..7c09d238
--- /dev/null
+++ b/MonitoringTools/usr/local/bin/slottifier.py
@@ -0,0 +1,224 @@
+#!/usr/bin/python
+"""
+Calculates how many VMs of each OpenStack flavor could still be built on the hypervisors that can
+host them, and posts the results to InfluxDB.
+"""
+from typing import Dict, List
+import sys
+import re
+import time
+import openstack
+from slottifier_entry import SlottifierEntry
+from send_metric_utils import (
+    parse_args,
+    post_to_influxdb,
+)
+
+# flavors treated as gpu flavors that do not say how many gpus they need - collected while
+# calculating slots so that main() can report each of them once
+UNKNOWN_GPU_NUM_FLAVORS = []
+
+
+def get_hv_info(hv_name: str, hypervisors: Dict) -> Dict:
+    """
+    Helper function to get hv information on cores/memory available
+    :param hv_name: hypervisor name to get information from
+    :param hypervisors: a dictionary of all hypervisors, keyed by hypervisor name
+        (avoids long search times by getting each one from openstack)
+    :return: a dictionary of cores/memory available for given hv, both 0 if the hv is unknown or disabled
+    """
+    hv_info = {"cores_available": 0, "mem_available": 0}
+    hypervisor = hypervisors.get(hv_name, {})
+    if hypervisor and hypervisor["status"] != "disabled":
+        # clamped at 0 so that usage above the reported total never gives a negative availability
+        hv_info["cores_available"] = max(0, hypervisor.get("vcpus", 0) - hypervisor.get("vcpus_used", 0))
+        hv_info["mem_available"] = max(0, hypervisor.get("memory_size", 0) - hypervisor.get("memory_used", 0))
+
+    return hv_info
+
+
+def get_flavor_requirements(flavor) -> Dict:
+    """
+    Helper function to get flavor cores/memory/gpu requirements for a VM of that type to be built on a hv
+    :param flavor: flavor to get requirements from
+    :return: dictionary of requirements, a value the flavor does not provide is reported as 0
+    """
+    return {
+        "gpus_required": int(flavor["extra_specs"].get("accounting:gpu_num", 0)),
+        "cores_required": int(flavor.get("vcpus", 0)),
+        "mem_required": int(flavor.get("ram", 0)),
+    }
+
+
+def get_valid_flavors_for_hosttype(flavor_list: List, hypervisor_hosttype: str) -> List:
+    """
+    Helper function that filters a list of flavors to find those that can be built on a hypervisor with a given hosttype
+    :param flavor_list: a list of flavors to check
+    :param hypervisor_hosttype: specifies the hypervisor hosttype to find compatible flavors for
+    :return: a list of valid flavors for hosttype
+    """
+    hosttype_key = "aggregate_instance_extra_specs:hosttype"
+    # a flavor that does not name a hosttype cannot be matched to a host aggregate, so it is left out
+    return [
+        flavor
+        for flavor in flavor_list
+        if hosttype_key in flavor["extra_specs"]
+        and flavor["extra_specs"][hosttype_key] == hypervisor_hosttype
+    ]
+
+
+def convert_to_data_string(slots_dict: Dict, instance: str) -> str:
+    """
+    converts a dictionary of values into a data-string influxdb can read
+    :param slots_dict: a dictionary of slots available for each flavor
+    :param instance: which cloud the info was scraped from (prod or dev)
+    :return: one newline-terminated line per flavor, holding the tags and comma-separated key=value fields
+    """
+    return "".join(
+        f"SlotsAvailable,instance={instance},flavor={flavor}"
+        f" SlotsAvailable={slot_info.slots_available}"
+        f",maxSlotsAvailable={slot_info.max_gpu_slots_capacity}"
+        f",usedSlots={slot_info.estimated_gpu_slots_used}"
+        f",enabledSlots={slot_info.max_gpu_slots_capacity_enabled}\n"
+        for flavor, slot_info in slots_dict.items()
+    )
+
+
+def calculate_slots_for_flavor_for_hv(flavor_name, flavor_reqs, hv_info) -> SlottifierEntry:
+    """
+    Helper function that calculates available slots for a flavor on a given hypervisor
+    :param flavor_name: name of flavor
+    :param flavor_reqs: dictionary of memory, cpu, and gpu requirements of flavor (not modified)
+    :param hv_info: dictionary of memory, cpu, and gpu capacity/availability on hypervisor
+        and whether hv compute service is enabled
+    :return: A dataclass holding slottifer information to update with
+    """
+    slots_dataclass = SlottifierEntry()
+    hv_enabled = hv_info["compute_service_status"] == "enabled"
+
+    slots_available = 0
+    if hv_enabled:
+        # a requirement of 0 means the flavor did not provide it - it cannot limit the number of
+        # slots and must not be divided by
+        slot_limits = [
+            hv_info[available] // flavor_reqs[required]
+            for available, required in (
+                ("cores_available", "cores_required"),
+                ("mem_available", "mem_required"),
+            )
+            if flavor_reqs[required] > 0
+        ]
+        # with no usable requirement at all there is nothing to estimate from, so report no slots
+        slots_available = min(slot_limits, default=0)
+
+    if "g-" in flavor_name:
+        gpus_required = flavor_reqs["gpus_required"]
+
+        # workaround for bugs where gpu number not specified
+        if gpus_required == 0:
+            gpus_required = 1
+            if flavor_name not in UNKNOWN_GPU_NUM_FLAVORS:
+                UNKNOWN_GPU_NUM_FLAVORS.append(flavor_name)
+
+        # if the number of GPUs currently assigned on this host is 0, this is how many slots are available
+        theoretical_gpu_slots_available = hv_info["gpu_capacity"] // gpus_required
+
+        # estimated number of GPU slots used - based off of how much cpu/mem is currently being used
+        # clamped at 0 because free cpu/mem can allow more slots than the host has gpus
+        slots_dataclass.estimated_gpu_slots_used = max(0, hv_info["gpu_capacity"] - slots_available)
+
+        slots_dataclass.max_gpu_slots_capacity += hv_info["gpu_capacity"]
+
+        if hv_enabled:
+            slots_dataclass.max_gpu_slots_capacity_enabled = hv_info["gpu_capacity"]
+
+        slots_available = min(slots_available, theoretical_gpu_slots_available)
+
+    slots_dataclass.slots_available = slots_available
+    return slots_dataclass
+
+
+def get_slottifier_details(instance: str) -> str:
+    """
+    This function calculates slots available for each flavor in openstack and outputs results in
+    data string format which can be posted to InfluxDB
+    :param instance: which cloud to calculate slots for
+    :return: A data string of scraped info
+    """
+    conn = openstack.connect(cloud=instance)
+
+    # we get all openstack info first because it is quicker than getting them one at a time
+    # dictionaries prevent duplicates
+    all_compute_services = {service["id"]: service for service in conn.compute.services()}
+    print("got all compute services")
+    all_aggregates = {aggregate["id"]: aggregate for aggregate in conn.compute.aggregates()}
+    print("got all aggregates")
+    all_hypervisors = {h["name"]: h for h in conn.list_hypervisors()}
+    print("got all hypervisors")
+    all_flavors = {flavor["name"]: flavor for flavor in conn.compute.flavors(get_extra_specs=True)}
+    print("got all flavors")
+
+    # nova-compute services by host, so the hosts of an aggregate can be looked up directly
+    compute_service_for_host = {
+        service["host"]: service
+        for service in all_compute_services.values()
+        if service["binary"] == "nova-compute"
+    }
+    flavor_list = list(all_flavors.values())
+
+    slots_dict = {flavor_name: SlottifierEntry() for flavor_name in all_flavors}
+    for aggregate in all_aggregates.values():
+
+        hv_hosttype = aggregate["metadata"].get("hosttype", None)
+        if not hv_hosttype:
+            continue
+
+        gpu_capacity = int(aggregate["metadata"].get("gpunum", 0))
+
+        # the compatible flavors depend on the aggregate only, so they are worked out once per aggregate
+        valid_flavors = [
+            (flavor["name"], get_flavor_requirements(flavor))
+            for flavor in get_valid_flavors_for_hosttype(flavor_list, hv_hosttype)
+        ]
+
+        # only hosts that belong to the aggregate count towards its flavors - the hosttype and gpunum
+        # metadata say nothing about any other host
+        for host in aggregate["hosts"] or []:
+            compute_service = compute_service_for_host.get(host)
+            if compute_service is None:
+                continue
+
+            hv_info = get_hv_info(host, all_hypervisors)
+            hv_info["gpu_capacity"] = gpu_capacity
+            hv_info["compute_service_status"] = compute_service["status"]
+
+            for flavor_name, flavor_reqs in valid_flavors:
+                slots_dict[flavor_name] += calculate_slots_for_flavor_for_hv(
+                    flavor_name,
+                    flavor_reqs,
+                    hv_info,
+                )
+
+    return convert_to_data_string(slots_dict, instance)
+
+
+def main(influxdb_args: Dict):
+    """
+    send slottifier info to influx
+    :param influxdb_args: args to connect to influxdb and openstack to scrape info from
+    """
+    post_to_influxdb(
+        get_slottifier_details(influxdb_args["cloud.instance"]),
+        host=influxdb_args["db.host"],
+        db_name=influxdb_args["db.database"],
+        auth=(influxdb_args["auth.username"], influxdb_args["auth.password"])
+    )
+    for missing_flavor in UNKNOWN_GPU_NUM_FLAVORS:
+        print(
+            f"{missing_flavor} missing metadata property 'extra_specs:accounting:gpu_num' - "
+            "do not know how many GPUs the flavor requires, assuming 1 gpu required"
+        )
+
+
+if __name__ == '__main__':
+    main(parse_args(sys.argv[1:], description="Get Slots Available For All Flavors"))
diff --git a/MonitoringTools/usr/local/bin/slottifier_entry.py b/MonitoringTools/usr/local/bin/slottifier_entry.py
new file mode 100644
index 00000000..ac65099d
--- /dev/null
+++ b/MonitoringTools/usr/local/bin/slottifier_entry.py
@@ -0,0 +1,38 @@
+from dataclasses import dataclass, fields
+
+
+@dataclass
+class SlottifierEntry:
+    """
+    A dataclass to hold slottifier information
+    :param slots_available: Number of slots available for a flavor
+    :param estimated_gpu_slots_used: Number of gpu slots currently used that could host this flavor
+        - estimated by amount of cores/mem already used by hvs as there's no way in openstack to find this out directly
+    :param max_gpu_slots_capacity: Number of gpus available on all compatible hypervisors to build this flavor on
+    :param max_gpu_slots_capacity_enabled: like max_gpu_slots_capacity, but only counting hosts with nova-compute
+        service enabled
+    """
+
+    slots_available: int = 0
+    estimated_gpu_slots_used: int = 0
+    max_gpu_slots_capacity: int = 0
+    max_gpu_slots_capacity_enabled: int = 0
+
+    def __add__(self, other):
+        """
+        dunder method to add two SlottifierEntry values together.
+        :param other: Another SlottifierEntry dataclass to add
+        :return: A SlottifierEntry dataclass where each attribute value from current dataclass and given dataclass are
+            added together. NotImplemented when other is not a SlottifierEntry - python then tries the reflected
+            operation on other and raises TypeError if that is not supported either
+        """
+        if not isinstance(other, SlottifierEntry):
+            return NotImplemented
+
+        # add field by field, by name, rather than relying on the order of the instance __dict__
+        return SlottifierEntry(
+            **{
+                field.name: getattr(self, field.name) + getattr(other, field.name)
+                for field in fields(self)
+            }
+        )