diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f736082656..ef28b979e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -135,3 +135,12 @@ Doc: update Developer.md We enabled [stale bot](https://github.com/probot/stale) for house keeping. An Issue or Pull Request becomes stale if no any inactivity for 60 days. + +## For Mac and Windows users + +Kepler currently focuses on the Linux platform. +To make it easy for anyone to contribute from any platform, we welcome +contributions (PRs) for other platforms, including changes that only make +parts of Kepler compilable on those platforms. +Until a specific platform is supported, we run CI only on Linux, and +passing it is the standard for merging PRs and for official support. diff --git a/cmd/exporter/exporter.go b/cmd/exporter/exporter.go index 35efa6da23..60534967e3 100644 --- a/cmd/exporter/exporter.go +++ b/cmd/exporter/exporter.go @@ -152,7 +152,7 @@ func main() { platform.InitPowerImpl() defer platform.StopPower() - if config.EnabledGPU() { + if config.IsGPUEnabled() { r := accelerator.GetRegistry() if a, err := accelerator.New(config.GPU, true); err == nil { r.MustRegister(a) // Register the accelerator with the registry diff --git a/pkg/bpf/exporter.go b/pkg/bpf/exporter.go index 5967e20f9a..d736c9eb20 100644 --- a/pkg/bpf/exporter.go +++ b/pkg/bpf/exporter.go @@ -1,3 +1,6 @@ +//go:build !darwin +// +build !darwin + /* Copyright 2021. diff --git a/pkg/bpf/fake_mac.go b/pkg/bpf/fake_mac.go new file mode 100644 index 0000000000..cdcb319f7e --- /dev/null +++ b/pkg/bpf/fake_mac.go @@ -0,0 +1,293 @@ +//go:build darwin +// +build darwin + +/* +Copyright 2021. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package bpf + +import ( + "errors" + "fmt" + "os" + "runtime" + "time" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/link" + "github.com/cilium/ebpf/rlimit" + "github.com/jaypipes/ghw" + "github.com/sustainable-computing-io/kepler/pkg/config" + "golang.org/x/sys/unix" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog/v2" +) + +type exporter struct { + bpfObjects keplerObjects + + schedSwitchLink link.Link + irqLink link.Link + pageWriteLink link.Link + pageReadLink link.Link + + perfEvents *hardwarePerfEvents + + enabledHardwareCounters sets.Set[string] + enabledSoftwareCounters sets.Set[string] +} + +func NewExporter() (Exporter, error) { + e := &exporter{ + enabledHardwareCounters: sets.New[string](config.BPFHwCounters()...), + enabledSoftwareCounters: sets.New[string](config.BPFSwCounters()...), + } + err := e.attach() + if err != nil { + e.Detach() + } + return e, err +} + +func (e *exporter) SupportedMetrics() SupportedMetrics { + return SupportedMetrics{ + HardwareCounters: e.enabledHardwareCounters.Clone(), + SoftwareCounters: e.enabledSoftwareCounters.Clone(), + } +} + +func (e *exporter) attach() error { + // Remove resource limits for kernels <5.11. 
+ if err := rlimit.RemoveMemlock(); err != nil { + return fmt.Errorf("error removing memlock: %v", err) + } + + // Load eBPF Specs + specs, err := loadKepler() + if err != nil { + return fmt.Errorf("error loading eBPF specs: %v", err) + } + + // Adjust map sizes to the number of available CPUs + numCPU := getCPUCores() + klog.Infof("Number of CPUs: %d", numCPU) + for _, m := range specs.Maps { + // Only resize maps that have a MaxEntries of NUM_CPUS constant + if m.MaxEntries == 128 { + m.MaxEntries = uint32(numCPU) + } + } + + // Set program global variables + err = specs.RewriteConstants(map[string]interface{}{ + "SAMPLE_RATE": int32(config.GetBPFSampleRate()), + }) + if err != nil { + return fmt.Errorf("error rewriting program constants: %v", err) + } + + // Load the eBPF program(s) + if err := specs.LoadAndAssign(&e.bpfObjects, nil); err != nil { + return fmt.Errorf("error loading eBPF objects: %v", err) + } + + // Attach the eBPF program(s) + e.schedSwitchLink, err = link.AttachTracing(link.TracingOptions{ + Program: e.bpfObjects.KeplerSchedSwitchTrace, + AttachType: ebpf.AttachTraceRawTp, + }) + if err != nil { + return fmt.Errorf("error attaching sched_switch tracepoint: %v", err) + } + + if config.ExposeIRQCounterMetrics() { + e.irqLink, err = link.AttachTracing(link.TracingOptions{ + Program: e.bpfObjects.KeplerIrqTrace, + AttachType: ebpf.AttachTraceRawTp, + }) + if err != nil { + return fmt.Errorf("could not attach irq/softirq_entry: %w", err) + } + } + + group := "writeback" + name := "writeback_dirty_page" + if _, err := os.Stat("/sys/kernel/debug/tracing/events/writeback/writeback_dirty_folio"); err == nil { + name = "writeback_dirty_folio" + } + e.pageWriteLink, err = link.Tracepoint(group, name, e.bpfObjects.KeplerWritePageTrace, nil) + if err != nil { + klog.Warningf("failed to attach tp/%s/%s: %v. Kepler will not collect page cache write events. 
This will affect the DRAM power model estimation on VMs.", group, name, err) + } else { + e.enabledSoftwareCounters[config.PageCacheHit] = struct{}{} + } + + e.pageReadLink, err = link.AttachTracing(link.TracingOptions{ + Program: e.bpfObjects.KeplerReadPageTrace, + AttachType: ebpf.AttachTraceFEntry, + }) + if err != nil { + klog.Warningf("failed to attach fentry/mark_page_accessed: %v. Kepler will not collect page cache read events. This will affect the DRAM power model estimation on VMs.", err) + } + + // Return early if hardware counters are not enabled + if !config.ExposeHardwareCounterMetrics() { + klog.Infof("Hardware counter metrics are disabled") + return nil + } + + e.perfEvents, err = createHardwarePerfEvents( + e.bpfObjects.CpuInstructionsEventReader, + e.bpfObjects.CpuCyclesEventReader, + e.bpfObjects.CacheMissEventReader, + numCPU, + ) + if err != nil { + return nil + } + + return nil +} + +func (e *exporter) Detach() { + // Links + if e.schedSwitchLink != nil { + e.schedSwitchLink.Close() + e.schedSwitchLink = nil + } + + if e.irqLink != nil { + e.irqLink.Close() + e.irqLink = nil + } + + if e.pageWriteLink != nil { + e.pageWriteLink.Close() + e.pageWriteLink = nil + } + + if e.pageReadLink != nil { + e.pageReadLink.Close() + e.pageReadLink = nil + } + + if e.perfEvents != nil { // Perf events; nil when hardware counters are off or attach failed early + e.perfEvents.close() + e.perfEvents = nil + } + // Objects + e.bpfObjects.Close() +} + +func (e *exporter) CollectProcesses() ([]ProcessMetrics, error) { + start := time.Now() + // Get the max number of entries in the map + maxEntries := e.bpfObjects.Processes.MaxEntries() + total := 0 + deleteKeys := make([]uint32, maxEntries) + deleteValues := make([]ProcessMetrics, maxEntries) + var cursor ebpf.MapBatchCursor + for { + count, err := e.bpfObjects.Processes.BatchLookupAndDelete( + &cursor, + deleteKeys, + deleteValues, + &ebpf.BatchOptions{}, + ) + total += count + if errors.Is(err, ebpf.ErrKeyNotExist) { + break + } + if err != nil { + return nil, fmt.Errorf("failed to batch lookup 
and delete: %v", err) + } + } + klog.V(5).Infof("collected %d process samples in %v", total, time.Since(start)) + return deleteValues[:total], nil +} + +/////////////////////////////////////////////////////////////////////////// +// utility functions + +func unixOpenPerfEvent(typ, conf, cpuCores int) ([]int, error) { + return []int{}, nil +} + +func unixClosePerfEvents(fds []int) { + for _, fd := range fds { + _ = unix.SetNonblock(fd, true) + unix.Close(fd) + } +} + +func getCPUCores() int { + cores := runtime.NumCPU() + if cpu, err := ghw.CPU(); err == nil { + // we need to get the number of all CPUs, + // so if /proc/cpuinfo is available, we can get the number of all CPUs + cores = int(cpu.TotalThreads) + } + return cores +} + +type hardwarePerfEvents struct { + cpuCyclesPerfEvents []int + cpuInstructionsPerfEvents []int + cacheMissPerfEvents []int +} + +func (h *hardwarePerfEvents) close() { + unixClosePerfEvents(h.cpuCyclesPerfEvents) + unixClosePerfEvents(h.cpuInstructionsPerfEvents) + unixClosePerfEvents(h.cacheMissPerfEvents) +} + +// CreateHardwarePerfEvents creates perf events for CPU cycles, CPU instructions, and cache misses +// and updates the corresponding eBPF maps. 
+func createHardwarePerfEvents(cpuInstructionsMap, cpuCyclesMap, cacheMissMap *ebpf.Map, numCPU int) (*hardwarePerfEvents, error) { + var err error + events := &hardwarePerfEvents{ + cpuCyclesPerfEvents: []int{}, + cpuInstructionsPerfEvents: []int{}, + cacheMissPerfEvents: []int{}, + } + defer func() { + if err != nil { + unixClosePerfEvents(events.cpuCyclesPerfEvents) + unixClosePerfEvents(events.cpuInstructionsPerfEvents) + unixClosePerfEvents(events.cacheMissPerfEvents) + } + }() + for i, fd := range events.cpuCyclesPerfEvents { + if err = cpuCyclesMap.Update(uint32(i), uint32(fd), ebpf.UpdateAny); err != nil { + klog.Warningf("Failed to update cpu_cycles_event_reader map: %v", err) + return nil, err + } + } + for i, fd := range events.cpuInstructionsPerfEvents { + if err = cpuInstructionsMap.Update(uint32(i), uint32(fd), ebpf.UpdateAny); err != nil { + klog.Warningf("Failed to update cpu_instructions_event_reader map: %v", err) + return nil, err + } + } + for i, fd := range events.cacheMissPerfEvents { + if err = cacheMissMap.Update(uint32(i), uint32(fd), ebpf.UpdateAny); err != nil { + klog.Warningf("Failed to update cache_miss_event_reader map: %v", err) + return nil, err + } + } + return events, nil +} diff --git a/pkg/bpftest/bpf_suite_test.go b/pkg/bpftest/bpf_suite_test.go index 74b911431f..5af8619839 100644 --- a/pkg/bpftest/bpf_suite_test.go +++ b/pkg/bpftest/bpf_suite_test.go @@ -1,3 +1,6 @@ +//go:build !darwin +// +build !darwin + package bpftest import ( diff --git a/pkg/collector/energy/node_energy_collector.go b/pkg/collector/energy/node_energy_collector.go index 763dc6a6bb..dc7fc67f94 100644 --- a/pkg/collector/energy/node_energy_collector.go +++ b/pkg/collector/energy/node_energy_collector.go @@ -66,7 +66,7 @@ func UpdateNodeComponentsEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) // UpdateNodeGPUEnergy updates each GPU power consumption. 
Right now we don't support other types of accelerators func UpdateNodeGPUEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) { defer wg.Done() - if config.EnabledGPU() { + if config.IsGPUEnabled() { if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { gpuEnergy := gpu.Device().AbsEnergyFromDevice() for gpu, energy := range gpuEnergy { diff --git a/pkg/collector/metric_collector.go b/pkg/collector/metric_collector.go index cc704a2699..9952c200ee 100644 --- a/pkg/collector/metric_collector.go +++ b/pkg/collector/metric_collector.go @@ -158,7 +158,7 @@ func (c *Collector) updateProcessResourceUtilizationMetrics(wg *sync.WaitGroup) // update process metrics regarding the resource utilization to be used to calculate the energy consumption // we first updates the bpf which is responsible to include new processes in the ProcessStats collection resourceBpf.UpdateProcessBPFMetrics(c.bpfExporter, c.ProcessStats) - if config.EnabledGPU() { + if config.IsGPUEnabled() { if acc.GetActiveAcceleratorByType(config.GPU) != nil { accelerator.UpdateProcessGPUUtilizationMetrics(c.ProcessStats) } diff --git a/pkg/collector/resourceutilization/bpf/fake_mac.go b/pkg/collector/resourceutilization/bpf/fake_mac.go new file mode 100644 index 0000000000..a586db62fa --- /dev/null +++ b/pkg/collector/resourceutilization/bpf/fake_mac.go @@ -0,0 +1,13 @@ +//go:build darwin +// +build darwin + +package bpf + +import ( + "github.com/sustainable-computing-io/kepler/pkg/bpf" + "github.com/sustainable-computing-io/kepler/pkg/collector/stats" +) + +func UpdateProcessBPFMetrics(bpfExporter bpf.Exporter, processStats map[uint64]*stats.ProcessStats) { + +} diff --git a/pkg/collector/resourceutilization/bpf/process_bpf_collector.go b/pkg/collector/resourceutilization/bpf/process_bpf_collector.go index de66821fed..e75079b944 100644 --- a/pkg/collector/resourceutilization/bpf/process_bpf_collector.go +++ b/pkg/collector/resourceutilization/bpf/process_bpf_collector.go @@ -1,3 +1,6 @@ 
+//go:build !darwin +// +build !darwin + /* Copyright 2021. diff --git a/pkg/collector/resourceutilization/bpf/process_bpf_collector_test.go b/pkg/collector/resourceutilization/bpf/process_bpf_collector_test.go index a98430e9a6..05f23738d1 100644 --- a/pkg/collector/resourceutilization/bpf/process_bpf_collector_test.go +++ b/pkg/collector/resourceutilization/bpf/process_bpf_collector_test.go @@ -1,3 +1,6 @@ +//go:build !darwin +// +build !darwin + package bpf import ( diff --git a/pkg/collector/stats/node_stats.go b/pkg/collector/stats/node_stats.go index 7ab80fe5ba..e5a2455d82 100644 --- a/pkg/collector/stats/node_stats.go +++ b/pkg/collector/stats/node_stats.go @@ -50,7 +50,7 @@ func (ne *NodeStats) ResetDeltaValues() { func (ne *NodeStats) UpdateIdleEnergyWithMinValue(isComponentsSystemCollectionSupported bool) { // gpu metric - if config.EnabledGPU() { + if config.IsGPUEnabled() { if acc.GetActiveAcceleratorByType(config.GPU) != nil { ne.CalcIdleEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.GPUComputeUtilization) } diff --git a/pkg/collector/stats/stats.go b/pkg/collector/stats/stats.go index 2217345803..ca9421d846 100644 --- a/pkg/collector/stats/stats.go +++ b/pkg/collector/stats/stats.go @@ -81,7 +81,7 @@ func NewStats() *Stats { stats.ResourceUsage[metricName] = types.NewUInt64StatCollection() } - if config.EnabledGPU() { + if config.IsGPUEnabled() { if acc.GetActiveAcceleratorByType(config.GPU) != nil { stats.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection() stats.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection() @@ -140,7 +140,7 @@ func (s *Stats) UpdateDynEnergy() { s.CalcDynEnergy(config.AbsEnergyInPlatform, config.IdleEnergyInPlatform, config.DynEnergyInPlatform, sensorID) } // GPU metric - if config.EnabledGPU() { + if config.IsGPUEnabled() { if acc.GetActiveAcceleratorByType(config.GPU) != nil { for gpuID := range s.EnergyUsage[config.AbsEnergyInGPU] { 
s.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID) diff --git a/pkg/collector/stats/utils.go b/pkg/collector/stats/utils.go index 5c9158e8da..ceea5cef26 100644 --- a/pkg/collector/stats/utils.go +++ b/pkg/collector/stats/utils.go @@ -30,7 +30,7 @@ func GetProcessFeatureNames() []string { klog.V(3).Infof("Available ebpf counters: %v", metrics) // gpu metric - if config.EnabledGPU() { + if config.IsGPUEnabled() { if acc.GetActiveAcceleratorByType(config.GPU) != nil { gpuMetrics := []string{config.GPUComputeUtilization, config.GPUMemUtilization} metrics = append(metrics, gpuMetrics...) diff --git a/pkg/config/config.go b/pkg/config/config.go index 3785de6279..0f1b653791 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -319,7 +319,7 @@ func SetEnabledEBPFCgroupID(enabled bool) { // SetEnabledHardwareCounterMetrics enables the exposure of hardware counter metrics func SetEnabledHardwareCounterMetrics(enabled bool) { // set to false is any config source set it to false - instance.Kepler.ExposeHardwareCounterMetrics = enabled && instance.Kepler.ExposeHardwareCounterMetrics + instance.Kepler.ExposeHardwareCounterMetrics = enabled } // SetEnabledIdlePower allows enabling idle power exposure in Kepler's metrics. When direct power metrics access is available, @@ -331,7 +331,7 @@ func SetEnabledHardwareCounterMetrics(enabled bool) { // Know the number of running VMs becomes crucial for achieving a fair distribution of idle power, particularly when following the GHG (Greenhouse Gas) protocol. func SetEnabledIdlePower(enabled bool) { // set to true is any config source set it to true or if system power metrics are available - instance.Kepler.ExposeIdlePowerMetrics = enabled || instance.Kepler.ExposeIdlePowerMetrics + instance.Kepler.ExposeIdlePowerMetrics = enabled if instance.Kepler.ExposeIdlePowerMetrics { klog.Infoln("The Idle power will be exposed. 
Are you running on Baremetal or using single VM per node?") } @@ -339,8 +339,7 @@ func SetEnabledIdlePower(enabled bool) { // SetEnabledGPU enables the exposure of gpu metrics func SetEnabledGPU(enabled bool) { - // set to true if any config source set it to true - instance.Kepler.EnabledGPU = enabled || instance.Kepler.EnabledGPU + instance.Kepler.EnabledGPU = enabled } func SetModelServerEnable(enabled bool) { @@ -349,8 +348,7 @@ func SetModelServerEnable(enabled bool) { // SetEnabledMSR enables the exposure of MSR metrics func SetEnabledMSR(enabled bool) { - // set to true if any config source set it to true - instance.Kepler.EnabledMSR = enabled || instance.Kepler.EnabledMSR + instance.Kepler.EnabledMSR = enabled } // SetKubeConfig set kubeconfig file @@ -454,15 +452,6 @@ func isCGroupV2(c Client) bool { return !os.IsNotExist(err) } -// Get cgroup version, return 1 or 2 -func GetCGroupVersion() int { - if isCGroupV2(&realSystem{}) { - return 2 - } else { - return 1 - } -} - // InitModelConfigMap initializes map of config from MODEL_CONFIG func InitModelConfigMap() { if instance.Model.ModelConfigValues == nil { @@ -568,7 +557,7 @@ func ExposeHardwareCounterMetrics() bool { return instance.Kepler.ExposeHardwareCounterMetrics } -func EnabledGPU() bool { +func IsGPUEnabled() bool { return instance.Kepler.EnabledGPU } @@ -630,7 +619,7 @@ func ProcessComponentsPowerKey() string { return instance.Model.ProcessComponentsPowerKey } -func APIServerEnabled() bool { +func IsAPIServerEnabled() bool { return instance.Kepler.EnableAPIServer } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 49d6b295a2..1a44dcedb5 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -137,4 +137,60 @@ var _ = Describe("Test Configuration", func() { err = os.Remove(tmpPath) Expect(err).To(BeNil()) }) + It("test init by default", func() { + Config, err := Initialize(".") + Expect(err).NotTo(HaveOccurred()) + Expect(Config.Kepler).NotTo(BeNil()) + 
Expect(Config.KernelVersion).To(Equal(float32(0))) + Expect(IsExposeProcessStatsEnabled()).To(BeFalse()) + Expect(IsExposeContainerStatsEnabled()).To(BeTrue()) + Expect(IsExposeVMStatsEnabled()).To(BeTrue()) + Expect(IsExposeBPFMetricsEnabled()).To(BeTrue()) + Expect(IsExposeComponentPowerEnabled()).To(BeTrue()) + Expect(ExposeIRQCounterMetrics()).To(BeTrue()) + Expect(GetBPFSampleRate()).To(Equal(0)) + + }) + It("test init by set func and Is Enable functions", func() { + Config, err := Initialize(".") + Expect(err).NotTo(HaveOccurred()) + // test set and is enable functions. + SetEnabledGPU(true) + Expect(Config.Kepler.EnabledGPU).To(BeTrue()) + Expect(IsGPUEnabled()).To(BeTrue()) + SetEnabledGPU(false) + Expect(Config.Kepler.EnabledGPU).To(BeFalse()) + Expect(IsGPUEnabled()).To(BeFalse()) + + SetEnabledMSR(true) + Expect(Config.Kepler.EnabledMSR).To(BeTrue()) + Expect(IsEnabledMSR()).To(BeTrue()) + SetEnabledMSR(false) + Expect(Config.Kepler.EnabledMSR).To(BeFalse()) + Expect(IsEnabledMSR()).To(BeFalse()) + + SetEnableAPIServer(true) + Expect(Config.Kepler.EnableAPIServer).To(BeTrue()) + Expect(IsAPIServerEnabled()).To(BeTrue()) + SetEnableAPIServer(false) + Expect(Config.Kepler.EnableAPIServer).To(BeFalse()) + Expect(IsAPIServerEnabled()).To(BeFalse()) + + SetMachineSpecFilePath("dummy") + Expect(Config.Kepler.MachineSpecFilePath).To(Equal("dummy")) + + SetEnabledIdlePower(true) + Expect(Config.Kepler.ExposeIdlePowerMetrics).To(BeTrue()) + Expect(IsIdlePowerEnabled()).To(BeTrue()) + SetEnabledIdlePower(false) + Expect(Config.Kepler.ExposeIdlePowerMetrics).To(BeFalse()) + Expect(IsIdlePowerEnabled()).To(BeFalse()) + + SetEnabledHardwareCounterMetrics(true) + Expect(Config.Kepler.ExposeHardwareCounterMetrics).To(BeTrue()) + Expect(ExposeHardwareCounterMetrics()).To(BeTrue()) + SetEnabledHardwareCounterMetrics(false) + Expect(Config.Kepler.ExposeHardwareCounterMetrics).To(BeFalse()) + Expect(ExposeHardwareCounterMetrics()).To(BeFalse()) + }) }) diff --git 
a/pkg/kubernetes/watcher.go b/pkg/kubernetes/watcher.go index a8c9a6b769..3b302b0da9 100644 --- a/pkg/kubernetes/watcher.go +++ b/pkg/kubernetes/watcher.go @@ -106,7 +106,7 @@ func NewObjListWatcher(bpfSupportedMetrics bpf.SupportedMetrics) (*ObjListWatche bpfSupportedMetrics: bpfSupportedMetrics, workqueue: workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()), } - if w.k8sCli == nil || !config.APIServerEnabled() { + if w.k8sCli == nil || !config.IsAPIServerEnabled() { return w, nil } optionsModifier := func(options *metav1.ListOptions) { diff --git a/pkg/metrics/metricfactory/metric_factory.go b/pkg/metrics/metricfactory/metric_factory.go index 9f76994e96..8ba1831382 100644 --- a/pkg/metrics/metricfactory/metric_factory.go +++ b/pkg/metrics/metricfactory/metric_factory.go @@ -86,7 +86,7 @@ func SCMetricsPromDesc(context string, bpfSupportedMetrics bpf.SupportedMetrics) func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) { descriptions = make(map[string]*prometheus.Desc) - if config.EnabledGPU() { + if config.IsGPUEnabled() { if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { for _, name := range consts.GPUMetricNames { descriptions[name] = resMetricsPromDesc(context, name, gpu.Device().Name()) diff --git a/pkg/metrics/utils/utils.go b/pkg/metrics/utils/utils.go index ee52a4f03a..6dcf522548 100644 --- a/pkg/metrics/utils/utils.go +++ b/pkg/metrics/utils/utils.go @@ -36,7 +36,7 @@ func CollectEnergyMetrics(ch chan<- prometheus.Metric, instance interface{}, col if config.IsExposeComponentPowerEnabled() { // collect the dynamic energy metrics for i, collectorName := range consts.EnergyMetricNames { - if collectorName == config.GPU && !config.EnabledGPU() { + if collectorName == config.GPU && !config.IsGPUEnabled() { continue } collectEnergy(ch, instance, consts.DynEnergyMetricNames[i], "dynamic", collectors[collectorName]) @@ -57,7 +57,7 @@ func CollectResUtilizationMetrics(ch chan<- 
prometheus.Metric, instance interfac for collectorName := range bpfSupportedMetrics.HardwareCounters { CollectResUtil(ch, instance, collectorName, collectors[collectorName]) } - if config.EnabledGPU() { + if config.IsGPUEnabled() { if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { for _, collectorName := range consts.GPUMetricNames { CollectResUtil(ch, instance, collectorName, collectors[collectorName]) diff --git a/pkg/model/process_energy.go b/pkg/model/process_energy.go index 19857919af..7231a608f4 100644 --- a/pkg/model/process_energy.go +++ b/pkg/model/process_energy.go @@ -200,7 +200,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta klog.V(5).Infoln("Could not estimate the Process Components Power") } // estimate the associated power consumption of GPU for each process - if config.EnabledGPU() { + if config.IsGPUEnabled() { if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil { processGPUPower, errGPU = processComponentPowerModel.GetGPUPower(isIdlePower) if errGPU != nil { diff --git a/pkg/sensors/accelerator/devices/dcgm.go b/pkg/sensors/accelerator/devices/dcgm.go index 95ca57e06d..02abf238e7 100644 --- a/pkg/sensors/accelerator/devices/dcgm.go +++ b/pkg/sensors/accelerator/devices/dcgm.go @@ -1,3 +1,6 @@ +//go:build !darwin +// +build !darwin + /* Copyright 2024. 
diff --git a/pkg/sensors/accelerator/devices/fackmac.go b/pkg/sensors/accelerator/devices/fackmac.go new file mode 100644 index 0000000000..0762de2d53 --- /dev/null +++ b/pkg/sensors/accelerator/devices/fackmac.go @@ -0,0 +1,12 @@ +//go:build darwin +// +build darwin + +package devices + +func dcgmCheck(*Registry) { + +} + +func nvmlCheck(*Registry) { + +} diff --git a/pkg/sensors/accelerator/devices/nvml.go b/pkg/sensors/accelerator/devices/nvml.go index f9bf80d3ae..63157cacb6 100644 --- a/pkg/sensors/accelerator/devices/nvml.go +++ b/pkg/sensors/accelerator/devices/nvml.go @@ -1,3 +1,6 @@ +//go:build !darwin +// +build !darwin + /* Copyright 2021-2024 diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 751e22f758..076268c1e6 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -1,3 +1,6 @@ +//go:build !darwin +// +build !darwin + /* Copyright 2021. diff --git a/pkg/utils/utils_darwin.go b/pkg/utils/utils_darwin.go new file mode 100644 index 0000000000..c420c39efa --- /dev/null +++ b/pkg/utils/utils_darwin.go @@ -0,0 +1,98 @@ +//go:build darwin +// +build darwin + +/* +Copyright 2021. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package utils + +import ( + "bufio" + "encoding/binary" + "fmt" + "os" + "strings" + "unsafe" +) + +func CreateTempFile(contents string) (filename string, reterr error) { + f, err := os.CreateTemp("", "") + if err != nil { + return "", err + } + defer func() { + if cerr := f.Close(); cerr != nil && reterr == nil { + filename, reterr = "", cerr + } + }() + _, err = f.WriteString(contents) + if err != nil { + return "", err + } + return f.Name(), nil +} + +func CreateTempDir() (dir string, err error) { + return os.MkdirTemp("", "") +} + +func DetermineHostByteOrder() binary.ByteOrder { + var i int32 = 0x01020304 + u := unsafe.Pointer(&i) + pb := (*byte)(u) + b := *pb + if b == 0x04 { + return binary.LittleEndian + } + + return binary.BigEndian +} + +const ( + KernelProcessName string = "kernel_processes" + KernelProcessNamespace string = "kernel" + SystemProcessName string = "system_processes" + SystemProcessNamespace string = "system" + EmptyString string = "" + GenericSocketID string = "socket0" + GenericGPUID string = "gpu" +) + +func GetPathFromPID(searchPath string, pid uint64) (string, error) { + path := fmt.Sprintf(searchPath, pid) + file, err := os.Open(path) + if err != nil { + return "", fmt.Errorf("failed to open cgroup description file for pid %d: %v", pid, err) + } + defer file.Close() + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + if strings.Contains(line, "pod") || strings.Contains(line, "containerd") || strings.Contains(line, "crio") { + return line, nil + } + } + return "", fmt.Errorf("could not find cgroup description entry for pid %d", pid) +} + +func GetCgroupIDFromPath(byteOrder binary.ByteOrder, path string) (uint64, error) { + return uint64(0), nil +} + +func IsFileExists(path string) bool { + _, err := os.Stat(path) + return !os.IsNotExist(err) +} diff --git a/pkg/version/version_suite_test.go b/pkg/version/version_suite_test.go deleted file mode 100644 index d0a237e1fc..0000000000 --- a/pkg/version/version_suite_test.go +++ /dev/null @@ 
-1,13 +0,0 @@ -package version - -import ( - "testing" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" -) - -func TestVersion(t *testing.T) { - RegisterFailHandler(Fail) - RunSpecs(t, "Version Suite") -}