Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: gpu cleanup #1704

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,8 @@ CTR_CMD_PUSH_OPTIONS ?=

GENERAL_TAGS := include_gcs include_oss containers_image_openpgp gssapi providerless netgo osusergo
GPU_TAGS :=
ifeq ($(shell ldconfig -p | grep -q libnvml_injection.so && echo exists),exists)
GPU_TAGS := nvml
endif
ifeq ($(shell ldconfig -p | grep -q libdcgm.so && echo exists),exists)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rootfs is it ok to remove the dcgm library check?

Copy link
Collaborator Author

@maryamtahhan maryamtahhan Sep 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This removes the `GPU_TAGS := dcgm` assignment, meaning the dcgm and NVML GPU code will be built into Kepler by default. The device plugins/GPU code checks at runtime for the libraries and doesn't run if the library doesn't exist.

The only reason I kept the HABANA tag is that there are issues building habana if the library doesn't exist on the system, because of their cgo dependency.

GPU_TAGS := dcgm
endif
ifeq ($(shell ldconfig -p | grep -q libhlml.so && echo exists),exists)
GPU_TAGS := habana
GPU_TAGS := habana
endif

# set GOENV
Expand Down
2 changes: 1 addition & 1 deletion cmd/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func main() {

if config.EnabledGPU() {
r := accelerator.GetRegistry()
if a, err := accelerator.New(accelerator.GPU, true); err == nil {
if a, err := accelerator.New(config.GPU, true); err == nil {
r.MustRegister(a) // Register the accelerator with the registry
} else {
klog.Errorf("failed to init GPU accelerators: %v", err)
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/energy/node_energy_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func UpdateNodeComponentsEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup)
func UpdateNodeGPUEnergy(nodeStats *stats.NodeStats, wg *sync.WaitGroup) {
defer wg.Done()
if config.EnabledGPU() {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
gpuEnergy := gpu.Device().AbsEnergyFromDevice()
for gpu, energy := range gpuEnergy {
nodeStats.EnergyUsage[config.AbsEnergyInGPU].SetDeltaStat(fmt.Sprintf("%d", gpu), uint64(energy))
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/metric_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ func (c *Collector) updateProcessResourceUtilizationMetrics(wg *sync.WaitGroup)
// we first update the bpf, which is responsible for including new processes in the ProcessStats collection
resourceBpf.UpdateProcessBPFMetrics(c.bpfExporter, c.ProcessStats)
if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
accelerator.UpdateProcessGPUUtilizationMetrics(c.ProcessStats, c.bpfSupportedMetrics)
}
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/metric_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (
)

func newMockCollector(mockAttacher bpf.Exporter) *Collector {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
d := gpu.Device()
err := d.Init() // create structure instances that will be accessed to create a containerMetric
Expect(err).NotTo(HaveOccurred())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import (
"github.com/sustainable-computing-io/kepler/pkg/config"
"github.com/sustainable-computing-io/kepler/pkg/libvirt"
acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator"
dev "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/device"
dev "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator/devices"
"k8s.io/klog/v2"

"github.com/sustainable-computing-io/kepler/pkg/utils"
Expand All @@ -44,7 +44,7 @@ var (

// UpdateProcessGPUUtilizationMetrics reads the GPU metrics of each process using the GPU
func UpdateProcessGPUUtilizationMetrics(processStats map[uint64]*stats.ProcessStats, bpfSupportedMetrics bpf.SupportedMetrics) {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
d := gpu.Device()
migDevices := d.DeviceInstances()
for _, _device := range d.DevicesByID() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/node_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func (ne *NodeStats) ResetDeltaValues() {
func (ne *NodeStats) UpdateIdleEnergyWithMinValue(isComponentsSystemCollectionSupported bool) {
// gpu metric
if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
ne.CalcIdleEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.GPUComputeUtilization)
}
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/collector/stats/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ func NewStats(bpfSupportedMetrics bpf.SupportedMetrics) *Stats {
}

if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
stats.ResourceUsage[config.GPUComputeUtilization] = types.NewUInt64StatCollection()
stats.ResourceUsage[config.GPUMemUtilization] = types.NewUInt64StatCollection()
stats.ResourceUsage[config.IdleEnergyInGPU] = types.NewUInt64StatCollection()
Expand Down Expand Up @@ -147,7 +147,7 @@ func (s *Stats) UpdateDynEnergy() {
}
// GPU metric
if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
for gpuID := range s.EnergyUsage[config.AbsEnergyInGPU] {
s.CalcDynEnergy(config.AbsEnergyInGPU, config.IdleEnergyInGPU, config.DynEnergyInGPU, gpuID)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/test_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ const (
// TODO: do not use a fixed usageMetric array in the power models; structured data is more desirable.
func SetMockedCollectorMetrics() {
config.GetConfig()
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
err := gpu.Device().Init() // create structure instances that will be accessed to create a processMetric
klog.Fatalln(err)
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/collector/stats/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func GetProcessFeatureNames(bpfSupportedMetrics bpf.SupportedMetrics) []string {

// gpu metric
if config.EnabledGPU() {
if acc.GetRegistry().ActiveAcceleratorByType(acc.GPU) != nil {
if acc.GetActiveAcceleratorByType(config.GPU) != nil {
gpuMetrics := []string{config.GPUComputeUtilization, config.GPUMemUtilization}
metrics = append(metrics, gpuMetrics...)
klog.V(3).Infof("Available GPU metrics: %v", gpuMetrics)
Expand Down
4 changes: 2 additions & 2 deletions pkg/metrics/metricfactory/metric_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ func EnergyMetricsPromDesc(context string) (descriptions map[string]*prometheus.
// set the default source to trained power model
source := modeltypes.TrainedPowerModelSource
if strings.Contains(name, config.GPU) {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
source = gpu.Device().Name()
}
} else if strings.Contains(name, config.PLATFORM) && platform.IsSystemCollectionSupported() {
Expand Down Expand Up @@ -87,7 +87,7 @@ func SCMetricsPromDesc(context string, bpfSupportedMetrics bpf.SupportedMetrics)
func GPUUsageMetricsPromDesc(context string) (descriptions map[string]*prometheus.Desc) {
descriptions = make(map[string]*prometheus.Desc)
if config.EnabledGPU() {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
for _, name := range consts.GPUMetricNames {
descriptions[name] = resMetricsPromDesc(context, name, gpu.Device().Name())
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/metrics/prometheus_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/sustainable-computing-io/kepler/pkg/bpf"
"github.com/sustainable-computing-io/kepler/pkg/collector"
"github.com/sustainable-computing-io/kepler/pkg/collector/stats"
"github.com/sustainable-computing-io/kepler/pkg/config"
"github.com/sustainable-computing-io/kepler/pkg/model"

acc "github.com/sustainable-computing-io/kepler/pkg/sensors/accelerator"
Expand Down Expand Up @@ -63,7 +64,7 @@ var _ = Describe("Test Prometheus Collector Unit", func() {
// we need to disable the system real time power metrics for testing since we add mock values or use power model estimator
components.SetIsSystemCollectionSupported(false)
platform.SetIsSystemCollectionSupported(false)
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
err := gpu.Device().Init() // create structure instances that will be accessed to create a containerMetric
Expect(err).NotTo(HaveOccurred())
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/metrics/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func CollectResUtilizationMetrics(ch chan<- prometheus.Metric, instance interfac
CollectResUtil(ch, instance, collectorName, collectors[collectorName])
}
if config.EnabledGPU() {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
for _, collectorName := range consts.GPUMetricNames {
CollectResUtil(ch, instance, collectorName, collectors[collectorName])
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/model/process_energy.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ func addEstimatedEnergy(processIDList []uint64, processesMetrics map[uint64]*sta
}
// estimate the associated power consumption of GPU for each process
if config.EnabledGPU() {
if gpu := acc.GetRegistry().ActiveAcceleratorByType(acc.GPU); gpu != nil {
if gpu := acc.GetActiveAcceleratorByType(config.GPU); gpu != nil {
processGPUPower, errGPU = processComponentPowerModel.GetGPUPower(isIdlePower)
if errGPU != nil {
klog.V(5).Infoln("Could not estimate the Process GPU Power")
Expand Down
Loading
Loading