Skip to content

Commit

Permalink
chore: remove gpu global vars
Browse files Browse the repository at this point in the history
Signed-off-by: Maryam Tahhan <[email protected]>
  • Loading branch information
maryamtahhan committed Aug 19, 2024
1 parent 2e14406 commit bba5100
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 57 deletions.
48 changes: 24 additions & 24 deletions pkg/sensors/accelerator/devices/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ const (
)

var (
dcgmAccImpl = GPUDcgm{}
dcgmAccImpl = gpuDcgm{}
deviceFields []dcgm.Short = []dcgm.Short{
// https://docs.nvidia.com/datacenter/dcgm/1.7/dcgm-api/group__dcgmFieldIdentifiers.htm
dcgm.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE,
Expand All @@ -46,7 +46,7 @@ var (
dcgmType DeviceType
)

type GPUDcgm struct {
type gpuDcgm struct {
collectionSupported bool
devs map[int]GPUDevice
migDevices map[int]map[int]GPUDevice // list of mig devices for each GPU instance
Expand Down Expand Up @@ -87,7 +87,7 @@ func dcgmDeviceStartup() Device {
return &a
}

func (d *GPUDcgm) Init() error {
func (d *gpuDcgm) Init() error {
if !d.libInited {
if err := d.InitLib(); err != nil {
klog.Errorf("failed to init lib: %v", err)
Expand Down Expand Up @@ -122,7 +122,7 @@ func (d *GPUDcgm) Init() error {
return nil
}

func (d *GPUDcgm) InitLib() (err error) {
func (d *gpuDcgm) InitLib() (err error) {
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("could not init dcgm: %v", r)
Expand Down Expand Up @@ -162,7 +162,7 @@ func (d *GPUDcgm) InitLib() (err error) {
return nil
}

func (d *GPUDcgm) loadDevices() error {
func (d *gpuDcgm) loadDevices() error {
d.devs = map[int]GPUDevice{}
count, err := nvml.DeviceGetCount()
if err != nvml.SUCCESS {
Expand All @@ -185,7 +185,7 @@ func (d *GPUDcgm) loadDevices() error {
}

// LoadMIGDevices dynamically discovers the MIG instances of all GPUs.
func (d *GPUDcgm) LoadMIGDevices() {
func (d *gpuDcgm) LoadMIGDevices() {
d.migDevices = map[int]map[int]GPUDevice{}

// find all GPUs and the MIG slices if they exist
Expand Down Expand Up @@ -238,7 +238,7 @@ func (d *GPUDcgm) LoadMIGDevices() {
}
}

func (d *GPUDcgm) loadMIGProfiles() {
func (d *gpuDcgm) loadMIGProfiles() {
if len(d.devs) == 0 {
klog.Errorln("DCGM has no GPU to monitor")
return
Expand All @@ -258,27 +258,27 @@ func (d *GPUDcgm) loadMIGProfiles() {
}
}

// Name returns the string form of the package-level dcgmType value,
// obtained by calling its String method.
func (d *GPUDcgm) Name() string {
func (d *gpuDcgm) Name() string {
return dcgmType.String()
}

// DevType returns the package-level dcgmType value unchanged.
func (d *GPUDcgm) DevType() DeviceType {
func (d *gpuDcgm) DevType() DeviceType {
return dcgmType
}

// HwType returns dcgmHwType, a string declared outside the visible part of
// this file.
func (d *GPUDcgm) HwType() string {
func (d *gpuDcgm) HwType() string {
return dcgmHwType
}

// IsDeviceCollectionSupported reports the current value of the receiver's
// collectionSupported flag.
func (d *GPUDcgm) IsDeviceCollectionSupported() bool {
func (d *gpuDcgm) IsDeviceCollectionSupported() bool {
return d.collectionSupported
}

// SetDeviceCollectionSupported overwrites the receiver's collectionSupported
// flag with supported.
func (d *GPUDcgm) SetDeviceCollectionSupported(supported bool) {
func (d *gpuDcgm) SetDeviceCollectionSupported(supported bool) {
d.collectionSupported = supported
}

func (d *GPUDcgm) Shutdown() bool {
func (d *gpuDcgm) Shutdown() bool {
nvml.Shutdown()
dcgm.FieldsTerm()
if d.deviceGroupName != "" {
Expand All @@ -299,7 +299,7 @@ func (d *GPUDcgm) Shutdown() bool {
return true
}

func (d *GPUDcgm) AbsEnergyFromDevice() []uint32 {
func (d *gpuDcgm) AbsEnergyFromDevice() []uint32 {
gpuEnergy := []uint32{}
for _, dev := range d.devs {
power, ret := dev.DeviceHandler.(nvml.Device).GetPowerUsage()
Expand All @@ -315,20 +315,20 @@ func (d *GPUDcgm) AbsEnergyFromDevice() []uint32 {
return gpuEnergy
}

// DevicesByID returns a newly allocated map containing every entry of d.devs
// under the same int key, with each device stored as an any value. The
// returned map is a separate map object, so adding or removing keys on it does
// not modify d.devs.
func (d *GPUDcgm) DevicesByID() map[int]any {
func (d *gpuDcgm) DevicesByID() map[int]any {
devs := make(map[int]any)
for id, dev := range d.devs {
devs[id] = dev
}
return devs
}

// DevicesByName always returns a new, empty (non-nil) map; this implementation
// does not populate it with any devices.
func (d *GPUDcgm) DevicesByName() map[string]any {
func (d *gpuDcgm) DevicesByName() map[string]any {
devs := make(map[string]any)
return devs
}

func (d *GPUDcgm) DeviceInstances() map[int]map[int]any {
func (d *gpuDcgm) DeviceInstances() map[int]map[int]any {
// LoadMIGDevices
d.LoadMIGDevices()

Expand All @@ -343,13 +343,13 @@ func (d *GPUDcgm) DeviceInstances() map[int]map[int]any {
return devInstances
}

// DeviceUtilizationStats always returns a new, empty (non-nil) map and a nil
// error. The dev argument is not read.
func (d *GPUDcgm) DeviceUtilizationStats(dev any) (map[any]any, error) {
func (d *gpuDcgm) DeviceUtilizationStats(dev any) (map[any]any, error) {
ds := make(map[any]any) // Process Accelerator Metrics
return ds, nil
}

// ProcessResourceUtilizationPerDevice returns the GPU utilization per process. The dev argument can be a MIG instance or the main GPU.
func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) {
func (d *gpuDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]any, error) {
processAcceleratorMetrics := map[uint32]GPUProcessUtilizationSample{}
pam := make(map[uint32]any)
// Check if the device is of type dev.GPUDevice and extract the DeviceHandler
Expand Down Expand Up @@ -426,7 +426,7 @@ func (d *GPUDcgm) ProcessResourceUtilizationPerDevice(dev any, since time.Durati
}

// helper functions
func (d *GPUDcgm) initNVML() error {
func (d *gpuDcgm) initNVML() error {
if ret := nvml.Init(); ret != nvml.SUCCESS {
d.collectionSupported = false
d.Shutdown()
Expand All @@ -435,7 +435,7 @@ func (d *GPUDcgm) initNVML() error {
return nil
}

func (d *GPUDcgm) createDeviceGroup() error {
func (d *gpuDcgm) createDeviceGroup() error {
deviceGroupName := "dev-grp-" + time.Now().Format("2006-01-02-15-04-05")
deviceGroup, err := dcgm.CreateGroup(deviceGroupName)
if err != nil {
Expand All @@ -447,7 +447,7 @@ func (d *GPUDcgm) createDeviceGroup() error {
return nil
}

func (d *GPUDcgm) addDevicesToGroup() error {
func (d *gpuDcgm) addDevicesToGroup() error {
for gpuID := range d.devs {
err := dcgm.AddEntityToGroup(d.deviceGroupHandle, dcgm.FE_GPU, uint(gpuID))
if err != nil {
Expand All @@ -463,7 +463,7 @@ func (d *GPUDcgm) addDevicesToGroup() error {
return nil
}

func (d *GPUDcgm) createFieldGroup() error {
func (d *gpuDcgm) createFieldGroup() error {
fieldGroupName := "fld-grp-" + time.Now().Format("2006-01-02-15-04-05")
fieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, deviceFields)
if err != nil {
Expand All @@ -474,7 +474,7 @@ func (d *GPUDcgm) createFieldGroup() error {
return nil
}

func (d *GPUDcgm) setupWatcher() error {
func (d *gpuDcgm) setupWatcher() error {
// watch interval has an impact on cpu usage, set it carefully
err := dcgm.WatchFieldsWithGroupEx(d.fieldGroupHandle, d.deviceGroupHandle, int64(1000)*1000, 0.0, 1)
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/sensors/accelerator/devices/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ func addDeviceInterface(registry *Registry, dtype DeviceType, accType string, de
klog.V(5).Infof("Registered %s", dtype)
}

// Startup initializes and returns a new Device according to the given DeviceType [NVML|DCGM|DUMMY|HABANA].
// Startup initializes and returns a new Device according to the given DeviceType [NVML|DCGM|HABANA].
func Startup(a string) Device {
// Retrieve the global registry
registry := GetRegistry()
Expand Down
32 changes: 16 additions & 16 deletions pkg/sensors/accelerator/devices/habana.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@ const (
)

var (
habanaAccImpl = GPUHabana{}
habanaAccImpl = gpuHabana{}
habanaType DeviceType
)

// gpuHabana (named GPUHabana before this commit) holds the state used by the
// Habana device methods in this file.
type GPUHabana struct {
type gpuHabana struct {
// collectionSupported is the flag read by IsDeviceCollectionSupported and
// written by SetDeviceCollectionSupported.
collectionSupported bool
// devices is ranged over by AbsEnergyFromDevice, which type-asserts each
// value to GPUDevice.
devices map[int]interface{}
}
Expand All @@ -66,26 +66,26 @@ func habanaDeviceStartup() Device {
return &a
}

// Name returns the string form of the package-level habanaType value,
// obtained by calling its String method.
func (g *GPUHabana) Name() string {
func (g *gpuHabana) Name() string {
return habanaType.String()
}

// DevType returns the package-level habanaType value unchanged.
func (g *GPUHabana) DevType() DeviceType {
func (g *gpuHabana) DevType() DeviceType {
return habanaType
}

// HwType returns habanaAccType, a string declared outside the visible part of
// this file.
func (g *GPUHabana) HwType() string {
func (g *gpuHabana) HwType() string {
return habanaAccType
}

// InitLib checks for the file at libhlmlpath using os.Stat. It returns the
// Stat error only when that error matches os.ErrNotExist; in every other case,
// including a Stat failure for a different reason, it returns nil.
func (g *GPUHabana) InitLib() error {
func (g *gpuHabana) InitLib() error {
if _, err := os.Stat(libhlmlpath); errors.Is(err, os.ErrNotExist) {
return err
}
return nil
}

func (g *GPUHabana) Init() error {
func (g *gpuHabana) Init() error {
ret := hlml.Initialize()
if ret != nil {
klog.Error("ERROR initializing hlml")
Expand All @@ -98,14 +98,14 @@ func (g *GPUHabana) Init() error {
return ret
}

// Shutdown calls hlml.Shutdown and reports whether it succeeded: false when
// the call returns a non-nil value, true otherwise. The returned error itself
// is discarded.
func (g *GPUHabana) Shutdown() bool {
func (g *gpuHabana) Shutdown() bool {
if ret := hlml.Shutdown(); ret != nil {
return false
}
return true
}

func (g *GPUHabana) AbsEnergyFromDevice() []uint32 {
func (g *gpuHabana) AbsEnergyFromDevice() []uint32 {
gpuEnergy := []uint32{}
for _, dev := range g.devices {
power, ret := dev.(GPUDevice).DeviceHandler.(hlml.Device).PowerUsage()
Expand All @@ -123,7 +123,7 @@ func (g *GPUHabana) AbsEnergyFromDevice() []uint32 {
return gpuEnergy
}

func (g *GPUHabana) DevicesByID() map[int]interface{} {
func (g *gpuHabana) DevicesByID() map[int]interface{} {
// Get the count of available devices
count, ret := hlml.DeviceCount()
if ret != nil {
Expand All @@ -146,30 +146,30 @@ func (g *GPUHabana) DevicesByID() map[int]interface{} {
return devices
}

// DevicesByName always returns a new, empty (non-nil) map; this implementation
// does not populate it with any devices.
func (g *GPUHabana) DevicesByName() map[string]any {
func (g *gpuHabana) DevicesByName() map[string]any {
devices := make(map[string]interface{})
return devices
}

// DeviceInstances always returns a nil map: the local variable is declared
// with var and never assigned. Callers can range over or read from the result
// safely, but writing to it would panic.
func (g *GPUHabana) DeviceInstances() map[int]map[int]interface{} {
func (g *gpuHabana) DeviceInstances() map[int]map[int]interface{} {
var devices map[int]map[int]interface{}
return devices
}

// DeviceUtilizationStats always returns a new, empty (non-nil) map and a nil
// error. The dev argument is not read.
func (g *GPUHabana) DeviceUtilizationStats(dev any) (map[any]interface{}, error) {
func (g *gpuHabana) DeviceUtilizationStats(dev any) (map[any]interface{}, error) {
ds := make(map[any]interface{}) // Process Accelerator Metrics
return ds, nil
}

// ProcessResourceUtilizationPerDevice always returns a new, empty (non-nil)
// map and a nil error. Neither dev nor since is read.
func (g *GPUHabana) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]interface{}, error) {
func (g *gpuHabana) ProcessResourceUtilizationPerDevice(dev any, since time.Duration) (map[uint32]interface{}, error) {
pam := make(map[uint32]interface{}) // Process Accelerator Metrics
return pam, nil
}

// IsDeviceCollectionSupported reports the current value of the receiver's
// collectionSupported flag.
func (g *GPUHabana) IsDeviceCollectionSupported() bool {
func (g *gpuHabana) IsDeviceCollectionSupported() bool {
return g.collectionSupported
}

// SetDeviceCollectionSupported overwrites the receiver's collectionSupported
// flag with supported.
func (g *GPUHabana) SetDeviceCollectionSupported(supported bool) {
func (g *gpuHabana) SetDeviceCollectionSupported(supported bool) {
g.collectionSupported = supported
}
Loading

0 comments on commit bba5100

Please sign in to comment.