gpumgr

package
v0.0.0-...-5816aae Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 17, 2025 License: MIT Imports: 21 Imported by: 0

Documentation

Overview

Package gpumgr implements MortalGPU detection, process accounting, and memory limit enforcement.

Index

Constants

View Source
const GPUMigInstanceIDNoMIG = -1

GPUMigInstanceIDNoMIG is the constant value used to indicate that the device is not using MIG.

View Source
const (
	MB uint64 = 1024 * 1024
)
View Source
const MemoryEnforcerLoopIntervalSec = 5

MemoryEnforcerLoopIntervalSec is the interval, in seconds, between memory enforcement passes for processes using MortalGPUs.

Variables

View Source
var ErrBugNoProcessShareSize = errors.New(
	"can not find a GPU process in a container to extract " +
		"GPU ShareSize and compute maximum allowed GPU memory")

ErrBugNoProcessShareSize is returned if the enforcer could not extract information about a GPU from any of the processes of a GPU container. It indicates a potential bug.

View Source
var ErrParseProcessContainerID = errors.New("can not parse a process container ID")

ErrParseProcessContainerID is returned when the container ID for a process's PID cannot be parsed or extracted from procfs/cgroups.

Functions

This section is empty.

Types

type ContainerDevice

type ContainerDevice struct {
	// contains filtered or unexported fields
}

func (*ContainerDevice) GetAllocatedShares

func (cd *ContainerDevice) GetAllocatedShares() int32

func (*ContainerDevice) GetCopy

func (cd *ContainerDevice) GetCopy() *ContainerDevice

func (*ContainerDevice) GetGPUDevice

func (cd *ContainerDevice) GetGPUDevice() *GpuDevice

type DeviceMemory

type DeviceMemory struct {
	// Total - memory, bytes.
	Total uint64
	// Free - memory, bytes.
	Free uint64
	// Used - memory, bytes.
	Used uint64
	// ShareSize - memory, bytes.
	ShareSize uint64
}

func (*DeviceMemory) MarshalLogObject

func (d *DeviceMemory) MarshalLogObject(enc zapcore.ObjectEncoder) error

type DeviceUtilization

type DeviceUtilization struct {
	Gpu    uint32
	Memory uint32
}

func (*DeviceUtilization) MarshalLogObject

func (d *DeviceUtilization) MarshalLogObject(enc zapcore.ObjectEncoder) error

type GpuContainer

type GpuContainer struct {
	// contains filtered or unexported fields
}

func NewGpuContainer

func NewGpuContainer(log *zap.Logger,
	containerID, containerName, podID, podNamespace, resourceName, nodename string,
	metagpuRequests, metagpuLimits int64,
	deviceIDs []string,
	gpuDevices map[string]*GpuDevice,
) *GpuContainer

func (*GpuContainer) GetContainerId

func (g *GpuContainer) GetContainerId() string

func (*GpuContainer) GetContainerName

func (g *GpuContainer) GetContainerName() string

func (*GpuContainer) GetCopy

func (g *GpuContainer) GetCopy() *GpuContainer

func (*GpuContainer) GetDevices

func (g *GpuContainer) GetDevices() []*ContainerDevice

func (*GpuContainer) GetNodeName

func (g *GpuContainer) GetNodeName() string

func (*GpuContainer) GetPodId

func (g *GpuContainer) GetPodId() string

func (*GpuContainer) GetPodMetaGPULimit

func (g *GpuContainer) GetPodMetaGPULimit() int64

func (*GpuContainer) GetPodMetaGPURequest

func (g *GpuContainer) GetPodMetaGPURequest() int64

func (*GpuContainer) GetPodNamespace

func (g *GpuContainer) GetPodNamespace() string

func (*GpuContainer) GetProcesses

func (g *GpuContainer) GetProcesses() []*GpuProcess

func (*GpuContainer) GetResourceName

func (g *GpuContainer) GetResourceName() string

func (*GpuContainer) MarshalLogObject

func (g *GpuContainer) MarshalLogObject(enc zapcore.ObjectEncoder) error

type GpuDevice

type GpuDevice struct {
	// contains filtered or unexported fields
}

func NewGpuDevice

func NewGpuDevice(logger *zap.Logger,
	miguuid, deviceuuid, modelName string,
	index, miginstanceid int, utilization nvml.Utilization,
) *GpuDevice

func (*GpuDevice) GetAllocatedShares

func (d *GpuDevice) GetAllocatedShares() int

func (*GpuDevice) GetCopy

func (d *GpuDevice) GetCopy() *GpuDevice

func (*GpuDevice) GetDeviceIndex

func (d *GpuDevice) GetDeviceIndex() int

func (*GpuDevice) GetDeviceUUID

func (d *GpuDevice) GetDeviceUUID() string

GetDeviceUUID returns the parent device UUID if the device is a MIG device, or the regular device UUID when not in MIG mode.

func (*GpuDevice) GetMemory

func (d *GpuDevice) GetMemory() DeviceMemory

func (*GpuDevice) GetMigId

func (d *GpuDevice) GetMigId() int

func (*GpuDevice) GetModelName

func (d *GpuDevice) GetModelName() string

GetModelName returns a human-readable device name from the NVIDIA API.

func (*GpuDevice) GetNodeName

func (d *GpuDevice) GetNodeName() string

func (*GpuDevice) GetResourceName

func (d *GpuDevice) GetResourceName() string

func (*GpuDevice) GetShares

func (d *GpuDevice) GetShares() int

func (*GpuDevice) GetUUID

func (d *GpuDevice) GetUUID() string

GetUUID returns the MIG device UUID in MIG mode, otherwise the normal device UUID.

func (*GpuDevice) GetUtilization

func (d *GpuDevice) GetUtilization() DeviceUtilization

func (*GpuDevice) IsMigDevice

func (d *GpuDevice) IsMigDevice() bool

func (*GpuDevice) MarshalLogObject

func (d *GpuDevice) MarshalLogObject(enc zapcore.ObjectEncoder) error

func (*GpuDevice) SetAllocatedShares

func (d *GpuDevice) SetAllocatedShares(s int)

SetAllocatedShares sets allocated shares.

func (*GpuDevice) SetUUID

func (d *GpuDevice) SetUUID(u string)

SetUUID sets the MIG device UUID in MIG mode, otherwise the normal device UUID.

type GpuDeviceInfo

type GpuDeviceInfo struct {
	Node     string
	Metadata map[string]string
	Devices  []*GpuDevice
}

type GpuMgr

type GpuMgr struct {
	// contains filtered or unexported fields
}

func NewGpuManager

func NewGpuManager(logger *zap.Logger, enforceMem bool, nodeName string, shareConf *sharecfg.DevicesSharingConfigs) *GpuMgr

NewGpuManager creates a new GPU manager. nodeName is the Kubernetes worker hostname, as it is registered in the Kube API, and is used to filter Pods. If enforceMem is true, the manager will kill processes that use more GPU memory than allowed.

func (*GpuMgr) GetDeviceInfo

func (m *GpuMgr) GetDeviceInfo() GpuDeviceInfo

func (*GpuMgr) GetProcesses

func (m *GpuMgr) GetProcesses(podName, podNamespace string) map[string][]*GpuContainer

func (*GpuMgr) KillGpuProcess

func (m *GpuMgr) KillGpuProcess(pid uint32) error

func (*GpuMgr) SetContainerLevelVisibilityToken

func (m *GpuMgr) SetContainerLevelVisibilityToken(token string)

func (*GpuMgr) SetDeviceLevelVisibilityToken

func (m *GpuMgr) SetDeviceLevelVisibilityToken(token string)

func (*GpuMgr) StartMemoryEnforcer

func (m *GpuMgr) StartMemoryEnforcer()

StartMemoryEnforcer starts a loop that checks all processes using MortalGPUs on a node and kills processes if the total memory usage of a container's processes is larger than the limits.

type GpuProcess

type GpuProcess struct {
	// contains filtered or unexported fields
}

func NewGpuProcess

func NewGpuProcess(logger *zap.Logger, pid,
	gpuUtil uint32, gpuMemBytes uint64,
	mgpuUUID, devUUID string,
	migInstanceID int, computeInstanceID, gpuInstanceID int32,
) *GpuProcess

func (*GpuProcess) GetCMDLine

func (p *GpuProcess) GetCMDLine() []string

func (*GpuProcess) GetComputeInstanceId

func (p *GpuProcess) GetComputeInstanceId() int32

func (*GpuProcess) GetContainerId

func (p *GpuProcess) GetContainerId() string

func (*GpuProcess) GetCopy

func (p *GpuProcess) GetCopy() *GpuProcess

func (*GpuProcess) GetDeviceUUID

func (p *GpuProcess) GetDeviceUUID() string

func (*GpuProcess) GetGPUInstanceId

func (p *GpuProcess) GetGPUInstanceId() int32

func (*GpuProcess) GetGPUMemory

func (p *GpuProcess) GetGPUMemory() uint64

func (*GpuProcess) GetGPUUtilization

func (p *GpuProcess) GetGPUUtilization() uint32

func (*GpuProcess) GetMetaGpuUUID

func (p *GpuProcess) GetMetaGpuUUID() string

func (*GpuProcess) GetMigInstanceId

func (p *GpuProcess) GetMigInstanceId() int

func (*GpuProcess) GetPid

func (p *GpuProcess) GetPid() uint32

func (*GpuProcess) GetShortCmdLine

func (p *GpuProcess) GetShortCmdLine() string

func (*GpuProcess) GetUser

func (p *GpuProcess) GetUser() string

func (*GpuProcess) Kill

func (p *GpuProcess) Kill() error

func (*GpuProcess) MarshalLogObject

func (p *GpuProcess) MarshalLogObject(enc zapcore.ObjectEncoder) error

MarshalLogObject implements zap.ObjectLogMarshaler.

func (*GpuProcess) SetProcessCmdline

func (p *GpuProcess) SetProcessCmdline()

func (*GpuProcess) SetProcessContainerId

func (p *GpuProcess) SetProcessContainerId()

func (*GpuProcess) SetProcessUsername

func (p *GpuProcess) SetProcessUsername()

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL
JackTT - Gopher 🇻🇳