Documentation
¶
Overview ¶
Package gpumgr implements MortalGPUs detection, process accounting, and memory limit enforcement.
Index ¶
- Constants
- Variables
- type ContainerDevice
- type DeviceMemory
- type DeviceUtilization
- type GpuContainer
- func (g *GpuContainer) GetContainerId() string
- func (g *GpuContainer) GetContainerName() string
- func (g *GpuContainer) GetCopy() *GpuContainer
- func (g *GpuContainer) GetDevices() []*ContainerDevice
- func (g *GpuContainer) GetNodeName() string
- func (g *GpuContainer) GetPodId() string
- func (g *GpuContainer) GetPodMetaGPULimit() int64
- func (g *GpuContainer) GetPodMetaGPURequest() int64
- func (g *GpuContainer) GetPodNamespace() string
- func (g *GpuContainer) GetProcesses() []*GpuProcess
- func (g *GpuContainer) GetResourceName() string
- func (g *GpuContainer) MarshalLogObject(enc zapcore.ObjectEncoder) error
- type GpuDevice
- func (d *GpuDevice) GetAllocatedShares() int
- func (d *GpuDevice) GetCopy() *GpuDevice
- func (d *GpuDevice) GetDeviceIndex() int
- func (d *GpuDevice) GetDeviceUUID() string
- func (d *GpuDevice) GetMemory() DeviceMemory
- func (d *GpuDevice) GetMigId() int
- func (d *GpuDevice) GetModelName() string
- func (d *GpuDevice) GetNodeName() string
- func (d *GpuDevice) GetResourceName() string
- func (d *GpuDevice) GetShares() int
- func (d *GpuDevice) GetUUID() string
- func (d *GpuDevice) GetUtilization() DeviceUtilization
- func (d *GpuDevice) IsMigDevice() bool
- func (d *GpuDevice) MarshalLogObject(enc zapcore.ObjectEncoder) error
- func (d *GpuDevice) SetAllocatedShares(s int)
- func (d *GpuDevice) SetUUID(u string)
- type GpuDeviceInfo
- type GpuMgr
- func (m *GpuMgr) GetDeviceInfo() GpuDeviceInfo
- func (m *GpuMgr) GetProcesses(podName, podNamespace string) map[string][]*GpuContainer
- func (m *GpuMgr) KillGpuProcess(pid uint32) error
- func (m *GpuMgr) SetContainerLevelVisibilityToken(token string)
- func (m *GpuMgr) SetDeviceLevelVisibilityToken(token string)
- func (m *GpuMgr) StartMemoryEnforcer()
- type GpuProcess
- func (p *GpuProcess) GetCMDLine() []string
- func (p *GpuProcess) GetComputeInstanceId() int32
- func (p *GpuProcess) GetContainerId() string
- func (p *GpuProcess) GetCopy() *GpuProcess
- func (p *GpuProcess) GetDeviceUUID() string
- func (p *GpuProcess) GetGPUInstanceId() int32
- func (p *GpuProcess) GetGPUMemory() uint64
- func (p *GpuProcess) GetGPUUtilization() uint32
- func (p *GpuProcess) GetMetaGpuUUID() string
- func (p *GpuProcess) GetMigInstanceId() int
- func (p *GpuProcess) GetPid() uint32
- func (p *GpuProcess) GetShortCmdLine() string
- func (p *GpuProcess) GetUser() string
- func (p *GpuProcess) Kill() error
- func (p *GpuProcess) MarshalLogObject(enc zapcore.ObjectEncoder) error
- func (p *GpuProcess) SetProcessCmdline()
- func (p *GpuProcess) SetProcessContainerId()
- func (p *GpuProcess) SetProcessUsername()
Constants ¶
const GPUMigInstanceIDNoMIG = -1
GPUMigInstanceIDNoMIG is the constant value that indicates the device is not using MIG.
const (
MB uint64 = 1024 * 1024
)
const MemoryEnforcerLoopIntervalSec = 5
MemoryEnforcerLoopIntervalSec is the interval, in seconds, between memory enforcement passes for processes using MortalGPUs.
Variables ¶
var ErrBugNoProcessShareSize = errors.New(
"can not find a GPU process in a container to extract " +
"GPU ShareSize and compute maximum allowed GPU memory")
ErrBugNoProcessShareSize is returned if the enforcer could not extract information about a GPU from any of the processes of a GPU container. It indicates a potential bug.
var ErrParseProcessContainerID = errors.New("can not parse a process container ID")
ErrParseProcessContainerID is returned when the container ID for a process's PID cannot be parsed or extracted from procfs/cgroups.
Functions ¶
This section is empty.
Types ¶
type ContainerDevice ¶
type ContainerDevice struct {
// contains filtered or unexported fields
}
func (*ContainerDevice) GetAllocatedShares ¶
func (cd *ContainerDevice) GetAllocatedShares() int32
func (*ContainerDevice) GetGPUDevice ¶
func (cd *ContainerDevice) GetGPUDevice() *GpuDevice
type DeviceMemory ¶
type DeviceMemory struct {
// Total - memory, bytes.
Total uint64
// Free - memory, bytes.
Free uint64
// Used - memory, bytes.
Used uint64
// ShareSize - memory, bytes.
ShareSize uint64
}
func (*DeviceMemory) MarshalLogObject ¶
func (d *DeviceMemory) MarshalLogObject(enc zapcore.ObjectEncoder) error
type DeviceUtilization ¶
type DeviceUtilization struct {
Gpu uint32
Memory uint32
}
func (*DeviceUtilization) MarshalLogObject ¶
func (d *DeviceUtilization) MarshalLogObject(enc zapcore.ObjectEncoder) error
type GpuContainer ¶
type GpuContainer struct {
// contains filtered or unexported fields
}
func NewGpuContainer ¶
func NewGpuContainer(log *zap.Logger,
containerID, containerName, podID, podNamespace, resourceName, nodename string,
metagpuRequests, metagpuLimits int64,
deviceIDs []string,
gpuDevices map[string]*GpuDevice,
) *GpuContainer
func (*GpuContainer) GetContainerId ¶
func (g *GpuContainer) GetContainerId() string
func (*GpuContainer) GetContainerName ¶
func (g *GpuContainer) GetContainerName() string
func (*GpuContainer) GetDevices ¶
func (g *GpuContainer) GetDevices() []*ContainerDevice
func (*GpuContainer) GetNodeName ¶
func (g *GpuContainer) GetNodeName() string
func (*GpuContainer) GetPodMetaGPULimit ¶
func (g *GpuContainer) GetPodMetaGPULimit() int64
func (*GpuContainer) GetPodMetaGPURequest ¶
func (g *GpuContainer) GetPodMetaGPURequest() int64
func (*GpuContainer) GetPodNamespace ¶
func (g *GpuContainer) GetPodNamespace() string
func (*GpuContainer) GetProcesses ¶
func (g *GpuContainer) GetProcesses() []*GpuProcess
func (*GpuContainer) GetResourceName ¶
func (g *GpuContainer) GetResourceName() string
func (*GpuContainer) MarshalLogObject ¶
func (g *GpuContainer) MarshalLogObject(enc zapcore.ObjectEncoder) error
type GpuDevice ¶
type GpuDevice struct {
// contains filtered or unexported fields
}
func NewGpuDevice ¶
func NewGpuDevice(logger *zap.Logger,
miguuid, deviceuuid, modelName string,
index, miginstanceid int, utilization nvml.Utilization,
) *GpuDevice
func (*GpuDevice) GetAllocatedShares ¶
func (d *GpuDevice) GetAllocatedShares() int
func (*GpuDevice) GetDeviceIndex ¶
func (d *GpuDevice) GetDeviceIndex() int
func (*GpuDevice) GetDeviceUUID ¶
func (d *GpuDevice) GetDeviceUUID() string
GetDeviceUUID returns the parent device UUID for a MIG device, or the regular device UUID when not in MIG mode.
func (*GpuDevice) GetModelName ¶
func (d *GpuDevice) GetModelName() string
GetModelName returns a human-readable device name from the NVIDIA API.
func (*GpuDevice) GetNodeName ¶
func (d *GpuDevice) GetNodeName() string
func (*GpuDevice) GetResourceName ¶
func (d *GpuDevice) GetResourceName() string
func (*GpuDevice) GetUUID ¶
func (d *GpuDevice) GetUUID() string
GetUUID returns the MIG device UUID in MIG mode, otherwise the regular device UUID.
func (*GpuDevice) GetUtilization ¶
func (d *GpuDevice) GetUtilization() DeviceUtilization
func (*GpuDevice) IsMigDevice ¶
func (d *GpuDevice) IsMigDevice() bool
func (*GpuDevice) MarshalLogObject ¶
func (d *GpuDevice) MarshalLogObject(enc zapcore.ObjectEncoder) error
func (*GpuDevice) SetAllocatedShares ¶
func (d *GpuDevice) SetAllocatedShares(s int)
SetAllocatedShares sets allocated shares.
type GpuDeviceInfo ¶
type GpuDeviceInfo struct {
Node string
Metadata map[string]string
Devices []*GpuDevice
}
type GpuMgr ¶
type GpuMgr struct {
// contains filtered or unexported fields
}
func NewGpuManager ¶
func NewGpuManager(logger *zap.Logger, enforceMem bool, nodeName string, shareConf *sharecfg.DevicesSharingConfigs) *GpuMgr
NewGpuManager creates a new GPU manager. nodeName is the Kubernetes worker hostname, as registered in the Kubernetes API, used to filter Pods. If enforceMem is true, the manager kills processes that use more GPU memory than allowed.
func (*GpuMgr) GetDeviceInfo ¶
func (m *GpuMgr) GetDeviceInfo() GpuDeviceInfo
func (*GpuMgr) GetProcesses ¶
func (m *GpuMgr) GetProcesses(podName, podNamespace string) map[string][]*GpuContainer
func (*GpuMgr) KillGpuProcess ¶
func (m *GpuMgr) KillGpuProcess(pid uint32) error
func (*GpuMgr) SetContainerLevelVisibilityToken ¶
func (m *GpuMgr) SetContainerLevelVisibilityToken(token string)
func (*GpuMgr) SetDeviceLevelVisibilityToken ¶
func (m *GpuMgr) SetDeviceLevelVisibilityToken(token string)
func (*GpuMgr) StartMemoryEnforcer ¶
func (m *GpuMgr) StartMemoryEnforcer()
StartMemoryEnforcer starts a loop that checks all processes using MortalGPUs on a node and kills processes when the total memory usage of a container's processes exceeds the limits.
type GpuProcess ¶
type GpuProcess struct {
// contains filtered or unexported fields
}
func NewGpuProcess ¶
func NewGpuProcess(logger *zap.Logger, pid,
gpuUtil uint32, gpuMemBytes uint64,
mgpuUUID, devUUID string,
migInstanceID int, computeInstanceID, gpuInstanceID int32,
) *GpuProcess
func (*GpuProcess) GetCMDLine ¶
func (p *GpuProcess) GetCMDLine() []string
func (*GpuProcess) GetComputeInstanceId ¶
func (p *GpuProcess) GetComputeInstanceId() int32
func (*GpuProcess) GetContainerId ¶
func (p *GpuProcess) GetContainerId() string
func (*GpuProcess) GetDeviceUUID ¶
func (p *GpuProcess) GetDeviceUUID() string
func (*GpuProcess) GetGPUInstanceId ¶
func (p *GpuProcess) GetGPUInstanceId() int32
func (*GpuProcess) GetGPUMemory ¶
func (p *GpuProcess) GetGPUMemory() uint64
func (*GpuProcess) GetGPUUtilization ¶
func (p *GpuProcess) GetGPUUtilization() uint32
func (*GpuProcess) GetMetaGpuUUID ¶
func (p *GpuProcess) GetMetaGpuUUID() string
func (*GpuProcess) GetMigInstanceId ¶
func (p *GpuProcess) GetMigInstanceId() int
func (*GpuProcess) GetShortCmdLine ¶
func (p *GpuProcess) GetShortCmdLine() string
func (*GpuProcess) MarshalLogObject ¶
func (p *GpuProcess) MarshalLogObject(enc zapcore.ObjectEncoder) error
MarshalLogObject implements zap.ObjectLogMarshaler.
func (*GpuProcess) SetProcessCmdline ¶
func (p *GpuProcess) SetProcessCmdline()
func (*GpuProcess) SetProcessContainerId ¶
func (p *GpuProcess) SetProcessContainerId()
func (*GpuProcess) SetProcessUsername ¶
func (p *GpuProcess) SetProcessUsername()