Documentation
¶
Index ¶
- Constants
- Variables
- func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error)
- func AddToGroup(groupId GroupHandle, gpuId uint) (err error)
- func DestroyGroup(groupId GroupHandle) (err error)
- func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
- func FieldsInit() int
- func FieldsTerm() int
- func FindFirstNonAsciiIndex(value [4096]byte) int
- func Fv2_Blob(fv FieldValue_v2) [4096]byte
- func Fv2_Float64(fv FieldValue_v2) float64
- func Fv2_Int64(fv FieldValue_v2) int64
- func Fv2_String(fv FieldValue_v2) string
- func GetAllDeviceCount() (uint, error)
- func GetSupportedDevices() ([]uint, error)
- func Init(m mode, args ...string) (cleanup func(), err error)
- func IsInt32Blank(value int) bool
- func IsInt64Blank(value int64) bool
- func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)
- func Shutdown() (err error)
- func UpdateAllFields() error
- func ViolationRegistration(data unsafe.Pointer) int
- func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
- func WatchFieldsWithGroupEx(fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, ...) error
- type ClockInfo
- type DcgmStatus
- type Device
- type DeviceHealth
- type DeviceIdentifiers
- type DeviceStatus
- type ECCErrorsInfo
- type FieldHandle
- type FieldMeta
- type FieldValue_v1
- type FieldValue_v2
- type Field_Entity_Group
- type GroupEntityPair
- type GroupHandle
- func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
- func NewDefaultGroup(groupName string) (GroupHandle, error)
- func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
- func WatchPidFields() (GroupHandle, error)
- func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)
- type MemoryInfo
- type MetricGroup
- type MigEntityInfo
- type MigHierarchyInfo_v2
- type MigHierarchy_v2
- type P2PLink
- type P2PLinkType
- type PCIInfo
- type PCIStatusInfo
- type PCIThroughputInfo
- type PerfState
- type PolicyViolation
- type ProcessInfo
- type ProcessUtilInfo
- type Short
- type SystemWatch
- type Time
- type UtilizationInfo
- type ViolationTime
- type XIDErrorInfo
Constants ¶
const ( Embedded mode = iota Standalone StartHostengine )
Constants for the DCGM hostengine running modes: Embedded, Standalone, or StartHostengine.
const ( DCGM_FT_BINARY = uint('b') DCGM_FT_DOUBLE = uint('d') DCGM_FT_INT64 = uint('i') DCGM_FT_STRING = uint('s') DCGM_FT_TIMESTAMP = uint('t') DCGM_FT_INT32_BLANK = int64(2147483632) DCGM_FT_INT32_NOT_FOUND = int64(DCGM_FT_INT32_BLANK + 1) DCGM_FT_INT32_NOT_SUPPORTED = int64(DCGM_FT_INT32_BLANK + 2) DCGM_FT_INT32_NOT_PERMISSIONED = int64(DCGM_FT_INT32_BLANK + 3) DCGM_FT_INT64_BLANK = int64(9223372036854775792) DCGM_FT_INT64_NOT_FOUND = int64(DCGM_FT_INT64_BLANK + 1) DCGM_FT_INT64_NOT_SUPPORTED = int64(DCGM_FT_INT64_BLANK + 2) DCGM_FT_INT64_NOT_PERMISSIONED = int64(DCGM_FT_INT64_BLANK + 3) DCGM_FT_FP64_BLANK = 140737488355328.0 DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) DCGM_FT_STR_BLANK = "<<<NULL>>>" DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>" DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>" DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERM>>>" DCGM_FI_UNKNOWN = 0 DCGM_FI_DRIVER_VERSION = 1 DCGM_FI_NVML_VERSION = 2 DCGM_FI_PROCESS_NAME = 3 DCGM_FI_DEV_COUNT = 4 DCGM_FI_DEV_NAME = 50 DCGM_FI_DEV_BRAND = 51 DCGM_FI_DEV_NVML_INDEX = 52 DCGM_FI_DEV_SERIAL = 53 DCGM_FI_DEV_UUID = 54 DCGM_FI_DEV_MINOR_NUMBER = 55 DCGM_FI_DEV_OEM_INFOROM_VER = 56 DCGM_FI_DEV_PCI_BUSID = 57 DCGM_FI_DEV_PCI_COMBINED_ID = 58 DCGM_FI_DEV_PCI_SUBSYS_ID = 59 DCGM_FI_GPU_TOPOLOGY_PCI = 60 DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 DCGM_FI_DEV_COMPUTE_MODE = 65 DCGM_FI_DEV_CPU_AFFINITY_0 = 70 DCGM_FI_DEV_CPU_AFFINITY_1 = 71 DCGM_FI_DEV_CPU_AFFINITY_2 = 72 DCGM_FI_DEV_CPU_AFFINITY_3 = 73 DCGM_FI_DEV_ECC_INFOROM_VER = 80 DCGM_FI_DEV_POWER_INFOROM_VER = 81 DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 DCGM_FI_DEV_VBIOS_VERSION = 85 DCGM_FI_DEV_BAR1_TOTAL = 90 DCGM_FI_SYNC_BOOST = 91 DCGM_FI_DEV_BAR1_USED = 92 DCGM_FI_DEV_BAR1_FREE = 
93 DCGM_FI_DEV_SM_CLOCK = 100 DCGM_FI_DEV_MEM_CLOCK = 101 DCGM_FI_DEV_VIDEO_CLOCK = 102 DCGM_FI_DEV_APP_SM_CLOCK = 110 DCGM_FI_DEV_APP_MEM_CLOCK = 111 DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 DCGM_FI_DEV_MAX_SM_CLOCK = 113 DCGM_FI_DEV_MAX_MEM_CLOCK = 114 DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 DCGM_FI_DEV_AUTOBOOST = 120 DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 DCGM_FI_DEV_MEMORY_TEMP = 140 DCGM_FI_DEV_GPU_TEMP = 150 DCGM_FI_DEV_POWER_USAGE = 155 DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 DCGM_FI_DEV_SLOWDOWN_TEMP = 158 DCGM_FI_DEV_SHUTDOWN_TEMP = 159 DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 DCGM_FI_DEV_PSTATE = 190 DCGM_FI_DEV_FAN_SPEED = 191 DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 DCGM_FI_DEV_GPU_UTIL = 203 DCGM_FI_DEV_MEM_COPY_UTIL = 204 DCGM_FI_DEV_ACCOUNTING_DATA = 205 DCGM_FI_DEV_ENC_UTIL = 206 DCGM_FI_DEV_DEC_UTIL = 207 DCGM_FI_DEV_MEM_COPY_UTIL_SAMPLES = 210 DCGM_FI_DEV_GPU_UTIL_SAMPLES = 211 DCGM_FI_DEV_GRAPHICS_PIDS = 220 DCGM_FI_DEV_COMPUTE_PIDS = 221 DCGM_FI_DEV_XID_ERRORS = 230 DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 DCGM_FI_DEV_PCIE_LINK_GEN = 237 DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 DCGM_FI_DEV_POWER_VIOLATION = 240 DCGM_FI_DEV_THERMAL_VIOLATION = 241 DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 DCGM_FI_DEV_FB_TOTAL = 250 DCGM_FI_DEV_FB_FREE = 251 DCGM_FI_DEV_FB_USED = 252 DCGM_FI_DEV_ECC_CURRENT = 300 DCGM_FI_DEV_ECC_PENDING = 301 DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 
DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 DCGM_FI_DEV_RETIRED_SBE = 390 DCGM_FI_DEV_RETIRED_DBE = 391 DCGM_FI_DEV_RETIRED_PENDING = 392 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 DCGM_FI_DEV_VIRTUAL_MODE = 500 DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 DCGM_FI_DEV_ENC_STATS = 506 DCGM_FI_DEV_FBC_STATS = 507 DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 DCGM_FI_DEV_VGPU_VM_ID = 520 DCGM_FI_DEV_VGPU_VM_NAME = 521 DCGM_FI_DEV_VGPU_TYPE = 522 DCGM_FI_DEV_VGPU_UUID = 523 DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 DCGM_FI_DEV_VGPU_ENC_STATS = 528 DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 DCGM_FI_DEV_VGPU_FBC_STATS = 530 DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 DCGM_FI_FIRST_VGPU_FIELD_ID = 520 DCGM_FI_LAST_VGPU_FIELD_ID = 570 DCGM_FI_INTERNAL_FIELDS_0_START = 600 DCGM_FI_INTERNAL_FIELDS_0_END = 699 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P00 = 700 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P00 = 701 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P00 = 702 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P00 = 703 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P01 = 704 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P01 = 705 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P01 = 706 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P01 = 707 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P02 = 708 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P02 = 709 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P02 = 710 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P02 = 711 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P03 = 712 
DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P03 = 713 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P03 = 714 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P03 = 715 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P04 = 716 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P04 = 717 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P04 = 718 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P04 = 719 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P05 = 720 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P05 = 721 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P05 = 722 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P05 = 723 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P06 = 724 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P06 = 725 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P06 = 726 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P06 = 727 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P07 = 728 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P07 = 729 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P07 = 730 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P07 = 731 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P08 = 732 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P08 = 733 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P08 = 734 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P08 = 735 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P09 = 736 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P09 = 737 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P09 = 738 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P09 = 739 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P10 = 740 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P10 = 741 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P10 = 742 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P10 = 743 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P11 = 744 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P11 = 745 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P11 = 746 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P11 = 747 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P12 = 748 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P12 = 749 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P12 = 750 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P12 = 751 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P13 = 752 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P13 = 753 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P13 = 754 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P13 = 755 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P14 = 756 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P14 = 757 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P14 = 758 
DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P14 = 759 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P15 = 760 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P15 = 761 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P15 = 762 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P15 = 763 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P16 = 764 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P16 = 765 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P16 = 766 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P16 = 767 DCGM_FI_DEV_NVSWITCH_LATENCY_LOW_P17 = 768 DCGM_FI_DEV_NVSWITCH_LATENCY_MED_P17 = 769 DCGM_FI_DEV_NVSWITCH_LATENCY_HIGH_P17 = 770 DCGM_FI_DEV_NVSWITCH_LATENCY_MAX_P17 = 771 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P00 = 780 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P00 = 781 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P01 = 782 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P01 = 783 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P02 = 784 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P02 = 785 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P03 = 786 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P03 = 787 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P04 = 788 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P04 = 789 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P05 = 790 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P05 = 791 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P06 = 792 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P06 = 793 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P07 = 794 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P07 = 795 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P08 = 796 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P08 = 797 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P09 = 798 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P09 = 799 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P10 = 800 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P10 = 801 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P11 = 802 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P11 = 803 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P12 = 804 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P12 = 805 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P13 = 806 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P13 = 807 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P14 = 808 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P14 = 809 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P15 = 810 
DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P15 = 811 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P16 = 812 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P16 = 813 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_0_P17 = 814 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_0_P17 = 815 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P00 = 820 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P00 = 821 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P01 = 822 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P01 = 823 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P02 = 824 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P02 = 825 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P03 = 826 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P03 = 827 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P04 = 828 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P04 = 829 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P05 = 830 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P05 = 831 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P06 = 832 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P06 = 833 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P07 = 834 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P07 = 835 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P08 = 836 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P08 = 837 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P09 = 838 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P09 = 839 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P10 = 840 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P10 = 841 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P11 = 842 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P11 = 843 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P12 = 844 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P12 = 845 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P13 = 846 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P13 = 847 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P14 = 848 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P14 = 849 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P15 = 850 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P15 = 851 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P16 = 852 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P16 = 853 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_TX_1_P17 = 854 DCGM_FI_DEV_NVSWITCH_BANDWIDTH_RX_1_P17 = 855 DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 
DCGM_FI_FIRST_NVSWITCH_FIELD_ID = 700 DCGM_FI_LAST_NVSWITCH_FIELD_ID = 860 DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 DCGM_FI_PROF_SM_ACTIVE = 1002 DCGM_FI_PROF_SM_OCCUPANCY = 1003 DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 DCGM_FI_PROF_DRAM_ACTIVE = 1005 DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 DCGM_FI_PROF_PCIE_TX_BYTES = 1009 DCGM_FI_PROF_PCIE_RX_BYTES = 1010 DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 DCGM_FI_MAX_FIELDS = 1013 )
const ( PerfStateMax = 0 PerfStateMin = 15 PerfStateUnknown = 32 )
const ( MAX_NUM_DEVICES uint = C.DCGM_MAX_NUM_DEVICES MAX_HIERARCHY_INFO uint = C.DCGM_MAX_HIERARCHY_INFO )
const ( DbePolicy = policyCondition("Double-bit ECC error") PCIePolicy = policyCondition("PCI error") MaxRtPgPolicy = policyCondition("Max Retired Pages Limit") ThermalPolicy = policyCondition("Thermal Limit") PowerPolicy = policyCondition("Power Limit") NvlinkPolicy = policyCondition("Nvlink Error") XidPolicy = policyCondition("XID Error") )
const (
DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001)
)
Variables ¶
var (
DCGM_FI = map[string]Short{}/* 345 elements not displayed */
)
var (
OLD_DCGM_FI = map[string]Short{
"dcgm_sm_clock": 100,
"dcgm_memory_clock": 101,
"dcgm_memory_temp": 140,
"dcgm_gpu_temp": 150,
"dcgm_power_usage": 155,
"dcgm_total_energy_consumption": 156,
"dcgm_pcie_tx_throughput": 200,
"dcgm_pcie_rx_throughput": 201,
"dcgm_pcie_replay_counter": 202,
"dcgm_gpu_utilization": 203,
"dcgm_mem_copy_utilization": 204,
"dcgm_enc_utilization": 206,
"dcgm_dec_utilization": 207,
"dcgm_xid_errors": 230,
"dcgm_power_violation": 240,
"dcgm_thermal_violation": 241,
"dcgm_sync_boost_violation": 242,
"dcgm_board_limit_violation": 243,
"dcgm_low_util_violation": 244,
"dcgm_reliability_violation": 245,
"dcgm_fb_free": 251,
"dcgm_fb_used": 252,
"dcgm_ecc_sbe_volatile_total": 310,
"dcgm_ecc_dbe_volatile_total": 311,
"dcgm_ecc_sbe_aggregate_total": 312,
"dcgm_ecc_dbe_aggregate_total": 313,
"dcgm_retired_pages_sbe": 390,
"dcgm_retired_pages_dbe": 391,
"dcgm_retired_pages_pending": 392,
"dcgm_nvlink_flit_crc_error_count_total": 409,
"dcgm_nvlink_data_crc_error_count_total": 419,
"dcgm_nvlink_replay_error_count_total": 429,
"dcgm_nvlink_recovery_error_count_total": 439,
"dcgm_nvlink_bandwidth_total": 449,
"dcgm_fi_prof_gr_engine_active": 1001,
"dcgm_fi_prof_sm_active": 1002,
"dcgm_fi_prof_sm_occupancy": 1003,
"dcgm_fi_prof_pipe_tensor_active": 1004,
"dcgm_fi_prof_dram_active": 1005,
"dcgm_fi_prof_pcie_tx_bytes": 1009,
"dcgm_fi_prof_pcie_rx_bytes": 1010,
}
)
Functions ¶
func AddEntityToGroup ¶
func AddEntityToGroup(groupId GroupHandle, entityGroupId Field_Entity_Group, entityId uint) (err error)
func AddToGroup ¶
func AddToGroup(groupId GroupHandle, gpuId uint) (err error)
func DestroyGroup ¶
func DestroyGroup(groupId GroupHandle) (err error)
func FieldGroupDestroy ¶
func FieldGroupDestroy(fieldsGroup FieldHandle) (err error)
func FieldsInit ¶
func FieldsInit() int
func FieldsTerm ¶
func FieldsTerm() int
func FindFirstNonAsciiIndex ¶
func Fv2_Blob ¶
func Fv2_Blob(fv FieldValue_v2) [4096]byte
func Fv2_Float64 ¶
func Fv2_Float64(fv FieldValue_v2) float64
func Fv2_Int64 ¶
func Fv2_Int64(fv FieldValue_v2) int64
func Fv2_String ¶
func Fv2_String(fv FieldValue_v2) string
func GetAllDeviceCount ¶
GetAllDeviceCount counts all GPUs on the system
func GetSupportedDevices ¶
GetSupportedDevices returns only DCGM supported GPUs
func Init ¶
Init starts DCGM based on the user-selected mode. DCGM can be started in 3 different modes: 1. Embedded: Start hostengine within this process 2. Standalone: Connect to an already running nv-hostengine at the specified address. The connection address can be passed as command line args: -connect "IP:PORT/Socket" -socket "isSocket" 3. StartHostengine: Open a Unix socket to start and connect to the nv-hostengine and terminate before exiting
func IsInt32Blank ¶
func IsInt64Blank ¶
func Policy ¶
func Policy(gpuId uint, typ ...policyCondition) (<-chan PolicyViolation, error)
Policy sets GPU usage and error policies and notifies in case of any violations via callback functions
func UpdateAllFields ¶
func UpdateAllFields() error
func ViolationRegistration ¶
ViolationRegistration is a Go callback function for dcgmPolicyRegister() wrapped in C.violationNotify()
func WatchFieldsWithGroup ¶
func WatchFieldsWithGroup(fieldsGroup FieldHandle, group GroupHandle) error
func WatchFieldsWithGroupEx ¶
func WatchFieldsWithGroupEx(fieldsGroup FieldHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error
Types ¶
type DcgmStatus ¶
func Introspect ¶
func Introspect() (DcgmStatus, error)
Introspect returns DCGM hostengine memory and CPU usage
type Device ¶
type Device struct { GPU uint DCGMSupported string UUID string Power uint // W PCI PCIInfo Identifiers DeviceIdentifiers Topology []P2PLink CPUAffinity string }
func GetDeviceInfo ¶
GetDeviceInfo describes the given device
type DeviceHealth ¶
type DeviceHealth struct { GPU uint Status string Watches []SystemWatch }
func HealthCheckByGpuId ¶
func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error)
HealthCheckByGpuId monitors GPU health for any errors/failures/warnings
type DeviceIdentifiers ¶
type DeviceStatus ¶
type DeviceStatus struct { Power float64 // W Temperature int64 // °C Utilization UtilizationInfo Memory MemoryInfo Clocks ClockInfo PCI PCIStatusInfo Performance PerfState FanSpeed int64 // % }
func GetDeviceStatus ¶
func GetDeviceStatus(gpuId uint) (DeviceStatus, error)
GetDeviceStatus monitors GPU status including its power, memory and GPU utilization
type ECCErrorsInfo ¶
type FieldHandle ¶
type FieldHandle struct {
// contains filtered or unexported fields
}
func FieldGroupCreate ¶
func FieldGroupCreate(fieldsGroupName string, fields []Short) (fieldsId FieldHandle, err error)
type FieldMeta ¶
type FieldMeta struct { FieldId Short FieldType byte Size byte Tag string Scope int NvmlFieldId int EntityLevel Field_Entity_Group }
func FieldGetById ¶
func ToFieldMeta ¶
func ToFieldMeta(fieldInfo C.dcgm_field_meta_p) FieldMeta
type FieldValue_v1 ¶
type FieldValue_v1 struct { Version uint FieldId uint FieldType uint Status int Ts int64 Value [4096]byte }
func EntityGetLatestValues ¶
func EntityGetLatestValues(entityGroup Field_Entity_Group, entityId uint, fields []Short) ([]FieldValue_v1, error)
func GetLatestValuesForFields ¶
func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error)
func (FieldValue_v1) Blob ¶
func (fv FieldValue_v1) Blob() [4096]byte
func (FieldValue_v1) Float64 ¶
func (fv FieldValue_v1) Float64() float64
func (FieldValue_v1) Int64 ¶
func (fv FieldValue_v1) Int64() int64
func (FieldValue_v1) String ¶
func (fv FieldValue_v1) String() string
type FieldValue_v2 ¶
type FieldValue_v2 struct { Version uint EntityGroupId Field_Entity_Group EntityId uint FieldId uint FieldType uint Status int Ts int64 Value [4096]byte StringValue *string }
func EntitiesGetLatestValues ¶
func EntitiesGetLatestValues(entities []GroupEntityPair, fields []Short, flags uint) ([]FieldValue_v2, error)
type Field_Entity_Group ¶
type Field_Entity_Group uint
const ( FE_NONE Field_Entity_Group = iota FE_GPU FE_VGPU FE_SWITCH FE_GPU_I FE_GPU_CI FE_COUNT )
type GroupEntityPair ¶
type GroupEntityPair struct { EntityGroupId Field_Entity_Group EntityId uint }
type GroupHandle ¶
type GroupHandle struct {
// contains filtered or unexported fields
}
func CreateGroup ¶
func CreateGroup(groupName string) (goGroupId GroupHandle, err error)
func NewDefaultGroup ¶
func NewDefaultGroup(groupName string) (GroupHandle, error)
func WatchFields ¶
func WatchFields(gpuId uint, fieldsGroup FieldHandle, groupName string) (groupId GroupHandle, err error)
func WatchPidFields ¶
func WatchPidFields() (GroupHandle, error)
WatchPidFields lets DCGM start recording stats for GPU processes. It needs to be called before calling GetProcessInfo.
func WatchPidFieldsEx ¶
func WatchPidFieldsEx(updateFreq, maxKeepAge time.Duration, maxKeepSamples int, gpus ...uint) (GroupHandle, error)
WatchPidFieldsEx is the same as WatchPidFields, but allows for modifying the update frequency, max samples, max sample age, and the GPUs on which to enable watches.
type MemoryInfo ¶
type MemoryInfo struct { GlobalUsed int64 ECCErrors ECCErrorsInfo }
type MetricGroup ¶
type MetricGroup struct {
// contains filtered or unexported fields
}
func GetSupportedMetricGroups ¶
func GetSupportedMetricGroups(grpid uint) ([]MetricGroup, error)
GetSupportedMetricGroups gets all of the profiling metric groups for a given GPU group.
type MigEntityInfo ¶
type MigHierarchyInfo_v2 ¶
type MigHierarchyInfo_v2 struct { Entity GroupEntityPair Parent GroupEntityPair Info MigEntityInfo }
type MigHierarchy_v2 ¶
type MigHierarchy_v2 struct { Version uint Count uint EntityList [C.DCGM_MAX_HIERARCHY_INFO]MigHierarchyInfo_v2 }
func GetGpuInstanceHierarchy ¶
func GetGpuInstanceHierarchy() (hierarchy MigHierarchy_v2, err error)
type P2PLink ¶
type P2PLink struct { GPU uint BusID string Link P2PLinkType }
func GetDeviceTopology ¶
GetDeviceTopology returns device topology corresponding to the gpuId
type P2PLinkType ¶
type P2PLinkType uint
const ( P2PLinkUnknown P2PLinkType = iota P2PLinkCrossCPU P2PLinkSameCPU P2PLinkHostBridge P2PLinkMultiSwitch P2PLinkSingleSwitch P2PLinkSameBoard SingleNVLINKLink TwoNVLINKLinks ThreeNVLINKLinks FourNVLINKLinks )
func (P2PLinkType) PCIPaths ¶
func (l P2PLinkType) PCIPaths() string
type PCIStatusInfo ¶
type PCIStatusInfo struct { BAR1Used int64 // MB Throughput PCIThroughputInfo FBUsed int64 }
type PCIThroughputInfo ¶
type PolicyViolation ¶
type ProcessInfo ¶
type ProcessInfo struct { GPU uint PID uint Name string ProcessUtilization ProcessUtilInfo PCI PCIStatusInfo Memory MemoryInfo GpuUtilization UtilizationInfo Clocks ClockInfo Violations ViolationTime XIDErrors XIDErrorInfo }
func GetProcessInfo ¶
func GetProcessInfo(group GroupHandle, pid uint) ([]ProcessInfo, error)
GetProcessInfo provides detailed per-GPU stats for this process
type ProcessUtilInfo ¶
type SystemWatch ¶
type UtilizationInfo ¶
type ViolationTime ¶
type ViolationTime struct { Power *uint64 Thermal *uint64 Reliability *uint64 BoardLimit *uint64 LowUtilization *uint64 SyncBoost *uint64 }
ViolationTime measures the amount of time (in ms) the GPU was at reduced clocks