Documentation
¶
Overview ¶
Package collector implements different collectors of the exporter
Index ¶
- Constants
- Variables
- func DisableDefaultCollectors()
- func IsNoDataError(err error) bool
- func KernelStringToNumeric(ver string) int64
- func KernelVersion() (int64, error)
- func NewCgroupCollector(logger *slog.Logger, cgManager *cgroupManager, opts cgroupOpts) (*cgroupCollector, error)
- func NewCgroupManager(name string, logger *slog.Logger) (*cgroupManager, error)
- func NewEbpfCollector(logger *slog.Logger, cgManager *cgroupManager) (*ebpfCollector, error)
- func NewPerfCollector(logger *slog.Logger, cgManager *cgroupManager) (*perfCollector, error)
- func NewRDMACollector(logger *slog.Logger, cgManager *cgroupManager) (*rdmaCollector, error)
- func RegisterCollector(collector string, isDefaultEnabled bool, ...)
- func SanitizeMetricName(metricName string) string
- func TargetsHandlerFor(discoverer *CEEMSAlloyTargetDiscoverer, opts promhttp.HandlerOpts) http.Handler
- type Address
- type BusID
- type CEEMSAlloyTargetDiscoverer
- type CEEMSCollector
- type CEEMSExporter
- type CEEMSExporterServer
- type Collector
- func NewCPUCollector(logger *slog.Logger) (Collector, error)
- func NewCrayPMCCollector(logger *slog.Logger) (Collector, error)
- func NewEmissionsCollector(logger *slog.Logger) (Collector, error)
- func NewHwmonCollector(logger *slog.Logger) (Collector, error)
- func NewIPMICollector(logger *slog.Logger) (Collector, error)
- func NewInfiniBandCollector(logger *slog.Logger) (Collector, error)
- func NewLibvirtCollector(logger *slog.Logger) (Collector, error)
- func NewMeminfoCollector(logger *slog.Logger) (Collector, error)
- func NewNetdevCollector(logger *slog.Logger) (Collector, error)
- func NewRaplCollector(logger *slog.Logger) (Collector, error)
- func NewRedfishCollector(logger *slog.Logger) (Collector, error)
- func NewSlurmCollector(logger *slog.Logger) (Collector, error)
- type Config
- type Device
- type DeviceAttrs
- type DeviceAttrsShared
- type Devices
- type Domain
- type GPU
- type HostDev
- type Ksyms
- type MIGDevice
- type MIGDevices
- type MIGInstance
- type MIGMode
- type Memory
- type NVIDIASMILog
- type PMCDomain
- type Source
- type Target
- type VirtMode
- type WebConfig
Constants ¶
const CEEMSExporterAppName = "ceems_exporter"
CEEMSExporterAppName is kingpin app name.
const Namespace = "ceems"
Namespace defines the common namespace to be used by all metrics.
Variables ¶
var CEEMSExporterApp = *kingpin.New(
CEEMSExporterAppName,
"Prometheus Exporter to export compute (job, VM, pod) resource usage metrics.",
)
CEEMSExporterApp is kingpin CLI app.
var (
ErrIPMIUnavailable = errors.New("IPMI Power readings not Active")
)
Custom errors.
var ErrNoData = errors.New("collector returned no data")
ErrNoData indicates the collector found no data to collect, but had no other error.
Functions ¶
func DisableDefaultCollectors ¶
func DisableDefaultCollectors()
DisableDefaultCollectors sets the collector state to false for all collectors which have not been explicitly enabled on the command line.
func IsNoDataError ¶
func IsNoDataError(err error) bool
IsNoDataError returns true if error is ErrNoData.
func KernelStringToNumeric ¶ added in v0.3.1
func KernelStringToNumeric(ver string) int64
KernelStringToNumeric converts the kernel version string into a numerical value that can be used for comparisons.
func KernelVersion ¶ added in v0.3.1
func KernelVersion() (int64, error)
KernelVersion returns kernel version of current host.
func NewCgroupCollector ¶ added in v0.3.1
func NewCgroupCollector(logger *slog.Logger, cgManager *cgroupManager, opts cgroupOpts) (*cgroupCollector, error)
NewCgroupCollector returns a new cgroupCollector exposing a summary of cgroups.
func NewCgroupManager ¶ added in v0.3.1
func NewCgroupManager(name string, logger *slog.Logger) (*cgroupManager, error)
NewCgroupManager returns an instance of cgroupManager based on resource manager.
func NewEbpfCollector ¶ added in v0.3.1
func NewEbpfCollector(logger *slog.Logger, cgManager *cgroupManager) (*ebpfCollector, error)
NewEbpfCollector returns a new instance of ebpf collector.
func NewPerfCollector ¶ added in v0.3.0
func NewPerfCollector(logger *slog.Logger, cgManager *cgroupManager) (*perfCollector, error)
NewPerfCollector returns a new perf-based collector. It creates a profiler per compute unit.
func NewRDMACollector ¶ added in v0.4.0
func NewRDMACollector(logger *slog.Logger, cgManager *cgroupManager) (*rdmaCollector, error)
NewRDMACollector returns a new Collector exposing RDMA metrics.
func RegisterCollector ¶
func RegisterCollector(
collector string,
isDefaultEnabled bool,
factory func(logger *slog.Logger) (Collector, error),
)
RegisterCollector registers collector into collector factory.
func SanitizeMetricName ¶
func SanitizeMetricName(metricName string) string
SanitizeMetricName sanitizes the given metric name by replacing invalid characters with underscores.
OpenMetrics and the Prometheus exposition format require the metric name to consist only of alphanumericals and "_", ":" and they must not start with digits. Since colons in MetricFamily are reserved to signal that the MetricFamily is the result of a calculation or aggregation of a general purpose monitoring system, colons will be replaced as well.
Note: If not subsequently prepending a namespace and/or subsystem (e.g., with prometheus.BuildFQName), the caller must ensure that the supplied metricName does not begin with a digit.
func TargetsHandlerFor ¶ added in v0.4.0
func TargetsHandlerFor(discoverer *CEEMSAlloyTargetDiscoverer, opts promhttp.HandlerOpts) http.Handler
TargetsHandlerFor returns http.Handler for Alloy targets.
Types ¶
type Address ¶ added in v0.4.0
type Address struct {
XMLName xml.Name `xml:"address"`
UUID string `xml:"uuid,attr"`
Type string `xml:"type,attr"`
Domain string `xml:"domain,attr"`
Bus string `xml:"bus,attr"`
Slot string `xml:"slot,attr"`
Function string `xml:"function,attr"`
}
type BusID ¶ added in v0.4.0
type BusID struct {
// contains filtered or unexported fields
}
BusID is a struct that contains PCI bus address of GPU device.
type CEEMSAlloyTargetDiscoverer ¶ added in v0.4.0
type CEEMSAlloyTargetDiscoverer struct {
// contains filtered or unexported fields
}
func NewAlloyTargetDiscoverer ¶ added in v0.4.0
func NewAlloyTargetDiscoverer(logger *slog.Logger) (*CEEMSAlloyTargetDiscoverer, error)
NewAlloyTargetDiscoverer returns a new HTTP alloy discoverer.
type CEEMSCollector ¶
type CEEMSCollector struct {
Collectors map[string]Collector
// contains filtered or unexported fields
}
CEEMSCollector implements the prometheus.Collector interface.
func NewCEEMSCollector ¶
func NewCEEMSCollector(logger *slog.Logger) (*CEEMSCollector, error)
NewCEEMSCollector creates a new CEEMSCollector.
func (CEEMSCollector) Close ¶ added in v0.3.0
func (n CEEMSCollector) Close(ctx context.Context) error
Close stops all the collectors and releases system resources.
type CEEMSExporter ¶
type CEEMSExporter struct {
App kingpin.Application
// contains filtered or unexported fields
}
CEEMSExporter represents the `ceems_exporter` cli.
func NewCEEMSExporter ¶
func NewCEEMSExporter() (*CEEMSExporter, error)
NewCEEMSExporter returns a new CEEMSExporter instance.
type CEEMSExporterServer ¶ added in v0.3.0
type CEEMSExporterServer struct {
// contains filtered or unexported fields
}
CEEMSExporterServer struct implements HTTP server for exporter.
func NewCEEMSExporterServer ¶ added in v0.3.0
func NewCEEMSExporterServer(c *Config) (*CEEMSExporterServer, error)
NewCEEMSExporterServer creates new CEEMSExporterServer struct instance.
type Collector ¶
type Collector interface {
// Get new metrics and expose them via prometheus registry.
Update(ch chan<- prometheus.Metric) error
// Stops each collector and cleans up system resources
Stop(ctx context.Context) error
}
Collector is the interface a collector has to implement.
func NewCPUCollector ¶
func NewCPUCollector(logger *slog.Logger) (Collector, error)
NewCPUCollector returns a new Collector exposing kernel/system statistics.
func NewCrayPMCCollector ¶ added in v0.5.1
func NewCrayPMCCollector(logger *slog.Logger) (Collector, error)
NewCrayPMCCollector returns a new Collector exposing Cray's `pm_counters` metrics.
func NewEmissionsCollector ¶
func NewEmissionsCollector(logger *slog.Logger) (Collector, error)
NewEmissionsCollector returns a new Collector exposing emission factor metrics.
func NewHwmonCollector ¶ added in v0.7.0
func NewHwmonCollector(logger *slog.Logger) (Collector, error)
NewHwmonCollector returns a new Collector exposing /sys/class/hwmon stats (similar to lm-sensors).
func NewIPMICollector ¶
func NewIPMICollector(logger *slog.Logger) (Collector, error)
NewIPMICollector returns a new Collector exposing IPMI DCMI power metrics.
func NewInfiniBandCollector ¶ added in v0.7.0
func NewInfiniBandCollector(logger *slog.Logger) (Collector, error)
NewInfiniBandCollector returns a new Collector exposing InfiniBand stats.
func NewLibvirtCollector ¶ added in v0.4.0
func NewLibvirtCollector(logger *slog.Logger) (Collector, error)
NewLibvirtCollector returns a new libvirt collector exposing a summary of cgroups.
func NewMeminfoCollector ¶
func NewMeminfoCollector(logger *slog.Logger) (Collector, error)
NewMeminfoCollector returns a new Collector exposing memory stats.
func NewNetdevCollector ¶ added in v0.7.0
func NewNetdevCollector(logger *slog.Logger) (Collector, error)
NewNetdevCollector returns a new Collector exposing node network stats.
func NewRaplCollector ¶
func NewRaplCollector(logger *slog.Logger) (Collector, error)
NewRaplCollector returns a new Collector exposing RAPL metrics.
func NewRedfishCollector ¶ added in v0.5.0
func NewRedfishCollector(logger *slog.Logger) (Collector, error)
NewRedfishCollector returns a new Collector to fetch power usage from redfish API.
func NewSlurmCollector ¶
func NewSlurmCollector(logger *slog.Logger) (Collector, error)
NewSlurmCollector returns a new Collector exposing a summary of cgroups.
type Config ¶ added in v0.3.0
type Config struct {
Logger *slog.Logger
Collector *CEEMSCollector
Discoverer *CEEMSAlloyTargetDiscoverer
Web WebConfig
}
Config makes a server config.
type Device ¶
type Device struct {
// contains filtered or unexported fields
}
Device contains the details of GPU devices.
func GetAMDGPUDevices ¶
func GetAMDGPUDevices(logger *slog.Logger) ([]Device, error)
GetAMDGPUDevices returns all GPU devices using the rocm-smi command. Example output:
bash-4.4$ rocm-smi --showproductname --showserial --showbus --csv
device,Serial Number,Card series,Card model,Card vendor,Card SKU
card0,20170000800c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. AMD/ATI,D16317
card1,20170003580c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. AMD/ATI,D16317
card2,20180003050c,0000:C5:00.0,deon Instinct MI50 32GB,0x0834,Advanced Micro Devices Inc. AMD/ATI,D16317
func GetGPUDevices ¶
func GetGPUDevices(gpuType string, logger *slog.Logger) ([]Device, error)
GetGPUDevices returns GPU devices.
func GetNvidiaGPUDevices ¶
func GetNvidiaGPUDevices(logger *slog.Logger) ([]Device, error)
GetNvidiaGPUDevices returns all physical or MIG devices using the nvidia-smi command. Example output:
bash-4.4$ nvidia-smi --query-gpu=name,uuid --format=csv
name, uuid
Tesla V100-SXM2-32GB, GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e
Tesla V100-SXM2-32GB, GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3
Here we are using nvidia-smi to avoid the build issues that would arise from using the NVML Go bindings. This way we don't have dependencies on NVIDIA libraries and keep the exporter simple.
NOTE: This command does not return MIG devices.
func (*Device) CompareBusID ¶ added in v0.4.0
func (d *Device) CompareBusID(id string) bool
CompareBusID compares the provided bus ID with device bus ID and returns true if they match and false in all other cases.
type DeviceAttrs ¶ added in v0.4.0
type DeviceAttrs struct {
XMLName xml.Name `xml:"device_attributes"`
Shared DeviceAttrsShared `xml:"shared"`
}
type DeviceAttrsShared ¶ added in v0.4.0
type DeviceAttrsShared struct {
XMLName xml.Name `xml:"shared"`
SMCount uint64 `xml:"multiprocessor_count"`
CECount uint64 `xml:"copy_engine_count"`
EncCount uint64 `xml:"encoder_count"`
DecCount uint64 `xml:"decoder_count"`
}
type Domain ¶ added in v0.4.0
type Domain struct {
Devices Devices `xml:"devices"`
Name string `xml:"name"`
UUID string `xml:"uuid"`
}
Domain is the top level XML field for libvirt XML schema.
type GPU ¶ added in v0.4.0
type GPU struct {
XMLName xml.Name `xml:"gpu"`
ID string `xml:"id,attr"`
ProductName string `xml:"product_name"`
ProductBrand string `xml:"product_brand"`
ProductArch string `xml:"product_architecture"`
MIGMode MIGMode `xml:"mig_mode"`
VirtMode VirtMode `xml:"gpu_virtualization_mode"`
MIGDevices MIGDevices `xml:"mig_devices"`
UUID string `xml:"uuid"`
MinorNumber string `xml:"minor_number"`
}
type HostDev ¶ added in v0.4.0
type HostDev struct {
XMLName xml.Name `xml:"hostdev"`
Mode string `xml:"mode,attr"`
Type string `xml:"type,attr"`
Managed string `xml:"managed,attr"`
Model string `xml:"model,attr"`
Display string `xml:"display,attr"`
Source Source `xml:"source"`
Address Address `xml:"address"`
}
type Ksyms ¶ added in v0.3.1
type Ksyms struct {
// contains filtered or unexported fields
}
Ksyms is a structure for kernel symbols.
func NewKsyms ¶ added in v0.3.1
func NewKsyms() (*Ksyms, error)
NewKsyms creates a new Ksyms structure (by reading procfs/kallsyms).
func (*Ksyms) GetArchSpecificName ¶ added in v0.3.1
func (k *Ksyms) GetArchSpecificName(name string) (string, error)
GetArchSpecificName returns architecture specific symbol (if exists) of a given kernel symbol.
func (*Ksyms) IsAvailable ¶ added in v0.3.1
func (k *Ksyms) IsAvailable(name string) bool
IsAvailable returns true if the given name is available on current kernel.
type MIGDevice ¶ added in v0.4.0
type MIGDevice struct {
XMLName xml.Name `xml:"mig_device"`
Index uint64 `xml:"index"`
GPUInstID uint64 `xml:"gpu_instance_id"`
ComputeInstID uint64 `xml:"compute_instance_id"`
DeviceAttrs DeviceAttrs `xml:"device_attributes"`
FBMemory Memory `xml:"fb_memory_usage"`
Bar1Memory Memory `xml:"bar1_memory_usage"`
}
type MIGDevices ¶ added in v0.4.0
type MIGDevices struct {
XMLName xml.Name `xml:"mig_devices"`
Devices []MIGDevice `xml:"mig_device"`
}
type MIGInstance ¶ added in v0.4.0
type MIGInstance struct {
// contains filtered or unexported fields
}
type MIGMode ¶ added in v0.4.0
type MIGMode struct {
XMLName xml.Name `xml:"mig_mode"`
CurrentMIG string `xml:"current_mig"`
}
type NVIDIASMILog ¶ added in v0.4.0
type NVIDIASMILog struct {
XMLName xml.Name `xml:"nvidia_smi_log"`
GPUs []GPU `xml:"gpu"`
}
type PMCDomain ¶ added in v0.5.1
type PMCDomain struct {
Name string // name of PM counter domain zone from filename
Path string // filesystem path of PM counters
}
PMCDomain stores the information for one Cray PM counter domain.
func GetCrayPMCDomains ¶ added in v0.5.1
func GetCrayPMCDomains(fs sysfs.FS) ([]PMCDomain, error)
GetCrayPMCDomains returns a slice of Cray's `pm_counters` domains. See https://cray-hpe.github.io/docs-csm/en-10/operations/power_management/user_access_to_compute_node_power_data/
func (PMCDomain) GetEnergyJoules ¶ added in v0.5.1
func (pd PMCDomain) GetEnergyJoules() (uint64, error)
GetEnergyJoules returns the current joule value from the domain counter.
func (PMCDomain) GetPowerLimitWatts ¶ added in v0.5.1
func (pd PMCDomain) GetPowerLimitWatts() (uint64, error)
GetPowerLimitWatts returns the current power limit watt value from the domain counter.
func (PMCDomain) GetPowerWatts ¶ added in v0.5.1
func (pd PMCDomain) GetPowerWatts() (uint64, error)
GetPowerWatts returns the current watt value from the domain counter.
func (PMCDomain) GetTempCelsius ¶ added in v0.5.1
func (pd PMCDomain) GetTempCelsius() (uint64, error)
GetTempCelsius returns the current node temperature, in degrees Celsius, from the domain counter.
type Source ¶ added in v0.4.0
type Source struct {
XMLName xml.Name `xml:"source"`
Address Address `xml:"address"`
}
type Target ¶ added in v0.4.0
type Target struct {
Targets []string `json:"targets"`
Labels map[string]string `json:"labels"`
}
type VirtMode ¶ added in v0.4.0
type VirtMode struct {
XMLName xml.Name `xml:"gpu_virtualization_mode"`
Mode string `xml:"virtualization_mode"`
HostMode string `xml:"host_vgpu_mode"`
}
type WebConfig ¶ added in v0.3.0
type WebConfig struct {
Addresses []string
WebSystemdSocket bool
WebConfigFile string
MetricsPath string
TargetsPath string
MaxRequests int
IncludeExporterMetrics bool
EnableDebugServer bool
LandingConfig *web.LandingConfig
}
WebConfig makes HTTP web config from CLI args.