Documentation
¶
Index ¶
- Constants
- Variables
- func CalculateBPW(modelID string, memory float64, context int, kvCacheQuant KVCacheQuantisation, ...) (interface{}, error)
- func CalculateContext(modelID string, memory, bpw float64, kvCacheQuant KVCacheQuantisation, ...) (int, error)
- func CalculateVRAM(modelID string, bpw float64, context int, kvCacheQuant KVCacheQuantisation, ...) (float64, error)
- func CalculateVRAMRaw(config ModelConfig, bpwValues BPWValues, context int, numGPUs int, gqa bool) float64
- func DownloadFile(url, filePath string, headers map[string]string) error
- func EstimateVRAM(modelIdentifier, apiURL string, fitsVRAM float64) error
- func GetAvailableMemory() (float64, error)
- func GetHuggingFaceToken() string
- func GetSystemRAM() (float64, error)
- func ParseBPW(bpw string) float64
- func ParseBPWOrQuant(input string) (float64, error)
- func ParseModelIdentifier(modelID string) (string, string, error)
- func PrintFormattedTable(table QuantResultTable) string
- type BPWValues
- type ContextVRAM
- type KVCacheQuantisation
- type ModelConfig
- type OllamaModelInfo
- type QuantResult
- type QuantResultTable
Constants ¶
const (
CUDASize = 500 * 1024 * 1024 // 500 MiB
)
Variables ¶
var EXL2Options []float64
EXL2Options contains the EXL2 quantisation options
var GGUFMapping = map[string]float64{
"F16": 16,
"Q8_0": 8.5,
"Q6_K": 6.59,
"Q5_K_L": 5.75,
"Q5_K_M": 5.69,
"Q5_K_S": 5.54,
"Q5_0": 5.54,
"Q4_K_L": 4.9,
"Q4_K_M": 4.85,
"Q4_K_S": 4.58,
"Q4_0": 4.55,
"IQ4_NL": 4.5,
"Q3_K_L": 4.27,
"IQ4_XS": 4.25,
"Q3_K_M": 3.91,
"IQ3_M": 3.7,
"IQ3_S": 3.5,
"Q3_K_S": 3.5,
"Q2_K": 3.35,
"IQ3_XS": 3.3,
"IQ3_XXS": 3.06,
"IQ2_M": 2.7,
"IQ2_S": 2.5,
"IQ2_XS": 2.31,
"IQ2_XXS": 2.06,
"IQ1_S": 1.56,
"Q2": 3.35,
"Q3": 3.5,
"Q4": 4.55,
"Q5": 5.54,
"Q6": 6.59,
"Q8": 8.5,
"FP16": 16,
}
GGUFMapping maps GGUF quantisation types to their corresponding bits per weight
Functions ¶
func CalculateBPW ¶
func CalculateBPW(modelID string, memory float64, context int, kvCacheQuant KVCacheQuantisation, quantType string, ollamaModelInfo *OllamaModelInfo) (interface{}, error)
CalculateBPW calculates the best BPW for a given memory and context constraint
func CalculateContext ¶
func CalculateContext(modelID string, memory, bpw float64, kvCacheQuant KVCacheQuantisation, ollamaModelInfo *OllamaModelInfo, topContext int) (int, error)
CalculateContext calculates the maximum context for a given memory constraint
func CalculateVRAM ¶
func CalculateVRAM(modelID string, bpw float64, context int, kvCacheQuant KVCacheQuantisation, ollamaModelInfo *OllamaModelInfo) (float64, error)
CalculateVRAM calculates the VRAM usage for a given model and configuration
func CalculateVRAMRaw ¶
func CalculateVRAMRaw(config ModelConfig, bpwValues BPWValues, context int, numGPUs int, gqa bool) float64
CalculateVRAMRaw calculates the raw VRAM usage
func DownloadFile ¶
DownloadFile downloads a file from a URL and saves it to the specified path
func EstimateVRAM ¶ added in v1.26.0
func GetAvailableMemory ¶ added in v1.24.0
func GetHuggingFaceToken ¶ added in v1.27.9
func GetHuggingFaceToken() string
func GetSystemRAM ¶ added in v1.24.0
func ParseBPWOrQuant ¶
ParseBPWOrQuant takes a string and returns a float64 BPW value
func ParseModelIdentifier ¶ added in v1.27.19
ParseModelIdentifier parses a model identifier into its base name and quantisation level. Handles both HuggingFace (contains "/") and Ollama (contains ":" or neither) formats.
func PrintFormattedTable ¶ added in v1.24.0
func PrintFormattedTable(table QuantResultTable) string
PrintFormattedTable renders the given QuantResultTable as a formatted table and returns it as a string
Types ¶
type BPWValues ¶
BPWValues represents the bits per weight values for different components
func GetBPWValues ¶
func GetBPWValues(bpw float64, kvCacheQuant KVCacheQuantisation) BPWValues
GetBPWValues calculates the BPW values based on the input
type ContextVRAM ¶ added in v1.24.0
type KVCacheQuantisation ¶
type KVCacheQuantisation string
KVCacheQuantisation represents the quantisation type for the k/v context cache
const ( KVCacheFP16 KVCacheQuantisation = "fp16" KVCacheQ8_0 KVCacheQuantisation = "q8_0" KVCacheQ4_0 KVCacheQuantisation = "q4_0" )
type ModelConfig ¶
type ModelConfig struct { NumParams float64 `json:"num_params"` MaxPositionEmbeddings int `json:"max_position_embeddings"` NumHiddenLayers int `json:"num_hidden_layers"` HiddenSize int `json:"hidden_size"` NumKeyValueHeads int `json:"num_key_value_heads"` NumAttentionHeads int `json:"num_attention_heads"` IntermediateSize int `json:"intermediate_size"` VocabSize int `json:"vocab_size"` }
ModelConfig represents the configuration of a model
func GetModelConfig ¶
func GetModelConfig(modelID string) (ModelConfig, error)
GetModelConfig retrieves and parses the model configuration
type OllamaModelInfo ¶ added in v1.26.0
type OllamaModelInfo struct { Details struct { ParameterSize string `json:"parameter_size"` QuantizationLevel string `json:"quantisation_level"` Family string `json:"family"` Families []string `json:"families"` } `json:"details"` ModelInfo map[string]interface{} `json:"model_info"` }
func FetchOllamaModelInfo ¶ added in v1.26.0
func FetchOllamaModelInfo(apiURL, modelName string) (*OllamaModelInfo, error)
type QuantResult ¶ added in v1.24.0
type QuantResult struct { QuantType string BPW float64 Contexts map[int]ContextVRAM }
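QuantResult holds the result for a single quantisation type: its name, its BPW, and a map of ContextVRAM values keyed by int (presumably the context size)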
type QuantResultTable ¶ added in v1.24.0
type QuantResultTable struct { ModelID string Results []QuantResult FitsVRAM float64 }
QuantResultTable represents a table of VRAM estimation results
func GenerateQuantTable ¶ added in v1.24.0
func GenerateQuantTable(modelID string, fitsVRAM float64, ollamaModelInfo *OllamaModelInfo, topContext int) (QuantResultTable, error)