first commit

This commit is contained in:
2025-12-16 17:56:13 +11:00
commit 2da0e4f030
70 changed files with 11317 additions and 0 deletions

312
values.yaml Normal file
View File

@@ -0,0 +1,312 @@
# NFD related configs
# schema reference: https://github.com/kubernetes-sigs/node-feature-discovery/blob/release-0.16/deployment/helm/node-feature-discovery/values.yaml
node-feature-discovery:
  # -- Set to true/false to enable/disable the installation of node feature discovery (NFD) operator
  enabled: true
  worker:
    # -- Set tolerations for NFD worker daemonset
    tolerations:
      - key: "amd-dcm"
        operator: "Equal"
        value: "up"
        effect: "NoExecute"
    # -- Set nodeSelector for NFD worker daemonset
    nodeSelector: {}
# KMM related configs
kmm:
  # -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator
  enabled: true
# -- Install the default NFD rule, which detects AMD GPUs based on the PCI vendor ID
installdefaultNFDRule: true
# -- Patch the CRDs in a pre-upgrade/pre-rollback hook when running helm upgrade/rollback to the current helm chart
upgradeCRD: true
# Install/upgrade toggles for the default DeviceConfig custom resource
crds:
  defaultCR:
    # -- Deploy default DeviceConfig during helm chart installation
    install: true
    # -- Deploy / Patch default DeviceConfig during helm chart upgrade. Be careful about this option: 1. Your customized change on default DeviceConfig may be overwritten 2. Your existing DeviceConfig may conflict with upgraded default DeviceConfig
    upgrade: false
# Spec of the default DeviceConfig (its install/upgrade is toggled by crds.defaultCR)
deviceConfig:
  spec:
    # -- Set node selector for the default DeviceConfig
    selector:
      feature.node.kubernetes.io/amd-gpu: "true"
    driver:
      # -- enable/disable out-of-tree driver management, set to false to use inbox driver
      enable: false
      # -- enable/disable putting a blacklist amdgpu entry in modprobe config, which requires node labeller to run
      blacklist: false
      # -- image repository to store out-of-tree driver image, DO NOT put an image tag since the operator automatically manages it for users
      image: "docker.io/myUserName/driverImage"
      # -- image pull secret for pull/push access of the driver image repository, input secret name like {"name": "mysecret"}
      imageRegistrySecret: {}
      imageRegistryTLS:
        # -- set to true to use plain HTTP for driver image repository
        insecure: false
        # -- set to true to skip TLS validation for driver image repository
        insecureSkipTLSVerify: false
      # -- specify an out-of-tree driver version to install
      version: "6.4"
      # -- specify the secrets to sign the out-of-tree kernel module inside driver image for secure boot, e.g. input private / public key secret {"keySecret":{"name":"privateKeySecret"},"certSecret":{"name":"publicKeySecret"}}
      imageSign: {}
      # -- configure the out-of-tree driver image build within the cluster. e.g. {"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"insecure":"false","insecureSkipTLSVerify":"false"}}
      imageBuild: {}
      # -- configure driver tolerations so that operator can manage out-of-tree drivers on tainted nodes
      tolerations: []
      upgradePolicy:
        # -- enable/disable automatic driver upgrade feature
        enable: true
        # -- how many nodes can be upgraded in parallel
        maxParallelUpgrades: 3
        # -- maximum number of nodes that can be in a failed upgrade state beyond which upgrades will stop to keep cluster at a minimal healthy state
        maxUnavailableNodes: "25%"
        # -- whether to reboot each worker node during the driver upgrade
        rebootRequired: true
        nodeDrainPolicy:
          # -- whether force draining is allowed or not
          force: true
          # -- the length of time in seconds to wait before giving up drain, zero means infinite
          timeoutSeconds: 300
          # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, a negative value means follow the pod-defined grace period
          gracePeriodSeconds: -1
        podDeletionPolicy:
          # -- whether force deletion is allowed or not
          force: true
          # -- the length of time in seconds to wait before giving up on pod deletion, zero means infinite
          timeoutSeconds: 300
          # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, a negative value means follow the pod-defined grace period
          gracePeriodSeconds: -1
    commonConfig:
      # -- init container image
      initContainerImage: busybox:1.36
      utilsContainer:
        # -- gpu operator utility container image
        image: docker.io/rocm/gpu-operator-utils:v1.4.0
        # -- utility container image pull policy
        imagePullPolicy: IfNotPresent
        # -- utility container image pull secret, e.g. {"name": "mySecretName"}
        imageRegistrySecret: {}
    devicePlugin:
      # NOTE(review): this tag is the mutable "latest" while sibling images pin v1.4.0; with IfNotPresent a cached image is not re-pulled - consider pinning.
      # -- device plugin image
      devicePluginImage: rocm/k8s-device-plugin:latest
      # -- device plugin image pull policy
      devicePluginImagePullPolicy: IfNotPresent
      # -- device plugin tolerations
      devicePluginTolerations: []
      # -- pass supported flags and their values while starting device plugin daemonset, e.g. {"resource_naming_strategy": "single"} or {"resource_naming_strategy": "mixed"}
      devicePluginArguments: {}
      # -- enable / disable node labeller
      enableNodeLabeller: true
      # NOTE(review): "labeller-latest" is also a mutable tag - consider pinning alongside the device plugin image.
      # -- node labeller image
      nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest
      # -- node labeller image pull policy
      nodeLabellerImagePullPolicy: IfNotPresent
      # -- node labeller tolerations
      nodeLabellerTolerations: []
      # -- pass supported labels while starting node labeller daemonset, default ["vram", "cu-count", "simd-count", "device-id", "family", "product-name", "driver-version"], also support ["compute-memory-partition", "compute-partitioning-supported", "memory-partitioning-supported"]
      nodeLabellerArguments: []
      # -- image pull secret for device plugin and node labeller, e.g. {"name": "mySecretName"}
      imageRegistrySecret: {}
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
    metricsExporter:
      # -- enable / disable device metrics exporter
      enable: true
      # -- type of service for exposing metrics endpoint, ClusterIP or NodePort
      serviceType: ClusterIP
      # -- internal port used for in-cluster and node access to pull metrics from the metrics-exporter (default 5000).
      port: 5000
      # -- external port for pulling metrics from outside the cluster for NodePort service, in the range 30000-32767 (assigned automatically by default)
      nodePort: 32500
      # -- metrics exporter image
      image: docker.io/rocm/device-metrics-exporter:v1.4.0
      # -- metrics exporter image pull policy
      imagePullPolicy: "IfNotPresent"
      # -- name of the metrics exporter config map, e.g. {"name": "metricConfigMapName"}
      config: {}
      # -- metrics exporter tolerations
      tolerations: []
      # -- metrics exporter image pull secret, e.g. {"name": "pullSecretName"}
      imageRegistrySecret: {}
      # -- metrics exporter node selector, if not specified it will reuse spec.selector
      selector: {}
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
      rbacConfig:
        # -- enable/disable kube rbac proxy
        enable: false
        # -- kube rbac proxy side car container image
        image: quay.io/brancz/kube-rbac-proxy:v0.18.1
        # -- disable https protecting the proxy endpoint
        disableHttps: false
        # -- certificate secret to mount in kube-rbac container for TLS, self signed certificates will be generated by default, e.g. {"name": "secretName"}
        secret: {}
        # -- reference to a configmap containing the client CA (key: ca.crt) for mTLS client validation, e.g. {"name": "configMapName"}
        clientCAConfigMap: {}
        staticAuthorization:
          # -- enables static authorization using client certificate CN
          enable: false
          # -- expected CN (Common Name) from client cert (e.g., Prometheus SA identity)
          clientName: ""
      prometheus:
        serviceMonitor:
          # -- enable or disable ServiceMonitor creation
          enable: false
          # -- frequency to scrape metrics. Accepts values with time unit suffix: "30s", "1m", "2h", "500ms"
          interval: 30s
          # -- define if Prometheus should attach node metadata to the target, e.g. {"node": "true"}
          attachMetadata: {}
          # -- choose the metric's labels on collisions with target labels
          honorLabels: true
          # -- control whether the scrape endpoints honor timestamps
          honorTimestamps: false
          # -- additional labels to add to the ServiceMonitor
          labels: {}
          # -- relabelConfigs to apply to samples before ingestion
          relabelings: []
          # -- relabeling rules applied to individual scraped metrics
          metricRelabelings: []
          # -- optional Prometheus authorization configuration for accessing the endpoint
          authorization: {}
          # -- TLS settings used by Prometheus to connect to the metrics endpoint
          tlsConfig: {}
    testRunner:
      # -- enable / disable test runner
      enable: false
      # -- test runner image
      image: docker.io/rocm/test-runner:v1.4.0
      # -- test runner image pull policy
      imagePullPolicy: "IfNotPresent"
      # -- test runner config map, e.g. {"name": "myConfigMap"}
      config: {}
      logsLocation:
        # -- test runner internal mounted directory to save test run logs
        mountPath: "/var/log/amd-test-runner"
        # -- host directory to save test run logs
        hostPath: "/var/log/amd-test-runner"
        # -- a list of secrets that contain connectivity info to multiple cloud providers
        logsExportSecrets: []
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
      # -- test runner tolerations
      tolerations: []
      # -- test runner image pull secret
      imageRegistrySecret: {}
      # -- test runner node selector, if not specified it will reuse spec.selector
      selector: {}
    configManager:
      # -- enable/disable the config manager
      enable: false
      # -- config manager image
      image: docker.io/rocm/device-config-manager:v1.4.0
      # -- image pull policy for config manager image
      imagePullPolicy: IfNotPresent
      # -- image pull secret for config manager image, e.g. {"name": "myPullSecret"}
      imageRegistrySecret: {}
      # -- config map for config manager, e.g. {"name": "myConfigMap"}
      config: {}
      # -- node selector for config manager, if not specified it will reuse spec.selector
      selector: {}
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
      # -- config manager tolerations
      configManagerTolerations: []
# AMD GPU operator controller related configs
controllerManager:
  manager:
    args:
      - --config=controller_manager_config.yaml
    containerSecurityContext:
      allowPrivilegeEscalation: false
    image:
      # -- AMD GPU operator controller manager image repository
      repository: docker.io/rocm/gpu-operator
      # -- AMD GPU operator controller manager image tag
      tag: v1.4.0
    # -- Image pull policy for AMD GPU operator controller manager pod
    imagePullPolicy: Always
    # -- Image pull secret name for pulling AMD GPU operator controller manager image if registry needs credential to pull image
    imagePullSecrets: ""
    # Tolerate the NoSchedule taint under both the legacy "master" and the current "control-plane" role keys
    tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
      - key: "node-role.kubernetes.io/control-plane"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
    resources:
      limits:
        cpu: 1000m
        memory: 1Gi
      requests:
        cpu: 100m
        memory: 256Mi
  # -- Node selector for AMD GPU operator controller manager deployment
  nodeSelector: {}
  # -- Deployment affinity configs for controller manager
  affinity:
    nodeAffinity:
      # Soft preference (weight 1) for nodes carrying the control-plane role label
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 1
          preference:
            matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: Exists
  replicas: 1
  serviceAccount:
    annotations: {}
  env:
    simEnable: false
# Service account annotations for the kmmDevicePlugin and kmmModuleLoader components
kmmDevicePlugin:
  serviceAccount:
    annotations: {}
kmmModuleLoader:
  serviceAccount:
    annotations: {}
# Cluster DNS domain (presumably used to build in-cluster service FQDNs; confirm against the chart templates)
kubernetesClusterDomain: cluster.local
managerConfig:
  # Embedded YAML document passed as a literal string; the manager is started with
  # --config=controller_manager_config.yaml (see controllerManager.manager.args)
  controllerManagerConfigYaml: |-
    healthProbeBindAddress: :8081
    metricsBindAddress: 127.0.0.1:8080
    leaderElection:
      enabled: true
      resourceID: gpu.amd.com
# ClusterIP service definition exposing TCP port 8443 under the port name "https"
metricsService:
  ports:
    - name: https
      port: 8443
      protocol: TCP
      targetPort: https
  type: ClusterIP
# Service account annotations for each operand component
nodeLabeller:
  serviceAccount:
    annotations: {}
metricsExporter:
  serviceAccount:
    annotations: {}
testRunner:
  serviceAccount:
    annotations: {}
configManager:
  serviceAccount:
    annotations: {}
utilsContainer:
  serviceAccount:
    annotations: {}
# Chart-wide settings; proxy.env holds proxy-related environment variables
# (which workloads consume them is not visible here - confirm against the chart templates)
global:
  proxy:
    env: {}