# NFD related configs
# schema reference: https://github.com/kubernetes-sigs/node-feature-discovery/blob/release-0.16/deployment/helm/node-feature-discovery/values.yaml
node-feature-discovery:
  # -- Set to true/false to enable/disable the installation of node feature discovery (NFD) operator
  enabled: true
  worker:
    # -- Set tolerations for NFD worker daemonset
    tolerations:
      - key: "amd-dcm"
        operator: "Equal"
        value: "up"
        effect: "NoExecute"
    # -- Set nodeSelector for NFD worker daemonset
    nodeSelector: {}

# KMM related configs
kmm:
  # -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator
  enabled: true

# -- Default NFD rule will detect amd gpu based on pci vendor ID
installdefaultNFDRule: true

# -- CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart
upgradeCRD: true

crds:
  defaultCR:
    # -- Deploy default DeviceConfig during helm chart installation
    install: true
    # -- Deploy / Patch default DeviceConfig during helm chart upgrade. Be careful about this option: 1. Your customized change on default DeviceConfig may be overwritten 2. Your existing DeviceConfig may conflict with upgraded default DeviceConfig
    upgrade: false

# Default DeviceConfig related configs
deviceConfig:
  spec:
    # -- Set node selector for the default DeviceConfig
    selector:
      feature.node.kubernetes.io/amd-gpu: "true"
    driver:
      # -- enable/disable out-of-tree driver management, set to false to use inbox driver
      enable: false
      # -- enable/disable putting a blacklist amdgpu entry in modprobe config, which requires node labeller to run
      blacklist: false
      # -- image repository to store out-of-tree driver image, DO NOT put image tag since operator automatically manages it for users
      image: "docker.io/myUserName/driverImage"
      # -- image pull secret for pull/push access of the driver image repository, input secret name like {"name": "mysecret"}
      imageRegistrySecret: {}
      imageRegistryTLS:
        # -- set to true to use plain HTTP for driver image repository
        insecure: false
        # -- set to true to skip TLS validation for driver image repository
        insecureSkipTLSVerify: false
      # -- specify an out-of-tree driver version to install
      version: "6.4"
      # -- specify the secrets to sign the out-of-tree kernel module inside driver image for secure boot, e.g. input private / public key secret {"keySecret":{"name":"privateKeySecret"},"certSecret":{"name":"publicKeySecret"}}
      imageSign: {}
      # -- configure the out-of-tree driver image build within the cluster. e.g. {"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"insecure":false,"insecureSkipTLSVerify":false}}
      imageBuild: {}
      # -- configure driver tolerations so that operator can manage out-of-tree drivers on tainted nodes
      tolerations: []
      upgradePolicy:
        # -- enable/disable automatic driver upgrade feature
        enable: true
        # -- how many nodes can be upgraded in parallel
        maxParallelUpgrades: 3
        # -- maximum number of nodes that can be in a failed upgrade state beyond which upgrades will stop to keep cluster at a minimal healthy state
        maxUnavailableNodes: 25%
        # -- whether to reboot each worker node during the driver upgrade
        rebootRequired: true
        nodeDrainPolicy:
          # -- whether force draining is allowed or not
          force: true
          # -- the length of time in seconds to wait before giving up drain, zero means infinite
          timeoutSeconds: 300
          # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, negative value means follow pod defined grace period
          gracePeriodSeconds: -1
        podDeletionPolicy:
          # -- whether force deletion is allowed or not
          force: true
          # -- the length of time in seconds to wait before giving up on pod deletion, zero means infinite
          timeoutSeconds: 300
          # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, negative value means follow pod defined grace period
          gracePeriodSeconds: -1
    commonConfig:
      # -- init container image
      initContainerImage: busybox:1.36
      utilsContainer:
        # -- gpu operator utility container image
        image: docker.io/rocm/gpu-operator-utils:v1.4.0
        # -- utility container image pull policy
        imagePullPolicy: IfNotPresent
        # -- utility container image pull secret, e.g. {"name": "mySecretName"}
        imageRegistrySecret: {}
    devicePlugin:
      # -- device plugin image
      devicePluginImage: rocm/k8s-device-plugin:latest
      # -- device plugin image pull policy
      devicePluginImagePullPolicy: IfNotPresent
      # -- device plugin tolerations
      devicePluginTolerations: []
      # -- pass supported flags and their values while starting device plugin daemonset, e.g. {"resource_naming_strategy": "single"} or {"resource_naming_strategy": "mixed"}
      devicePluginArguments: {}
      # -- enable / disable node labeller
      enableNodeLabeller: true
      # -- node labeller image
      nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest
      # -- node labeller image pull policy
      nodeLabellerImagePullPolicy: IfNotPresent
      # -- node labeller tolerations
      nodeLabellerTolerations: []
      # -- pass supported labels while starting node labeller daemonset, default ["vram", "cu-count", "simd-count", "device-id", "family", "product-name", "driver-version"], also support ["compute-memory-partition", "compute-partitioning-supported", "memory-partitioning-supported"]
      nodeLabellerArguments: []
      # -- image pull secret for device plugin and node labeller, e.g. {"name": "mySecretName"}
      imageRegistrySecret: {}
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
    metricsExporter:
      # -- enable / disable device metrics exporter
      enable: true
      # -- type of service for exposing metrics endpoint, ClusterIP or NodePort
      serviceType: ClusterIP
      # -- internal port used for in-cluster and node access to pull metrics from the metrics-exporter (default 5000).
      port: 5000
      # -- external port for pulling metrics from outside the cluster for NodePort service, in the range 30000-32767 (assigned automatically by default)
      nodePort: 32500
      # -- metrics exporter image
      image: docker.io/rocm/device-metrics-exporter:v1.4.0
      # -- metrics exporter image pull policy
      imagePullPolicy: "IfNotPresent"
      # -- name of the metrics exporter config map, e.g. {"name": "metricConfigMapName"}
      config: {}
      # -- metrics exporter tolerations
      tolerations: []
      # -- metrics exporter image pull secret, e.g. {"name": "pullSecretName"}
      imageRegistrySecret: {}
      # -- metrics exporter node selector, if not specified it will reuse spec.selector
      selector: {}
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
      rbacConfig:
        # -- enable/disable kube rbac proxy
        enable: false
        # -- kube rbac proxy side car container image
        image: quay.io/brancz/kube-rbac-proxy:v0.18.1
        # -- disable https protecting the proxy endpoint
        disableHttps: false
        # -- certificate secret to mount in kube-rbac container for TLS, self signed certificates will be generated by default, e.g. {"name": "secretName"}
        secret: {}
        # -- reference to a configmap containing the client CA (key: ca.crt) for mTLS client validation, e.g. {"name": "configMapName"}
        clientCAConfigMap: {}
        staticAuthorization:
          # -- enables static authorization using client certificate CN
          enable: false
          # -- expected CN (Common Name) from client cert (e.g., Prometheus SA identity)
          clientName: ""
      prometheus:
        serviceMonitor:
          # -- enable or disable ServiceMonitor creation
          enable: false
          # -- frequency to scrape metrics. Accepts values with time unit suffix: "30s", "1m", "2h", "500ms"
          interval: 30s
          # -- define if Prometheus should attach node metadata to the target, e.g. {"node": "true"}
          attachMetadata: {}
          # -- choose the metric's labels on collisions with target labels
          honorLabels: true
          # -- control whether the scrape endpoints honor timestamps
          honorTimestamps: false
          # -- additional labels to add to the ServiceMonitor
          labels: {}
          # -- relabelConfigs to apply to samples before ingestion
          relabelings: []
          # -- relabeling rules applied to individual scraped metrics
          metricRelabelings: []
          # -- optional Prometheus authorization configuration for accessing the endpoint
          authorization: {}
          # -- TLS settings used by Prometheus to connect to the metrics endpoint
          tlsConfig: {}
    testRunner:
      # -- enable / disable test runner
      enable: false
      # -- test runner image
      image: docker.io/rocm/test-runner:v1.4.0
      # -- test runner image pull policy
      imagePullPolicy: "IfNotPresent"
      # -- test runner config map, e.g. {"name": "myConfigMap"}
      config: {}
      logsLocation:
        # -- test runner internal mounted directory to save test run logs
        mountPath: "/var/log/amd-test-runner"
        # -- host directory to save test run logs
        hostPath: "/var/log/amd-test-runner"
        # -- a list of secrets that contain connectivity info to multiple cloud providers
        logsExportSecrets: []
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
      # -- test runner tolerations
      tolerations: []
      # -- test runner image pull secret
      imageRegistrySecret: {}
      # -- test runner node selector, if not specified it will reuse spec.selector
      selector: {}
    configManager:
      # -- enable/disable the config manager
      enable: false
      # -- config manager image
      image: docker.io/rocm/device-config-manager:v1.4.0
      # -- image pull policy for config manager image
      imagePullPolicy: IfNotPresent
      # -- image pull secret for config manager image, e.g. {"name": "myPullSecret"}
      imageRegistrySecret: {}
      # -- config map for config manager, e.g. {"name": "myConfigMap"}
      config: {}
      # -- node selector for config manager, if not specified it will reuse spec.selector
      selector: {}
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
      # -- config manager tolerations
      configManagerTolerations: []

# AMD GPU operator controller related configs
controllerManager:
  manager:
    args:
      - --config=controller_manager_config.yaml
    containerSecurityContext:
      allowPrivilegeEscalation: false
    image:
      # -- AMD GPU operator controller manager image repository
      repository: docker.io/rocm/gpu-operator
      # -- AMD GPU operator controller manager image tag
      tag: v1.4.0
    # -- Image pull policy for AMD GPU operator controller manager pod
    imagePullPolicy: Always
    # -- Image pull secret name for pulling AMD GPU operator controller manager image if registry needs credential to pull image
    imagePullSecrets: ""
    tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
      - key: "node-role.kubernetes.io/control-plane"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
    resources:
      limits:
        cpu: 1000m
        memory: 1Gi
      requests:
        cpu: 100m
        memory: 256Mi
  # -- Node selector for AMD GPU operator controller manager deployment
  nodeSelector: {}
  # -- Deployment affinity configs for controller manager
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 1
          preference:
            matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: Exists
  replicas: 1
  serviceAccount:
    annotations: {}
  env:
    simEnable: false

kmmDevicePlugin:
  serviceAccount:
    annotations: {}

kmmModuleLoader:
  serviceAccount:
    annotations: {}

kubernetesClusterDomain: cluster.local

managerConfig:
  # Literal block (|-): the text below is passed through verbatim as a string,
  # without a trailing newline; it is not parsed as part of this values file.
  controllerManagerConfigYaml: |-
    healthProbeBindAddress: :8081
    metricsBindAddress: 127.0.0.1:8080
    leaderElection:
      enabled: true
      resourceID: gpu.amd.com

metricsService:
  ports:
    - name: https
      port: 8443
      protocol: TCP
      targetPort: https
  type: ClusterIP

nodeLabeller:
  serviceAccount:
    annotations: {}

metricsExporter:
  serviceAccount:
    annotations: {}

testRunner:
  serviceAccount:
    annotations: {}

configManager:
  serviceAccount:
    annotations: {}

utilsContainer:
  serviceAccount:
    annotations: {}

global:
  proxy:
    env: {}