# gpu-operator-charts/values.yaml

# NFD (Node Feature Discovery) related configs; see the schema reference below for the available keys
# schema reference: https://github.com/kubernetes-sigs/node-feature-discovery/blob/release-0.16/deployment/helm/node-feature-discovery/values.yaml
node-feature-discovery:
  # -- Set to true/false to enable/disable the installation of the node feature discovery (NFD) operator
  enabled: true
  worker:
    # By default the worker tolerates the `amd-dcm=up:NoExecute` taint.
    # NOTE(review): presumably this taint is applied by the device config manager (DCM) — confirm.
    # -- Set tolerations for the NFD worker daemonset
    tolerations:
      - key: "amd-dcm"
        operator: "Equal"
        value: "up"
        effect: "NoExecute"
    # -- Set nodeSelector for the NFD worker daemonset (empty map means no node selector is set here)
    nodeSelector: {}
# KMM (Kernel Module Management) related configs
kmm:
  # -- Set to true/false to enable/disable the installation of the kernel module management (KMM) operator
  enabled: true
# -- Set to true to install the default NFD rule, which detects AMD GPUs based on the PCI vendor ID
installdefaultNFDRule: true
# -- Set to true to patch the CRDs in a pre-upgrade/pre-rollback hook when running helm upgrade/rollback to the current helm chart
upgradeCRD: true
# Controls whether the chart deploys the default DeviceConfig custom resource (its content is defined under deviceConfig.spec)
crds:
  defaultCR:
    # -- Deploy the default DeviceConfig during helm chart installation
    install: true
    # -- Deploy / patch the default DeviceConfig during helm chart upgrade. Use this option with care: 1. Your customized changes to the default DeviceConfig may be overwritten 2. Your existing DeviceConfig may conflict with the upgraded default DeviceConfig
    upgrade: false
# Spec of the default DeviceConfig custom resource
deviceConfig:
  spec:
    # NOTE(review): this label is presumably the one set by the default NFD rule (installdefaultNFDRule) — confirm.
    # -- Set node selector for the default DeviceConfig
    selector:
      feature.node.kubernetes.io/amd-gpu: "true"
    # Out-of-tree driver management settings of the default DeviceConfig
    driver:
      # -- enable/disable out-of-tree driver management, set to false to use the inbox driver
      enable: false
      # -- enable/disable putting an amdgpu blacklist entry in the modprobe config, which requires the node labeller to run
      blacklist: false
      # -- image repository to store the out-of-tree driver image, DO NOT put an image tag since the operator automatically manages it for users
      image: "docker.io/myUserName/driverImage"
      # -- image pull secret for pull/push access to the driver image repository, input the secret name like {"name": "mysecret"}
      imageRegistrySecret: {}
      imageRegistryTLS:
        # -- set to true to use plain HTTP for the driver image repository
        insecure: false
        # -- set to true to skip TLS validation for the driver image repository
        insecureSkipTLSVerify: false
      # The version is quoted so that YAML keeps it as a string instead of parsing it as a float.
      # -- specify an out-of-tree driver version to install
      version: "6.4"
      # -- specify the secrets used to sign the out-of-tree kernel module inside the driver image for secure boot, e.g. input private / public key secrets {"keySecret":{"name":"privateKeySecret"},"certSecret":{"name":"publicKeySecret"}}
      imageSign: {}
      # -- configure the out-of-tree driver image build within the cluster, e.g. {"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"insecure":"false","insecureSkipTLSVerify":"false"}}
      imageBuild: {}
      # -- configure driver tolerations so that the operator can manage out-of-tree drivers on tainted nodes
      tolerations: []
      upgradePolicy:
        # -- enable/disable the automatic driver upgrade feature
        enable: true
        # -- how many nodes can be upgraded in parallel
        maxParallelUpgrades: 3
        # -- maximum number of nodes that can be in a failed upgrade state, beyond which upgrades will stop to keep the cluster at a minimal healthy state
        maxUnavailableNodes: 25%
        # -- whether to reboot each worker node during the driver upgrade
        rebootRequired: true
        nodeDrainPolicy:
          # -- whether force draining is allowed or not
          force: true
          # -- the length of time in seconds to wait before giving up on the drain, zero means infinite
          timeoutSeconds: 300
          # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, a negative value means follow the pod defined grace period
          gracePeriodSeconds: -1
        podDeletionPolicy:
          # -- whether force deletion is allowed or not
          force: true
          # -- the length of time in seconds to wait before giving up on pod deletion, zero means infinite
          timeoutSeconds: 300
          # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, a negative value means follow the pod defined grace period
          gracePeriodSeconds: -1
    # Settings for the init container image and the utility container
    commonConfig:
      # -- init container image
      initContainerImage: busybox:1.36
      utilsContainer:
        # -- gpu operator utility container image
        image: docker.io/rocm/gpu-operator-utils:v1.4.0
        # -- utility container image pull policy
        imagePullPolicy: IfNotPresent
        # -- utility container image pull secret, e.g. {"name": "mySecretName"}
        imageRegistrySecret: {}
    # Device plugin and node labeller settings of the default DeviceConfig
    devicePlugin:
      # NOTE(review): both default images below use floating tags (`latest` / `labeller-latest`)
      # together with the IfNotPresent pull policy, so a node that already has the tag cached
      # will not pull a newer image; consider pinning a version tag — confirm the intended default.
      # -- device plugin image
      devicePluginImage: rocm/k8s-device-plugin:latest
      # -- device plugin image pull policy
      devicePluginImagePullPolicy: IfNotPresent
      # -- device plugin tolerations
      devicePluginTolerations: []
      # -- pass supported flags and their values when starting the device plugin daemonset, e.g. {"resource_naming_strategy": "single"} or {"resource_naming_strategy": "mixed"}
      devicePluginArguments: {}
      # -- enable / disable the node labeller
      enableNodeLabeller: true
      # -- node labeller image
      nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest
      # -- node labeller image pull policy
      nodeLabellerImagePullPolicy: IfNotPresent
      # -- node labeller tolerations
      nodeLabellerTolerations: []
      # -- pass supported labels when starting the node labeller daemonset, default ["vram", "cu-count", "simd-count", "device-id", "family", "product-name", "driver-version"], also supports ["compute-memory-partition", "compute-partitioning-supported", "memory-partitioning-supported"]
      nodeLabellerArguments: []
      # -- image pull secret for the device plugin and the node labeller, e.g. {"name": "mySecretName"}
      imageRegistrySecret: {}
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
    # Device metrics exporter settings of the default DeviceConfig
    metricsExporter:
      # -- turn the device metrics exporter on or off
      enable: true
      # -- service type used to expose the metrics endpoint: ClusterIP or NodePort
      serviceType: ClusterIP
      # -- internal port for in-cluster and node-level access to pull metrics from the metrics-exporter (default 5000)
      port: 5000
      # -- external port for pulling metrics from outside the cluster when the service type is NodePort, must be in the range 30000-32767 (assigned automatically by default; note that this chart presets 32500)
      nodePort: 32500
      # -- image of the metrics exporter
      image: docker.io/rocm/device-metrics-exporter:v1.4.0
      # -- pull policy of the metrics exporter image
      imagePullPolicy: IfNotPresent
      # -- reference to the metrics exporter config map by name, e.g. {"name": "metricConfigMapName"}
      config: {}
      # -- tolerations applied to the metrics exporter
      tolerations: []
      # -- pull secret of the metrics exporter image, e.g. {"name": "pullSecretName"}
      imageRegistrySecret: {}
      # -- node selector of the metrics exporter, spec.selector is reused when this is not specified
      selector: {}
      upgradePolicy:
        # -- daemonset upgrade type: RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- upper bound on the number of Pods that may be unavailable during the update process
        maxUnavailable: 1
      rbacConfig:
        # -- turn the kube rbac proxy on or off
        enable: false
        # -- image of the kube rbac proxy sidecar container
        image: quay.io/brancz/kube-rbac-proxy:v0.18.1
        # -- set to true to disable the https protection of the proxy endpoint
        disableHttps: false
        # -- certificate secret mounted into the kube-rbac container for TLS, self signed certificates are generated by default, e.g. {"name": "secretName"}
        secret: {}
        # -- reference to a configmap holding the client CA (key: ca.crt) used for mTLS client validation, e.g. {"name": "configMapName"}
        clientCAConfigMap: {}
        staticAuthorization:
          # -- turn on static authorization based on the client certificate CN
          enable: false
          # -- CN (Common Name) expected in the client cert (e.g., Prometheus SA identity)
          clientName: ""
      prometheus:
        serviceMonitor:
          # -- create the ServiceMonitor (true) or skip it (false)
          enable: false
          # -- how often metrics are scraped. Accepts values with a time unit suffix: "30s", "1m", "2h", "500ms"
          interval: 30s
          # -- whether Prometheus should attach node metadata to the target, e.g. {"node": "true"}
          attachMetadata: {}
          # -- choose the metric's labels when they collide with target labels
          honorLabels: true
          # -- whether the scrape endpoints honor timestamps
          honorTimestamps: false
          # -- extra labels to add to the ServiceMonitor
          labels: {}
          # -- relabelConfigs applied to samples before ingestion
          relabelings: []
          # -- relabeling rules applied to individual scraped metrics
          metricRelabelings: []
          # -- optional Prometheus authorization configuration used to access the endpoint
          authorization: {}
          # -- TLS settings Prometheus uses to connect to the metrics endpoint
          tlsConfig: {}
    # Test runner settings of the default DeviceConfig
    testRunner:
      # -- turn the test runner on or off
      enable: false
      # -- image of the test runner
      image: docker.io/rocm/test-runner:v1.4.0
      # -- pull policy of the test runner image
      imagePullPolicy: IfNotPresent
      # -- config map of the test runner, e.g. {"name": "myConfigMap"}
      config: {}
      logsLocation:
        # -- directory mounted inside the test runner where test run logs are saved
        mountPath: "/var/log/amd-test-runner"
        # -- directory on the host where test run logs are saved
        hostPath: "/var/log/amd-test-runner"
        # -- list of secrets holding the connectivity info for multiple cloud providers
        logsExportSecrets: []
      upgradePolicy:
        # -- daemonset upgrade type: RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- upper bound on the number of Pods that may be unavailable during the update process
        maxUnavailable: 1
      # -- tolerations applied to the test runner
      tolerations: []
      # -- pull secret of the test runner image
      imageRegistrySecret: {}
      # -- node selector of the test runner, spec.selector is reused when this is not specified
      selector: {}
    # Device config manager settings of the default DeviceConfig
    configManager:
      # -- enable/disable the config manager
      enable: false
      # -- config manager image
      image: docker.io/rocm/device-config-manager:v1.4.0
      # -- image pull policy for the config manager image
      imagePullPolicy: IfNotPresent
      # -- image pull secret for the config manager image, e.g. {"name": "myPullSecret"}
      imageRegistrySecret: {}
      # -- config map for the config manager, e.g. {"name": "myConfigMap"}
      config: {}
      # -- node selector for the config manager, if not specified it will reuse spec.selector
      selector: {}
      upgradePolicy:
        # -- the type of daemonset upgrade, RollingUpdate or OnDelete
        upgradeStrategy: RollingUpdate
        # -- the maximum number of Pods that can be unavailable during the update process
        maxUnavailable: 1
      # -- config manager tolerations
      configManagerTolerations: []
# AMD GPU operator controller related configs
controllerManager:
  manager:
    # command line arguments for the controller manager
    # NOTE(review): the referenced file is presumably rendered from managerConfig.controllerManagerConfigYaml — confirm.
    args:
      - --config=controller_manager_config.yaml
    # container security context of the manager; privilege escalation is disallowed by default
    containerSecurityContext:
      allowPrivilegeEscalation: false
    image:
      # -- AMD GPU operator controller manager image repository
      repository: docker.io/rocm/gpu-operator
      # -- AMD GPU operator controller manager image tag
      tag: v1.4.0
    # -- Image pull policy for AMD GPU operator controller manager pod
    imagePullPolicy: Always
    # -- Image pull secret name for pulling AMD GPU operator controller manager image if registry needs credential to pull image
    imagePullSecrets: ""
    # tolerations of the manager; by default it tolerates the NoSchedule taints
    # node-role.kubernetes.io/master and node-role.kubernetes.io/control-plane
    tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
      - key: "node-role.kubernetes.io/control-plane"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
    # CPU / memory limits and requests of the manager
    resources:
      limits:
        cpu: 1000m
        memory: 1Gi
      requests:
        cpu: 100m
        memory: 256Mi
  # -- Node selector for AMD GPU operator controller manager deployment
  nodeSelector: {}
  # By default, nodes carrying the node-role.kubernetes.io/control-plane label are preferred (weight 1) but not required.
  # -- Deployment affinity configs for controller manager
  affinity:
    nodeAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 1
          preference:
            matchExpressions:
              - key: node-role.kubernetes.io/control-plane
                operator: Exists
  # replica count of the controller manager deployment
  replicas: 1
  # annotations for the controller manager service account
  serviceAccount:
    annotations: {}
  env:
    # NOTE(review): presumably toggles a simulation mode of the operator — confirm against the chart templates.
    simEnable: false
# service account annotations for the KMM device plugin
# NOTE(review): assumes these annotations are rendered onto the matching service account by the chart templates — confirm.
kmmDevicePlugin:
  serviceAccount:
    annotations: {}
# service account annotations for the KMM module loader
kmmModuleLoader:
  serviceAccount:
    annotations: {}
# DNS domain of the Kubernetes cluster (cluster.local is the usual Kubernetes default)
kubernetesClusterDomain: cluster.local
# Controller manager configuration, embedded as a literal block scalar (the text below `|-` is kept verbatim, without a trailing newline)
# NOTE(review): presumably rendered into the controller_manager_config.yaml file passed via controllerManager.manager.args — confirm.
managerConfig:
  controllerManagerConfigYaml: |-
    healthProbeBindAddress: :8081
    metricsBindAddress: 127.0.0.1:8080
    leaderElection:
      enabled: true
      resourceID: gpu.amd.com
# Service ports and type for the metrics service: a ClusterIP service exposing TCP port 8443 that targets the port named "https"
# NOTE(review): presumably this is the controller manager's metrics service — confirm against the chart templates.
metricsService:
  ports:
    - name: https
      port: 8443
      protocol: TCP
      targetPort: https
  type: ClusterIP
# Service account annotations, one stanza per component
# NOTE(review): assumes each stanza's annotations are rendered onto that component's service account — confirm against the chart templates.
# node labeller
nodeLabeller:
  serviceAccount:
    annotations: {}
# metrics exporter
metricsExporter:
  serviceAccount:
    annotations: {}
# test runner
testRunner:
  serviceAccount:
    annotations: {}
# config manager
configManager:
  serviceAccount:
    annotations: {}
# utility container
utilsContainer:
  serviceAccount:
    annotations: {}
# Global values
global:
  proxy:
    # NOTE(review): presumably a map of proxy environment variables (e.g. HTTP_PROXY / HTTPS_PROXY / NO_PROXY) — confirm against the chart templates.
    env: {}