commit 2da0e4f0305d3b61d6b7e11fc9932592ea8426e5 Author: Conan Scott Date: Tue Dec 16 17:56:13 2025 +1100 first commit diff --git a/Chart.lock b/Chart.lock new file mode 100644 index 0000000..f079b9d --- /dev/null +++ b/Chart.lock @@ -0,0 +1,9 @@ +dependencies: +- name: node-feature-discovery + repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts + version: 0.16.1 +- name: kmm + repository: file://./charts/kmm + version: v1.0.0 +digest: sha256:f9a315dd2ce3d515ebf28c8e9a6a82158b493ca2686439ec381487761261b597 +generated: "2025-10-04T03:46:48.892453365Z" diff --git a/Chart.yaml b/Chart.yaml new file mode 100644 index 0000000..a34dff9 --- /dev/null +++ b/Chart.yaml @@ -0,0 +1,32 @@ +apiVersion: v2 +appVersion: v1.4.0 +dependencies: +- condition: node-feature-discovery.enabled + name: node-feature-discovery + repository: https://kubernetes-sigs.github.io/node-feature-discovery/charts + version: v0.16.1 +- condition: kmm.enabled + name: kmm + repository: file://./charts/kmm + version: v1.0.0 +description: AMD GPU Operator simplifies the deployment and management of AMD Instinct + GPU accelerators within Kubernetes clusters. +home: https://github.com/ROCm/gpu-operator +icon: https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/helm/logo.png +keywords: +- kubernetes +- cluster +- hardware +- amd +- gpu +- ai +- deep learning +- monitoring +kubeVersion: '>= 1.29.0-0' +maintainers: +- name: Yan Sun +name: gpu-operator-charts +sources: +- https://github.com/ROCm/gpu-operator +type: application +version: v1.4.0 diff --git a/README.md b/README.md new file mode 100644 index 0000000..11d975a --- /dev/null +++ b/README.md @@ -0,0 +1,308 @@ +# AMD GPU Operator + +:book: GPU Operator Documentation Site: https://instinct.docs.amd.com/projects/gpu-operator + +## Introduction + +AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU accelerators within Kubernetes clusters. 
This project enables seamless configuration and operation of GPU-accelerated workloads, including machine learning, Generative AI, and other GPU-intensive applications. + +## Components + +* AMD GPU Operator Controller +* K8s Device Plugin +* K8s Node Labeller +* Device Metrics Exporter +* Device Test Runner +* Node Feature Discovery Operator +* Kernel Module Management Operator + +## Features + +* Streamlined GPU driver installation and management +* Comprehensive metrics collection and export +* Easy deployment of AMD GPU device plugin for Kubernetes +* Automated labeling of nodes with AMD GPU capabilities +* Compatibility with standard Kubernetes environments +* Efficient GPU resource allocation for containerized workloads +* GPU health monitoring and troubleshooting + +## Compatibility + +* **ROCm DKMS Compatibility**: Please refer to the [ROCm official website](https://rocm.docs.amd.com/en/latest/compatibility/compatibility-matrix.html) for the compatibility matrix for the ROCm driver. +* **Kubernetes**: 1.29.0+ + +## Prerequisites + +* Kubernetes v1.29.0+ +* Helm v3.2.0+ +* `kubectl` CLI tool configured to access your cluster +* [Cert Manager](https://cert-manager.io/docs/): if it is not already installed in the cluster, install it by running these commands: + +```bash +helm repo add jetstack https://charts.jetstack.io --force-update + +helm install cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --version v1.15.1 \ + --set crds.enabled=true +``` + +## Quick Start + +### 1. Add the AMD Helm Repository + +```bash +helm repo add rocm https://rocm.github.io/gpu-operator +helm repo update +``` + +### 2. 
Install the Operator + +Basic installation: + +```bash +helm install amd-gpu-operator rocm/gpu-operator-charts \ + --namespace kube-amd-gpu \ + --create-namespace \ + --version=v1.4.0 +``` + +```{note} +Installation Options + - Skip NFD installation: `--set node-feature-discovery.enabled=false` + - Skip KMM installation: `--set kmm.enabled=false` +``` + +```{warning} + It is strongly recommended to use AMD-optimized KMM images included in the operator release. +``` + +### 3. Install Custom Resource +After the installation of AMD GPU Operator: + * By default, a default `DeviceConfig` is installed. If you are using the default `DeviceConfig`, you can modify it to adjust the configuration for your own use case: `kubectl edit deviceconfigs -n kube-amd-gpu default` + * If you installed without the default `DeviceConfig` (either by using `--set crds.defaultCR.install=false` or by installing a chart prior to v1.3.0), you need to create the `DeviceConfig` custom resource in order to trigger the operator to start working. After preparing the `DeviceConfig` in a YAML file, you can create the resource by running ```kubectl apply -f deviceconfigs.yaml```. + * For custom resource definition and more detailed information, please refer to [Custom Resource Installation Guide](https://dcgpu.docs.amd.com/projects/gpu-operator/en/latest/installation/kubernetes-helm.html#install-custom-resource). + + * Potential Failures with default `DeviceConfig`: + + a. Operand pods are stuck in ```Init:0/1``` state: It means your GPU worker doesn't have an inbox GPU driver loaded. We suggest checking the [Driver Installation Guide](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/drivers/installation.html#driver-installation-guide) and then modifying the default `DeviceConfig` to ask the Operator to install the out-of-tree GPU driver for your worker nodes. + `kubectl edit deviceconfigs -n kube-amd-gpu default` + + b. 
No operand pods showed up: It is possible that the default `DeviceConfig` selector `feature.node.kubernetes.io/amd-gpu: "true"` cannot find any matching node. + * Check node label `kubectl get node -oyaml | grep -e "amd-gpu:" -e "amd-vgpu:"` + * If you are using a GPU in a VM, you may need to change the default `DeviceConfig` selector to `feature.node.kubernetes.io/amd-vgpu: "true"` + * You can always customize the node selector of the `DeviceConfig`. + +### Grafana Dashboards + +The following dashboards are provided for visualizing GPU metrics collected from device-metrics-exporter: + +* Overview Dashboard: Provides a comprehensive view of the GPU cluster. +* GPU Detail Dashboard: Offers a detailed look at individual GPUs. +* Job Detail Dashboard: Presents detailed GPU usage for specific jobs in SLURM and Kubernetes environments. +* Node Detail Dashboard: Displays detailed GPU usage at the host level. + +## Support + +For bugs and feature requests, please file an issue on our [GitHub Issues](https://github.com/ROCm/gpu-operator/issues) page. + +## License + +The AMD GPU Operator is licensed under the [Apache License 2.0](LICENSE). +# gpu-operator-charts + +![Version: v1.4.0](https://img.shields.io/badge/Version-v1.4.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1.4.0](https://img.shields.io/badge/AppVersion-v1.4.0-informational?style=flat-square) + +AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU accelerators within Kubernetes clusters. 
+ +**Homepage:** https://github.com/ROCm/gpu-operator + +## Maintainers + +| Name | Email | Url | +| ---- | ------ | --- | +| Yan Sun | | | + +## Source Code + +* https://github.com/ROCm/gpu-operator + +## Requirements + +Kubernetes: `>= 1.29.0-0` + +| Repository | Name | Version | +|------------|------|---------| +| file://./charts/kmm | kmm | v1.0.0 | +| https://kubernetes-sigs.github.io/node-feature-discovery/charts | node-feature-discovery | v0.16.1 | + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| controllerManager.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | Deployment affinity configs for controller manager | +| controllerManager.manager.image.repository | string | `"docker.io/rocm/gpu-operator"` | AMD GPU operator controller manager image repository | +| controllerManager.manager.image.tag | string | `"v1.4.0"` | AMD GPU operator controller manager image tag | +| controllerManager.manager.imagePullPolicy | string | `"Always"` | Image pull policy for AMD GPU operator controller manager pod | +| controllerManager.manager.imagePullSecrets | string | `""` | Image pull secret name for pulling AMD GPU operator controller manager image if registry needs credential to pull image | +| controllerManager.nodeSelector | object | `{}` | Node selector for AMD GPU operator controller manager deployment | +| crds.defaultCR.install | bool | `true` | Deploy default DeviceConfig during helm chart installation | +| crds.defaultCR.upgrade | bool | `false` | Deploy / Patch default DeviceConfig during helm chart upgrade. Be careful about this option: 1. Your customized change on default DeviceConfig may be overwritten 2. 
Your existing DeviceConfig may conflict with upgraded default DeviceConfig | +| deviceConfig.spec.commonConfig.initContainerImage | string | `"busybox:1.36"` | init container image | +| deviceConfig.spec.commonConfig.utilsContainer.image | string | `"docker.io/rocm/gpu-operator-utils:v1.4.0"` | gpu operator utility container image | +| deviceConfig.spec.commonConfig.utilsContainer.imagePullPolicy | string | `"IfNotPresent"` | utility container image pull policy | +| deviceConfig.spec.commonConfig.utilsContainer.imageRegistrySecret | object | `{}` | utility container image pull secret, e.g. {"name": "mySecretName"} | +| deviceConfig.spec.configManager.config | object | `{}` | config map for config manager, e.g. {"name": "myConfigMap"} | +| deviceConfig.spec.configManager.configManagerTolerations | list | `[]` | config manager tolerations | +| deviceConfig.spec.configManager.enable | bool | `false` | enable/disable the config manager | +| deviceConfig.spec.configManager.image | string | `"docker.io/rocm/device-config-manager:v1.4.0"` | config manager image | +| deviceConfig.spec.configManager.imagePullPolicy | string | `"IfNotPresent"` | image pull policy for config manager image | +| deviceConfig.spec.configManager.imageRegistrySecret | object | `{}` | image pull secret for config manager image, e.g. {"name": "myPullSecret"} | +| deviceConfig.spec.configManager.selector | object | `{}` | node selector for config manager, if not specified it will reuse spec.selector | +| deviceConfig.spec.configManager.upgradePolicy.maxUnavailable | int | `1` | the maximum number of Pods that can be unavailable during the update process | +| deviceConfig.spec.configManager.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete | +| deviceConfig.spec.devicePlugin.devicePluginArguments | object | `{}` | pass supported flags and their values while starting device plugin daemonset, e.g. 
{"resource_naming_strategy": "single"} or {"resource_naming_strategy": "mixed"} | +| deviceConfig.spec.devicePlugin.devicePluginImage | string | `"rocm/k8s-device-plugin:latest"` | device plugin image | +| deviceConfig.spec.devicePlugin.devicePluginImagePullPolicy | string | `"IfNotPresent"` | device plugin image pull policy | +| deviceConfig.spec.devicePlugin.devicePluginTolerations | list | `[]` | device plugin tolerations | +| deviceConfig.spec.devicePlugin.enableNodeLabeller | bool | `true` | enable / disable node labeller | +| deviceConfig.spec.devicePlugin.imageRegistrySecret | object | `{}` | image pull secret for device plugin and node labeller, e.g. {"name": "mySecretName"} | +| deviceConfig.spec.devicePlugin.nodeLabellerArguments | list | `[]` | pass supported labels while starting node labeller daemonset, default ["vram", "cu-count", "simd-count", "device-id", "family", "product-name", "driver-version"], also support ["compute-memory-partition", "compute-partitioning-supported", "memory-partitioning-supported"] | +| deviceConfig.spec.devicePlugin.nodeLabellerImage | string | `"rocm/k8s-device-plugin:labeller-latest"` | node labeller image | +| deviceConfig.spec.devicePlugin.nodeLabellerImagePullPolicy | string | `"IfNotPresent"` | node labeller image pull policy | +| deviceConfig.spec.devicePlugin.nodeLabellerTolerations | list | `[]` | node labeller tolerations | +| deviceConfig.spec.devicePlugin.upgradePolicy.maxUnavailable | int | `1` | the maximum number of Pods that can be unavailable during the update process | +| deviceConfig.spec.devicePlugin.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete | +| deviceConfig.spec.driver.blacklist | bool | `false` | enable/disable putting a blacklist amdgpu entry in modprobe config, which requires node labeller to run | +| deviceConfig.spec.driver.enable | bool | `false` | enable/disable out-of-tree driver management, set to false to use inbox 
driver | +| deviceConfig.spec.driver.image | string | `"docker.io/myUserName/driverImage"` | image repository to store out-of-tree driver image, DO NOT put image tag since operator automatically manage it for users | +| deviceConfig.spec.driver.imageBuild | object | `{}` | configure the out-of-tree driver image build within the cluster. e.g. {"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"insecure":"false","insecureSkipTLSVerify":"false"}}} | +| deviceConfig.spec.driver.imageRegistrySecret | object | `{}` | image pull secret for pull/push access of the driver image repository, input secret name like {"name": "mysecret"} | +| deviceConfig.spec.driver.imageRegistryTLS.insecure | bool | `false` | set to true to use plain HTTP for driver image repository | +| deviceConfig.spec.driver.imageRegistryTLS.insecureSkipTLSVerify | bool | `false` | set to true to skip TLS validation for driver image repository | +| deviceConfig.spec.driver.imageSign | object | `{}` | specify the secrets to sign the out-of-tree kernel module inside driver image for secure boot, e.g. 
input private / public key secret {"keySecret":{"name":"privateKeySecret"},"certSecret":{"name":"publicKeySecret"}} | +| deviceConfig.spec.driver.tolerations | list | `[]` | configure driver tolerations so that operator can manage out-of-tree drivers on tainted nodes | +| deviceConfig.spec.driver.upgradePolicy.enable | bool | `true` | enable/disable automatic driver upgrade feature | +| deviceConfig.spec.driver.upgradePolicy.maxParallelUpgrades | int | `3` | how many nodes can be upgraded in parallel | +| deviceConfig.spec.driver.upgradePolicy.maxUnavailableNodes | string | `"25%"` | maximum number of nodes that can be in a failed upgrade state beyond which upgrades will stop to keep cluster at a minimal healthy state | +| deviceConfig.spec.driver.upgradePolicy.nodeDrainPolicy.force | bool | `true` | whether force draining is allowed or not | +| deviceConfig.spec.driver.upgradePolicy.nodeDrainPolicy.gracePeriodSeconds | int | `-1` | the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, minus value means follow pod defined grace period | +| deviceConfig.spec.driver.upgradePolicy.nodeDrainPolicy.timeoutSeconds | int | `300` | the length of time in seconds to wait before giving up drain, zero means infinite | +| deviceConfig.spec.driver.upgradePolicy.podDeletionPolicy.force | bool | `true` | whether force deletion is allowed or not | +| deviceConfig.spec.driver.upgradePolicy.podDeletionPolicy.gracePeriodSeconds | int | `-1` | the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, minus value means follow pod defined grace period | +| deviceConfig.spec.driver.upgradePolicy.podDeletionPolicy.timeoutSeconds | int | `300` | the length of time in seconds to wait before giving up on pod deletion, zero means infinite | +| deviceConfig.spec.driver.upgradePolicy.rebootRequired | bool | `true` | whether reboot each worker node or not during the 
driver upgrade | +| deviceConfig.spec.driver.version | string | `"6.4"` | specify an out-of-tree driver version to install | +| deviceConfig.spec.metricsExporter.config | object | `{}` | name of the metrics exporter config map, e.g. {"name": "metricConfigMapName"} | +| deviceConfig.spec.metricsExporter.enable | bool | `true` | enable / disable device metrics exporter | +| deviceConfig.spec.metricsExporter.image | string | `"docker.io/rocm/device-metrics-exporter:v1.4.0"` | metrics exporter image | +| deviceConfig.spec.metricsExporter.imagePullPolicy | string | `"IfNotPresent"` | metrics exporter image pull policy | +| deviceConfig.spec.metricsExporter.imageRegistrySecret | object | `{}` | metrics exporter image pull secret, e.g. {"name": "pullSecretName"} | +| deviceConfig.spec.metricsExporter.nodePort | int | `32500` | external port for pulling metrics from outside the cluster for NodePort service, in the range 30000-32767 (assigned automatically by default) | +| deviceConfig.spec.metricsExporter.port | int | `5000` | internal port used for in-cluster and node access to pull metrics from the metrics-exporter (default 5000). | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.attachMetadata | object | `{}` | define if Prometheus should attach node metadata to the target, e.g. 
{"node": "true"} | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.authorization | object | `{}` | optional Prometheus authorization configuration for accessing the endpoint | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.enable | bool | `false` | enable or disable ServiceMonitor creation | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.honorLabels | bool | `true` | choose the metric's labels on collisions with target labels | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.honorTimestamps | bool | `false` | control whether the scrape endpoints honor timestamps | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.interval | string | `"30s"` | frequency to scrape metrics. Accepts values with time unit suffix: "30s", "1m", "2h", "500ms" | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.labels | object | `{}` | additional labels to add to the ServiceMonitor | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.metricRelabelings | list | `[]` | relabeling rules applied to individual scraped metrics | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.relabelings | list | `[]` | relabelConfigs to apply to samples before ingestion | +| deviceConfig.spec.metricsExporter.prometheus.serviceMonitor.tlsConfig | object | `{}` | TLS settings used by Prometheus to connect to the metrics endpoint | +| deviceConfig.spec.metricsExporter.rbacConfig.clientCAConfigMap | object | `{}` | reference to a configmap containing the client CA (key: ca.crt) for mTLS client validation, e.g. 
{"name": "configMapName"} | +| deviceConfig.spec.metricsExporter.rbacConfig.disableHttps | bool | `false` | disable https protecting the proxy endpoint | +| deviceConfig.spec.metricsExporter.rbacConfig.enable | bool | `false` | enable/disable kube rbac proxy | +| deviceConfig.spec.metricsExporter.rbacConfig.image | string | `"quay.io/brancz/kube-rbac-proxy:v0.18.1"` | kube rbac proxy side car container image | +| deviceConfig.spec.metricsExporter.rbacConfig.secret | object | `{}` | certificate secret to mount in kube-rbac container for TLS, self signed certificates will be generated by default, e.g. {"name": "secretName"} | +| deviceConfig.spec.metricsExporter.rbacConfig.staticAuthorization.clientName | string | `""` | expected CN (Common Name) from client cert (e.g., Prometheus SA identity) | +| deviceConfig.spec.metricsExporter.rbacConfig.staticAuthorization.enable | bool | `false` | enables static authorization using client certificate CN | +| deviceConfig.spec.metricsExporter.selector | object | `{}` | metrics exporter node selector, if not specified it will reuse spec.selector | +| deviceConfig.spec.metricsExporter.serviceType | string | `"ClusterIP"` | type of service for exposing metrics endpoint, ClusterIP or NodePort | +| deviceConfig.spec.metricsExporter.tolerations | list | `[]` | metrics exporter tolerations | +| deviceConfig.spec.metricsExporter.upgradePolicy.maxUnavailable | int | `1` | the maximum number of Pods that can be unavailable during the update process | +| deviceConfig.spec.metricsExporter.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete | +| deviceConfig.spec.selector | object | `{"feature.node.kubernetes.io/amd-gpu":"true"}` | Set node selector for the default DeviceConfig | +| deviceConfig.spec.testRunner.config | object | `{}` | test runner config map, e.g. 
{"name": "myConfigMap"} | +| deviceConfig.spec.testRunner.enable | bool | `false` | enable / disable test runner | +| deviceConfig.spec.testRunner.image | string | `"docker.io/rocm/test-runner:v1.4.0"` | test runner image | +| deviceConfig.spec.testRunner.imagePullPolicy | string | `"IfNotPresent"` | test runner image pull policy | +| deviceConfig.spec.testRunner.imageRegistrySecret | object | `{}` | test runner image pull secret | +| deviceConfig.spec.testRunner.logsLocation.hostPath | string | `"/var/log/amd-test-runner"` | host directory to save test run logs | +| deviceConfig.spec.testRunner.logsLocation.logsExportSecrets | list | `[]` | a list of secrets that contain connectivity info to multiple cloud providers | +| deviceConfig.spec.testRunner.logsLocation.mountPath | string | `"/var/log/amd-test-runner"` | test runner internal mounted directory to save test run logs | +| deviceConfig.spec.testRunner.selector | object | `{}` | test runner node selector, if not specified it will reuse spec.selector | +| deviceConfig.spec.testRunner.tolerations | list | `[]` | test runner tolerations | +| deviceConfig.spec.testRunner.upgradePolicy.maxUnavailable | int | `1` | the maximum number of Pods that can be unavailable during the update process | +| deviceConfig.spec.testRunner.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete | +| installdefaultNFDRule | bool | `true` | Default NFD rule will detect amd gpu based on pci vendor ID | +| kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator | +| node-feature-discovery.enabled | bool | `true` | Set to true/false to enable/disable the installation of node feature discovery (NFD) operator | +| node-feature-discovery.worker.nodeSelector | object | `{}` | Set nodeSelector for NFD worker daemonset | +| node-feature-discovery.worker.tolerations | list | 
`[{"effect":"NoExecute","key":"amd-dcm","operator":"Equal","value":"up"}]` | Set tolerations for NFD worker daemonset | +| upgradeCRD | bool | `true` | CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart | +| kmm.controller.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | Affinity for the KMM controller manager deployment | +| kmm.controller.manager.args[0] | string | `"--config=controller_config.yaml"` | | +| kmm.controller.manager.containerSecurityContext.allowPrivilegeEscalation | bool | `false` | | +| kmm.controller.manager.env.relatedImageBuild | string | `"gcr.io/kaniko-project/executor:v1.23.2"` | KMM kaniko builder image for building driver image within cluster | +| kmm.controller.manager.env.relatedImageBuildPullSecret | string | `""` | Image pull secret name for pulling KMM kaniko builder image if registry needs credential to pull image | +| kmm.controller.manager.env.relatedImageSign | string | `"docker.io/rocm/kernel-module-management-signimage:v1.4.0"` | KMM signer image for signing driver image's kernel module with given key pairs within cluster | +| kmm.controller.manager.env.relatedImageSignPullSecret | string | `""` | Image pull secret name for pulling KMM signer image if registry needs credential to pull image | +| kmm.controller.manager.env.relatedImageWorker | string | `"docker.io/rocm/kernel-module-management-worker:v1.4.0"` | KMM worker image for loading / unloading driver kernel module on worker nodes | +| kmm.controller.manager.env.relatedImageWorkerPullSecret | string | `""` | Image pull secret name for pulling KMM worker image if registry needs credential to pull image | +| kmm.controller.manager.image.repository | string | `"docker.io/rocm/kernel-module-management-operator"` | KMM controller manager image repository | +| 
kmm.controller.manager.image.tag | string | `"v1.4.0"` | KMM controller manager image tag | +| kmm.controller.manager.imagePullPolicy | string | `"Always"` | Image pull policy for KMM controller manager pod | +| kmm.controller.manager.imagePullSecrets | string | `""` | Image pull secret name for pulling KMM controller manager image if registry needs credential to pull image | +| kmm.controller.manager.resources.limits.cpu | string | `"500m"` | | +| kmm.controller.manager.resources.limits.memory | string | `"384Mi"` | | +| kmm.controller.manager.resources.requests.cpu | string | `"10m"` | | +| kmm.controller.manager.resources.requests.memory | string | `"64Mi"` | | +| kmm.controller.manager.tolerations[0].effect | string | `"NoSchedule"` | | +| kmm.controller.manager.tolerations[0].key | string | `"node-role.kubernetes.io/master"` | | +| kmm.controller.manager.tolerations[0].operator | string | `"Equal"` | | +| kmm.controller.manager.tolerations[0].value | string | `""` | | +| kmm.controller.manager.tolerations[1].effect | string | `"NoSchedule"` | | +| kmm.controller.manager.tolerations[1].key | string | `"node-role.kubernetes.io/control-plane"` | | +| kmm.controller.manager.tolerations[1].operator | string | `"Equal"` | | +| kmm.controller.manager.tolerations[1].value | string | `""` | | +| kmm.controller.nodeSelector | object | `{}` | Node selector for the KMM controller manager deployment | +| kmm.controller.replicas | int | `1` | | +| kmm.controller.serviceAccount.annotations | object | `{}` | | +| kmm.controllerMetricsService.ports[0].name | string | `"https"` | | +| kmm.controllerMetricsService.ports[0].port | int | `8443` | | +| kmm.controllerMetricsService.ports[0].protocol | string | `"TCP"` | | +| kmm.controllerMetricsService.ports[0].targetPort | string | `"https"` | | +| kmm.controllerMetricsService.type | string | `"ClusterIP"` | | +| kmm.kubernetesClusterDomain | string | `"cluster.local"` | | +| kmm.managerConfig.controllerConfigYaml | string | 
`"healthProbeBindAddress: :8081\nwebhookPort: 9443\nleaderElection:\n enabled: true\n resourceID: kmm.sigs.x-k8s.io\nmetrics:\n enableAuthnAuthz: true\n bindAddress: 0.0.0.0:8443\n secureServing: true\nworker:\n runAsUser: 0\n seLinuxType: spc_t\n firmwareHostPath: /var/lib/firmware"` | | +| kmm.webhookServer.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | KMM webhook's deployment affinity configs | +| kmm.webhookServer.nodeSelector | object | `{}` | KMM webhook's deployment node selector | +| kmm.webhookServer.replicas | int | `1` | | +| kmm.webhookServer.webhookServer.args[0] | string | `"--config=controller_config.yaml"` | | +| kmm.webhookServer.webhookServer.args[1] | string | `"--enable-module"` | | +| kmm.webhookServer.webhookServer.args[2] | string | `"--enable-namespace"` | | +| kmm.webhookServer.webhookServer.args[3] | string | `"--enable-preflightvalidation"` | | +| kmm.webhookServer.webhookServer.containerSecurityContext.allowPrivilegeEscalation | bool | `false` | | +| kmm.webhookServer.webhookServer.image.repository | string | `"docker.io/rocm/kernel-module-management-webhook-server"` | KMM webhook image repository | +| kmm.webhookServer.webhookServer.image.tag | string | `"v1.4.0"` | KMM webhook image tag | +| kmm.webhookServer.webhookServer.imagePullPolicy | string | `"Always"` | Image pull policy for KMM webhook pod | +| kmm.webhookServer.webhookServer.imagePullSecrets | string | `""` | Image pull secret name for pulling KMM webhook image if registry needs credential to pull image | +| kmm.webhookServer.webhookServer.resources.limits.cpu | string | `"500m"` | | +| kmm.webhookServer.webhookServer.resources.limits.memory | string | `"384Mi"` | | +| kmm.webhookServer.webhookServer.resources.requests.cpu | string | `"10m"` | | +| kmm.webhookServer.webhookServer.resources.requests.memory | 
string | `"64Mi"` | | +| kmm.webhookServer.webhookServer.tolerations[0].effect | string | `"NoSchedule"` | | +| kmm.webhookServer.webhookServer.tolerations[0].key | string | `"node-role.kubernetes.io/master"` | | +| kmm.webhookServer.webhookServer.tolerations[0].operator | string | `"Equal"` | | +| kmm.webhookServer.webhookServer.tolerations[0].value | string | `""` | | +| kmm.webhookServer.webhookServer.tolerations[1].effect | string | `"NoSchedule"` | | +| kmm.webhookServer.webhookServer.tolerations[1].key | string | `"node-role.kubernetes.io/control-plane"` | | +| kmm.webhookServer.webhookServer.tolerations[1].operator | string | `"Equal"` | | +| kmm.webhookServer.webhookServer.tolerations[1].value | string | `""` | | +| kmm.webhookService.ports[0].port | int | `443` | | +| kmm.webhookService.ports[0].protocol | string | `"TCP"` | | +| kmm.webhookService.ports[0].targetPort | int | `9443` | | +| kmm.webhookService.type | string | `"ClusterIP"` | | + diff --git a/charts/kmm/.helmignore b/charts/kmm/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/kmm/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/kmm/Chart.yaml b/charts/kmm/Chart.yaml new file mode 100644 index 0000000..96ae1fd --- /dev/null +++ b/charts/kmm/Chart.yaml @@ -0,0 +1,7 @@ +apiVersion: v2 +appVersion: v20240618-v2.1.1 +description: A Helm chart for deploying Kernel Module Management for AMD GPU Operator +kubeVersion: '>= 1.18.0-0' +name: kmm +type: application +version: v1.0.0 diff --git a/charts/kmm/crds/module-crd.yaml b/charts/kmm/crds/module-crd.yaml new file mode 100644 index 0000000..bb7bcdb --- /dev/null +++ b/charts/kmm/crds/module-crd.yaml @@ -0,0 +1,2739 @@ +--- +# Source: kmm/templates/module-crd.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: modules.kmm.sigs.x-k8s.io + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + labels: + app.kubernetes.io/component: kmm + app.kubernetes.io/part-of: kmm + helm.sh/chart: kmm-v1.0.0 + app.kubernetes.io/name: kmm + app.kubernetes.io/instance: amd-gpu + app.kubernetes.io/version: "v20240618-v2.1.1" + app.kubernetes.io/managed-by: Helm +spec: + group: kmm.sigs.x-k8s.io + names: + kind: Module + listKind: ModuleList + plural: modules + singular: module + scope: Namespaced + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: Module describes how to load a module on different kernel versions + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. 
+ Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ModuleSpec describes how the KMM operator should deploy a Module + on those nodes that need it. + properties: + devicePlugin: + description: |- + DevicePlugin allows overriding some properties of the container that deploys the device plugin on the node. + Name is ignored and is set automatically by the KMM Operator. + properties: + container: + properties: + args: + description: |- + Arguments to the entrypoint. + The container image's CMD is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. + More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell + items: + type: string + type: array + command: + description: |- + Entrypoint array. Not executed within a shell. + The container image's ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container's environment. If a variable + cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will + produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless + of whether the variable exists or not. Cannot be updated. 
+ More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell + items: + type: string + type: array + env: + description: |- + List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. Must be + a C_IDENTIFIER. + type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment variable's value. + Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or + its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['<KEY>']`, `metadata.annotations['<KEY>']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the + specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the + exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret in the pod's + namespace + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent.
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + image: + description: Image is the name of the container image that the + device plugin container will run. + type: string + imagePullPolicy: + description: |- + Image pull policy. + One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images + type: string + resources: + description: |- + Compute Resources required by this container. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + volumeMounts: + description: VolumeMounts is a list of volume mounts that are + appended to the default ones. + items: + description: VolumeMount describes a mounting of a Volume + within a container. + properties: + mountPath: + description: |- + Path within the container at which the volume should be mounted. Must + not contain ':'. + type: string + mountPropagation: + description: |- + mountPropagation determines how mounts are propagated from the host + to container and the other way around. + When not set, MountPropagationNone is used. + This field is beta in 1.10. + When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified + (which defaults to None). + type: string + name: + description: This must match the Name of a Volume. 
+ type: string + readOnly: + description: |- + Mounted read-only if true, read-write otherwise (false or unspecified). + Defaults to false. + type: boolean + recursiveReadOnly: + description: |- + RecursiveReadOnly specifies whether read-only mounts should be handled + recursively. + + If ReadOnly is false, this field has no meaning and must be unspecified. + + If ReadOnly is true, and this field is set to Disabled, the mount is not made + recursively read-only. If this field is set to IfPossible, the mount is made + recursively read-only, if it is supported by the container runtime. If this + field is set to Enabled, the mount is made recursively read-only if it is + supported by the container runtime, otherwise the pod will not be started and + an error will be generated to indicate the reason. + + If this field is set to IfPossible or Enabled, MountPropagation must be set to + None (or be unspecified, which defaults to None). + + If this field is not specified, it is treated as an equivalent of Disabled. + type: string + subPath: + description: |- + Path within the volume from which the container's volume should be mounted. + Defaults to "" (volume's root). + type: string + subPathExpr: + description: |- + Expanded path within the volume from which the container's volume should be mounted. + Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. + Defaults to "" (volume's root). + SubPathExpr and SubPath are mutually exclusive. + type: string + required: + - mountPath + - name + type: object + type: array + required: + - image + type: object + serviceAccountName: + description: |- + ServiceAccountName is the name of the ServiceAccount to use to run this pod. 
+ More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ + type: string + volumes: + items: + description: Volume represents a named volume in a pod that may + be accessed by any container in the pod. + properties: + awsElasticBlockStore: + description: |- + awsElasticBlockStore represents an AWS Disk resource that is attached to a + kubelet's host machine and then exposed to the pod. + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + properties: + fsType: + description: |- + fsType is the filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + type: string + partition: + description: |- + partition is the partition in the volume that you want to mount. + If omitted, the default is to mount by volume name. + Examples: For volume /dev/sda1, you specify the partition as "1". + Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty). + format: int32 + type: integer + readOnly: + description: |- + readOnly value true will force the readOnly setting in VolumeMounts. + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + type: boolean + volumeID: + description: |- + volumeID is unique ID of the persistent disk resource in AWS (Amazon EBS volume). + More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore + type: string + required: + - volumeID + type: object + azureDisk: + description: azureDisk represents an Azure Data Disk mount + on the host and bind mount to the pod. + properties: + cachingMode: + description: 'cachingMode is the Host Caching mode: None, + Read Only, Read Write.' 
+ type: string + diskName: + description: diskName is the Name of the data disk in + the blob storage + type: string + diskURI: + description: diskURI is the URI of data disk in the blob + storage + type: string + fsType: + default: ext4 + description: |- + fsType is Filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + kind: + description: 'kind expected values are Shared: multiple + blob disks per storage account Dedicated: single blob + disk per storage account Managed: azure managed data + disk (only in managed availability set). defaults to + shared' + type: string + readOnly: + default: false + description: |- + readOnly Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + required: + - diskName + - diskURI + type: object + azureFile: + description: azureFile represents an Azure File Service mount + on the host and bind mount to the pod. + properties: + readOnly: + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. 
+ type: boolean + secretName: + description: secretName is the name of secret that contains + Azure Storage Account Name and Key + type: string + shareName: + description: shareName is the azure share Name + type: string + required: + - secretName + - shareName + type: object + cephfs: + description: cephFS represents a Ceph FS mount on the host + that shares a pod's lifetime + properties: + monitors: + description: |- + monitors is Required: Monitors is a collection of Ceph monitors + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + items: + type: string + type: array + x-kubernetes-list-type: atomic + path: + description: 'path is Optional: Used as the mounted root, + rather than the full Ceph tree, default is /' + type: string + readOnly: + description: |- + readOnly is Optional: Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + type: boolean + secretFile: + description: |- + secretFile is Optional: SecretFile is the path to key ring for User, default is /etc/ceph/user.secret + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + type: string + secretRef: + description: |- + secretRef is Optional: SecretRef is reference to the authentication secret for User, default is empty. + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + user: + description: |- + user is optional: User is the rados user name, default is admin + More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it + type: string + required: + - monitors + type: object + cinder: + description: |- + cinder represents a cinder volume attached and mounted on kubelets host machine. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md + properties: + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md + type: string + readOnly: + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + More info: https://examples.k8s.io/mysql-cinder-pd/README.md + type: boolean + secretRef: + description: |- + secretRef is optional: points to a secret object containing parameters used to connect + to OpenStack. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + volumeID: + description: |- + volumeID used to identify the volume in cinder. 
+ More info: https://examples.k8s.io/mysql-cinder-pd/README.md + type: string + required: + - volumeID + type: object + configMap: + description: configMap represents a configMap that should + populate this volume + properties: + defaultMode: + description: |- + defaultMode is optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Defaults to 0644. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + items: + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + ConfigMap will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the ConfigMap, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within a volume. + properties: + key: + description: key is the key to project. + type: string + mode: + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. 
+ format: int32 + type: integer + path: + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + x-kubernetes-list-type: atomic + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: optional specify whether the ConfigMap or + its keys must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + csi: + description: csi (Container Storage Interface) represents + ephemeral storage that is handled by certain external CSI + drivers (Beta feature). + properties: + driver: + description: |- + driver is the name of the CSI driver that handles this volume. + Consult with your admin for the correct name as registered in the cluster. + type: string + fsType: + description: |- + fsType to mount. Ex. "ext4", "xfs", "ntfs". + If not provided, the empty value is passed to the associated CSI driver + which will determine the default filesystem to apply. + type: string + nodePublishSecretRef: + description: |- + nodePublishSecretRef is a reference to the secret object containing + sensitive information to pass to the CSI driver to complete the CSI + NodePublishVolume and NodeUnpublishVolume calls. + This field is optional, and may be empty if no secret is required. If the + secret object contains more than one secret, all secret references are passed. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. 
Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + readOnly: + description: |- + readOnly specifies a read-only configuration for the volume. + Defaults to false (read/write). + type: boolean + volumeAttributes: + additionalProperties: + type: string + description: |- + volumeAttributes stores driver-specific properties that are passed to the CSI + driver. Consult your driver's documentation for supported values. + type: object + required: + - driver + type: object + downwardAPI: + description: downwardAPI represents downward API about the + pod that should populate this volume + properties: + defaultMode: + description: |- + Optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Defaults to 0644. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + items: + description: Items is a list of downward API volume file + items: + description: DownwardAPIVolumeFile represents information + to create the file containing the pod field + properties: + fieldRef: + description: 'Required: Selects a field of the pod: + only annotations, labels, name, namespace and + uid are supported.' + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in + the specified API version.
+ type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + mode: + description: |- + Optional: mode bits used to set permissions on this file, must be an octal value + between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: 'Required: Path is the relative path + name of the file to be created. Must not be absolute + or contain the ''..'' path. Must be utf-8 encoded. + The first item of the relative path must not start + with ''..''' + type: string + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of + the exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + required: + - path + type: object + type: array + x-kubernetes-list-type: atomic + type: object + emptyDir: + description: |- + emptyDir represents a temporary directory that shares a pod's lifetime. + More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir + properties: + medium: + description: |- + medium represents what type of storage medium should back this directory. 
+ The default is "" which means to use the node's default medium. + Must be an empty string (default) or Memory. + More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir + type: string + sizeLimit: + anyOf: + - type: integer + - type: string + description: |- + sizeLimit is the total amount of local storage required for this EmptyDir volume. + The size limit is also applicable for memory medium. + The maximum usage on memory medium EmptyDir would be the minimum value between + the SizeLimit specified here and the sum of memory limits of all containers in a pod. + The default is nil which means that the limit is undefined. + More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + type: object + ephemeral: + description: |- + ephemeral represents a volume that is handled by a cluster storage driver. + The volume's lifecycle is tied to the pod that defines it - it will be created before the pod starts, + and deleted when the pod is removed. + + Use this if: + a) the volume is only needed while the pod runs, + b) features of normal volumes like restoring from snapshot or capacity + tracking are needed, + c) the storage driver is specified through a storage class, and + d) the storage driver supports dynamic volume provisioning through + a PersistentVolumeClaim (see EphemeralVolumeSource for more + information on the connection between this volume type + and PersistentVolumeClaim). + + Use PersistentVolumeClaim or one of the vendor-specific + APIs for volumes that persist for longer than the lifecycle + of an individual pod. + + Use CSI for light-weight local ephemeral volumes if the CSI driver is meant to + be used that way - see the documentation of the driver for + more information. + + A pod can use both types of ephemeral volumes and + persistent volumes at the same time. 
+ properties: + volumeClaimTemplate: + description: |- + Will be used to create a stand-alone PVC to provision the volume. + The pod in which this EphemeralVolumeSource is embedded will be the + owner of the PVC, i.e. the PVC will be deleted together with the + pod. The name of the PVC will be `<pod name>-<volume name>` where + `<volume name>` is the name from the `PodSpec.Volumes` array + entry. Pod validation will reject the pod if the concatenated name + is not valid for a PVC (for example, too long). + + An existing PVC with that name that is not owned by the pod + will *not* be used for the pod to avoid using an unrelated + volume by mistake. Starting the pod is then blocked until + the unrelated PVC is removed. If such a pre-created PVC is + meant to be used by the pod, the PVC has to be updated with an + owner reference to the pod once the pod exists. Normally + this should not be necessary, but it may be useful when + manually reconstructing a broken cluster. + + This field is read-only and no changes will be made by Kubernetes + to the PVC after it has been created. + + Required, must not be nil. + properties: + metadata: + description: |- + May contain labels and annotations that will be copied into the PVC + when creating it. No other fields are allowed and will be rejected during + validation. + type: object + spec: + description: |- + The specification for the PersistentVolumeClaim. The entire content is + copied unchanged into the PVC that gets created from this + template. The same fields as in a PersistentVolumeClaim + are also valid here. + properties: + accessModes: + description: |- + accessModes contains the desired access modes the volume should have.
+ More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1 + items: + type: string + type: array + x-kubernetes-list-type: atomic + dataSource: + description: |- + dataSource field can be used to specify either: + * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) + * An existing PVC (PersistentVolumeClaim) + If the provisioner or an external controller can support the specified data source, + it will create a new volume based on the contents of the specified data source. + When the AnyVolumeDataSource feature gate is enabled, dataSource contents will be copied to dataSourceRef, + and dataSourceRef contents will be copied to dataSource when dataSourceRef.namespace is not specified. + If the namespace is specified, then dataSourceRef will not be copied to dataSource. + properties: + apiGroup: + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource + being referenced + type: string + name: + description: Name is the name of resource + being referenced + type: string + required: + - kind + - name + type: object + x-kubernetes-map-type: atomic + dataSourceRef: + description: |- + dataSourceRef specifies the object from which to populate the volume with data, if a non-empty + volume is desired. This may be any object from a non-empty API group (non + core object) or a PersistentVolumeClaim object. + When this field is specified, volume binding will only succeed if the type of + the specified object matches some installed volume populator or dynamic + provisioner. + This field will replace the functionality of the dataSource field and as such + if both fields are non-empty, they must have the same value. 
For backwards + compatibility, when namespace isn't specified in dataSourceRef, + both fields (dataSource and dataSourceRef) will be set to the same + value automatically if one of them is empty and the other is non-empty. + When namespace is specified in dataSourceRef, + dataSource isn't set to the same value and must be empty. + There are three important differences between dataSource and dataSourceRef: + * While dataSource only allows two specific types of objects, dataSourceRef + allows any non-core object, as well as PersistentVolumeClaim objects. + * While dataSource ignores disallowed values (dropping them), dataSourceRef + preserves all values, and generates an error if a disallowed value is + specified. + * While dataSource only allows local objects, dataSourceRef allows objects + in any namespaces. + (Beta) Using this field requires the AnyVolumeDataSource feature gate to be enabled. + (Alpha) Using the namespace field of dataSourceRef requires the CrossNamespaceVolumeDataSource feature gate to be enabled. + properties: + apiGroup: + description: |- + APIGroup is the group for the resource being referenced. + If APIGroup is not specified, the specified Kind must be in the core API group. + For any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource + being referenced + type: string + name: + description: Name is the name of resource + being referenced + type: string + namespace: + description: |- + Namespace is the namespace of resource being referenced + Note that when a namespace is specified, a gateway.networking.k8s.io/ReferenceGrant object is required in the referent namespace to allow that namespace's owner to accept the reference. See the ReferenceGrant documentation for details. + (Alpha) This field requires the CrossNamespaceVolumeDataSource feature gate to be enabled. 
+ type: string + required: + - kind + - name + type: object + resources: + description: |- + resources represents the minimum resources the volume should have. + If RecoverVolumeExpansionFailure feature is enabled users are allowed to specify resource requirements + that are lower than previous value but must still be higher than capacity recorded in the + status field of the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + selector: + description: selector is a label query over volumes + to consider for binding. + properties: + matchExpressions: + description: matchExpressions is a list of + label selector requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. 
+ properties: + key: + description: key is the label key that + the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + storageClassName: + description: |- + storageClassName is the name of the StorageClass required by the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1 + type: string + volumeAttributesClassName: + description: |- + volumeAttributesClassName may be used to set the VolumeAttributesClass used by this claim. + If specified, the CSI driver will create or update the volume with the attributes defined + in the corresponding VolumeAttributesClass. This has a different purpose than storageClassName, + it can be changed after the claim is created. An empty string value means that no VolumeAttributesClass + will be applied to the claim but it's not allowed to reset this field to empty string once it is set. 
+ If unspecified and the PersistentVolumeClaim is unbound, the default VolumeAttributesClass + will be set by the persistentvolume controller if it exists. + If the resource referred to by volumeAttributesClass does not exist, this PersistentVolumeClaim will be + set to a Pending state, as reflected by the modifyVolumeStatus field, until such a resource + exists. + More info: https://kubernetes.io/docs/concepts/storage/volume-attributes-classes/ + (Beta) Using this field requires the VolumeAttributesClass feature gate to be enabled (off by default). + type: string + volumeMode: + description: |- + volumeMode defines what type of volume is required by the claim. + Value of Filesystem is implied when not included in claim spec. + type: string + volumeName: + description: volumeName is the binding reference + to the PersistentVolume backing this claim. + type: string + type: object + required: + - spec + type: object + type: object + fc: + description: fc represents a Fibre Channel resource that is + attached to a kubelet's host machine and then exposed to + the pod. + properties: + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + lun: + description: 'lun is Optional: FC target lun number' + format: int32 + type: integer + readOnly: + description: |- + readOnly is Optional: Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + targetWWNs: + description: 'targetWWNs is Optional: FC target worldwide + names (WWNs)' + items: + type: string + type: array + x-kubernetes-list-type: atomic + wwids: + description: |- + wwids Optional: FC volume world wide identifiers (wwids) + Either wwids or combination of targetWWNs and lun must be set, but not both simultaneously.
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + type: object + flexVolume: + description: |- + flexVolume represents a generic volume resource that is + provisioned/attached using an exec based plugin. + properties: + driver: + description: driver is the name of the driver to use for + this volume. + type: string + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". The default filesystem depends on FlexVolume script. + type: string + options: + additionalProperties: + type: string + description: 'options is Optional: this field holds extra + command options if any.' + type: object + readOnly: + description: |- + readOnly is Optional: defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + secretRef: + description: |- + secretRef is Optional: secretRef is reference to the secret object containing + sensitive information to pass to the plugin scripts. This may be + empty if no secret object is specified. If the secret object + contains more than one secret, all secrets are passed to the plugin + scripts. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + required: + - driver + type: object + flocker: + description: flocker represents a Flocker volume attached + to a kubelet's host machine. 
This depends on the Flocker + control service being running + properties: + datasetName: + description: |- + datasetName is Name of the dataset stored as metadata -> name on the dataset for Flocker + should be considered as deprecated + type: string + datasetUUID: + description: datasetUUID is the UUID of the dataset. This + is unique identifier of a Flocker dataset + type: string + type: object + gcePersistentDisk: + description: |- + gcePersistentDisk represents a GCE Disk resource that is attached to a + kubelet's host machine and then exposed to the pod. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + properties: + fsType: + description: |- + fsType is filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + type: string + partition: + description: |- + partition is the partition in the volume that you want to mount. + If omitted, the default is to mount by volume name. + Examples: For volume /dev/sda1, you specify the partition as "1". + Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty). + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + format: int32 + type: integer + pdName: + description: |- + pdName is unique name of the PD resource in GCE. Used to identify the disk in GCE. + More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + type: string + readOnly: + description: |- + readOnly here will force the ReadOnly setting in VolumeMounts. + Defaults to false. 
+ More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk + type: boolean + required: + - pdName + type: object + gitRepo: + description: |- + gitRepo represents a git repository at a particular revision. + DEPRECATED: GitRepo is deprecated. To provision a container with a git repo, mount an + EmptyDir into an InitContainer that clones the repo using git, then mount the EmptyDir + into the Pod's container. + properties: + directory: + description: |- + directory is the target directory name. + Must not contain or start with '..'. If '.' is supplied, the volume directory will be the + git repository. Otherwise, if specified, the volume will contain the git repository in + the subdirectory with the given name. + type: string + repository: + description: repository is the URL + type: string + revision: + description: revision is the commit hash for the specified + revision. + type: string + required: + - repository + type: object + glusterfs: + description: |- + glusterfs represents a Glusterfs mount on the host that shares a pod's lifetime. + More info: https://examples.k8s.io/volumes/glusterfs/README.md + properties: + endpoints: + description: |- + endpoints is the endpoint name that details Glusterfs topology. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod + type: string + path: + description: |- + path is the Glusterfs volume path. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod + type: string + readOnly: + description: |- + readOnly here will force the Glusterfs volume to be mounted with read-only permissions. + Defaults to false. + More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod + type: boolean + required: + - endpoints + - path + type: object + hostPath: + description: |- + hostPath represents a pre-existing file or directory on the host + machine that is directly exposed to the container. 
This is generally + used for system agents or other privileged things that are allowed + to see the host machine. Most containers will NOT need this. + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + properties: + path: + description: |- + path of the directory on the host. + If the path is a symlink, it will follow the link to the real path. + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + type: string + type: + description: |- + type for HostPath Volume + Defaults to "" + More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath + type: string + required: + - path + type: object + image: + description: |- + image represents an OCI object (a container image or artifact) pulled and mounted on the kubelet's host machine. + The volume is resolved at pod startup depending on which PullPolicy value is provided: + + - Always: the kubelet always attempts to pull the reference. Container creation will fail If the pull fails. + - Never: the kubelet never pulls the reference and only uses a local image or artifact. Container creation will fail if the reference isn't present. + - IfNotPresent: the kubelet pulls if the reference isn't already present on disk. Container creation will fail if the reference isn't present and the pull fails. + + The volume gets re-resolved if the pod gets deleted and recreated, which means that new remote content will become available on pod recreation. + A failure to resolve or pull the image during pod startup will block containers from starting and may add significant latency. Failures will be retried using normal volume backoff and will be reported on the pod reason and message. + The types of objects that may be mounted by this volume are defined by the container runtime implementation on a host machine and at minimum must include all valid types supported by the container image field. 
+ The OCI object gets mounted in a single directory (spec.containers[*].volumeMounts.mountPath) by merging the manifest layers in the same way as for container images. + The volume will be mounted read-only (ro) and non-executable files (noexec). + Sub path mounts for containers are not supported (spec.containers[*].volumeMounts.subpath). + The field spec.securityContext.fsGroupChangePolicy has no effect on this volume type. + properties: + pullPolicy: + description: |- + Policy for pulling OCI objects. Possible values are: + Always: the kubelet always attempts to pull the reference. Container creation will fail If the pull fails. + Never: the kubelet never pulls the reference and only uses a local image or artifact. Container creation will fail if the reference isn't present. + IfNotPresent: the kubelet pulls if the reference isn't already present on disk. Container creation will fail if the reference isn't present and the pull fails. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + type: string + reference: + description: |- + Required: Image or artifact reference to be used. + Behaves in the same way as pod.spec.containers[*].image. + Pull secrets will be assembled in the same way as for the container image by looking up node credentials, SA image pull secrets, and pod spec image pull secrets. + More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management to default or override + container images in workload controllers like Deployments and StatefulSets. + type: string + type: object + iscsi: + description: |- + iscsi represents an ISCSI Disk resource that is attached to a + kubelet's host machine and then exposed to the pod. 
+ More info: https://examples.k8s.io/volumes/iscsi/README.md + properties: + chapAuthDiscovery: + description: chapAuthDiscovery defines whether support + iSCSI Discovery CHAP authentication + type: boolean + chapAuthSession: + description: chapAuthSession defines whether support iSCSI + Session CHAP authentication + type: boolean + fsType: + description: |- + fsType is the filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#iscsi + type: string + initiatorName: + description: |- + initiatorName is the custom iSCSI Initiator Name. + If initiatorName is specified with iscsiInterface simultaneously, new iSCSI interface + <target portal>:<volume name> will be created for the connection. + type: string + iqn: + description: iqn is the target iSCSI Qualified Name. + type: string + iscsiInterface: + default: default + description: |- + iscsiInterface is the interface Name that uses an iSCSI transport. + Defaults to 'default' (tcp). + type: string + lun: + description: lun represents iSCSI Target Lun number. + format: int32 + type: integer + portals: + description: |- + portals is the iSCSI Target Portal List. The portal is either an IP or ip_addr:port if the port + is other than default (typically TCP ports 860 and 3260). + items: + type: string + type: array + x-kubernetes-list-type: atomic + readOnly: + description: |- + readOnly here will force the ReadOnly setting in VolumeMounts. + Defaults to false. + type: boolean + secretRef: + description: secretRef is the CHAP Secret for iSCSI target + and initiator authentication + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty.
Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + targetPortal: + description: |- + targetPortal is iSCSI Target Portal. The Portal is either an IP or ip_addr:port if the port + is other than default (typically TCP ports 860 and 3260). + type: string + required: + - iqn + - lun + - targetPortal + type: object + name: + description: |- + name of the volume. + Must be a DNS_LABEL and unique within the pod. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + nfs: + description: |- + nfs represents an NFS mount on the host that shares a pod's lifetime + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs + properties: + path: + description: |- + path that is exported by the NFS server. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs + type: string + readOnly: + description: |- + readOnly here will force the NFS export to be mounted with read-only permissions. + Defaults to false. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs + type: boolean + server: + description: |- + server is the hostname or IP address of the NFS server. + More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs + type: string + required: + - path + - server + type: object + persistentVolumeClaim: + description: |- + persistentVolumeClaimVolumeSource represents a reference to a + PersistentVolumeClaim in the same namespace. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims + properties: + claimName: + description: |- + claimName is the name of a PersistentVolumeClaim in the same namespace as the pod using this volume. 
+ More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims + type: string + readOnly: + description: |- + readOnly Will force the ReadOnly setting in VolumeMounts. + Default false. + type: boolean + required: + - claimName + type: object + photonPersistentDisk: + description: photonPersistentDisk represents a PhotonController + persistent disk attached and mounted on kubelets host machine + properties: + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + pdID: + description: pdID is the ID that identifies Photon Controller + persistent disk + type: string + required: + - pdID + type: object + portworxVolume: + description: portworxVolume represents a portworx volume attached + and mounted on kubelets host machine + properties: + fsType: + description: |- + fSType represents the filesystem type to mount + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs". Implicitly inferred to be "ext4" if unspecified. + type: string + readOnly: + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + volumeID: + description: volumeID uniquely identifies a Portworx volume + type: string + required: + - volumeID + type: object + projected: + description: projected items for all in one resources secrets, + configmaps, and downward API + properties: + defaultMode: + description: |- + defaultMode are the mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + Directories within the path are not affected by this setting. 
+ This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + sources: + description: |- + sources is the list of volume projections. Each entry in this list + handles one source. + items: + description: |- + Projection that may be projected along with other supported volume types. + Exactly one of these fields must be set. + properties: + clusterTrustBundle: + description: |- + ClusterTrustBundle allows a pod to access the `.spec.trustBundle` field + of ClusterTrustBundle objects in an auto-updating file. + + Alpha, gated by the ClusterTrustBundleProjection feature gate. + + ClusterTrustBundle objects can either be selected by name, or by the + combination of signer name and a label selector. + + Kubelet performs aggressive normalization of the PEM contents written + into the pod filesystem. Esoteric PEM features such as inter-block + comments and block headers are stripped. Certificates are deduplicated. + The ordering of certificates within the file is arbitrary, and Kubelet + may change the order over time. + properties: + labelSelector: + description: |- + Select all ClusterTrustBundles that match this label selector. Only has + effect if signerName is set. Mutually-exclusive with name. If unset, + interpreted as "match nothing". If set but empty, interpreted as "match + everything". + properties: + matchExpressions: + description: matchExpressions is a list + of label selector requirements. The requirements + are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key + that the selector applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. 
+ type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + name: + description: |- + Select a single ClusterTrustBundle by object name. Mutually-exclusive + with signerName and labelSelector. + type: string + optional: + description: |- + If true, don't block pod startup if the referenced ClusterTrustBundle(s) + aren't available. If using name, then the named ClusterTrustBundle is + allowed not to exist. If using signerName, then the combination of + signerName and labelSelector is allowed to match zero + ClusterTrustBundles. + type: boolean + path: + description: Relative path from the volume root + to write the bundle. + type: string + signerName: + description: |- + Select all ClusterTrustBundles that match this signer name. + Mutually-exclusive with name. The contents of all selected + ClusterTrustBundles will be unified and deduplicated. 
+ type: string + required: + - path + type: object + configMap: + description: configMap information about the configMap + data to project + properties: + items: + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + ConfigMap will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the ConfigMap, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within + a volume. + properties: + key: + description: key is the key to project. + type: string + mode: + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + x-kubernetes-list-type: atomic + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: optional specify whether the ConfigMap + or its keys must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + downwardAPI: + description: downwardAPI information about the downwardAPI + data to project + properties: + items: + description: Items is a list of DownwardAPIVolume + file + items: + description: DownwardAPIVolumeFile represents + information to create the file containing + the pod field + properties: + fieldRef: + description: 'Required: Selects a field + of the pod: only annotations, labels, + name, namespace and uid are supported.' + properties: + apiVersion: + description: Version of the schema + the FieldPath is written in terms + of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to + select in the specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + mode: + description: |- + Optional: mode bits used to set permissions on this file, must be an octal value + between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: 'Required: Path is the relative + path name of the file to be created. + Must not be absolute or contain the + ''..'' path. Must be utf-8 encoded. + The first item of the relative path + must not start with ''..''' + type: string + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. 
+ properties: + containerName: + description: 'Container name: required + for volumes, optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output + format of the exposed resources, + defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to + select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + required: + - path + type: object + type: array + x-kubernetes-list-type: atomic + type: object + secret: + description: secret information about the secret + data to project + properties: + items: + description: |- + items if unspecified, each key-value pair in the Data field of the referenced + Secret will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the Secret, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within + a volume. + properties: + key: + description: key is the key to project. + type: string + mode: + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. 
+ format: int32 + type: integer + path: + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. + type: string + required: + - key + - path + type: object + type: array + x-kubernetes-list-type: atomic + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: optional field specify whether + the Secret or its key must be defined + type: boolean + type: object + x-kubernetes-map-type: atomic + serviceAccountToken: + description: serviceAccountToken is information + about the serviceAccountToken data to project + properties: + audience: + description: |- + audience is the intended audience of the token. A recipient of a token + must identify itself with an identifier specified in the audience of the + token, and otherwise should reject the token. The audience defaults to the + identifier of the apiserver. + type: string + expirationSeconds: + description: |- + expirationSeconds is the requested duration of validity of the service + account token. As the token approaches expiration, the kubelet volume + plugin will proactively rotate the service account token. The kubelet will + start trying to rotate the token if the token is older than 80 percent of + its time to live or if the token is older than 24 hours. Defaults to 1 hour + and must be at least 10 minutes. + format: int64 + type: integer + path: + description: |- + path is the path relative to the mount point of the file to project the + token into.
+ type: string + required: + - path + type: object + type: object + type: array + x-kubernetes-list-type: atomic + type: object + quobyte: + description: quobyte represents a Quobyte mount on the host + that shares a pod's lifetime + properties: + group: + description: |- + group to map volume access to + Default is no group + type: string + readOnly: + description: |- + readOnly here will force the Quobyte volume to be mounted with read-only permissions. + Defaults to false. + type: boolean + registry: + description: |- + registry represents a single or multiple Quobyte Registry services + specified as a string as host:port pair (multiple entries are separated with commas) + which acts as the central registry for volumes + type: string + tenant: + description: |- + tenant owning the given Quobyte volume in the Backend + Used with dynamically provisioned Quobyte volumes, value is set by the plugin + type: string + user: + description: |- + user to map volume access to + Defaults to serviceaccount user + type: string + volume: + description: volume is a string that references an already + created Quobyte volume by name. + type: string + required: + - registry + - volume + type: object + rbd: + description: |- + rbd represents a Rados Block Device mount on the host that shares a pod's lifetime. + More info: https://examples.k8s.io/volumes/rbd/README.md + properties: + fsType: + description: |- + fsType is the filesystem type of the volume that you want to mount. + Tip: Ensure that the filesystem type is supported by the host operating system. + Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + More info: https://kubernetes.io/docs/concepts/storage/volumes#rbd + type: string + image: + description: |- + image is the rados image name. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: string + keyring: + default: /etc/ceph/keyring + description: |- + keyring is the path to key ring for RBDUser.
+ Default is /etc/ceph/keyring. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: string + monitors: + description: |- + monitors is a collection of Ceph monitors. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + items: + type: string + type: array + x-kubernetes-list-type: atomic + pool: + default: rbd + description: |- + pool is the rados pool name. + Default is rbd. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: string + readOnly: + description: |- + readOnly here will force the ReadOnly setting in VolumeMounts. + Defaults to false. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: boolean + secretRef: + description: |- + secretRef is name of the authentication secret for RBDUser. If provided + overrides keyring. + Default is nil. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + user: + default: admin + description: |- + user is the rados user name. + Default is admin. + More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it + type: string + required: + - image + - monitors + type: object + scaleIO: + description: scaleIO represents a ScaleIO persistent volume + attached and mounted on Kubernetes nodes. + properties: + fsType: + default: xfs + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". + Default is "xfs". 
+ type: string + gateway: + description: gateway is the host address of the ScaleIO + API Gateway. + type: string + protectionDomain: + description: protectionDomain is the name of the ScaleIO + Protection Domain for the configured storage. + type: string + readOnly: + description: |- + readOnly Defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + secretRef: + description: |- + secretRef references to the secret for ScaleIO user and other + sensitive information. If this is not provided, Login operation will fail. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + sslEnabled: + description: sslEnabled Flag enable/disable SSL communication + with Gateway, default false + type: boolean + storageMode: + default: ThinProvisioned + description: |- + storageMode indicates whether the storage for a volume should be ThickProvisioned or ThinProvisioned. + Default is ThinProvisioned. + type: string + storagePool: + description: storagePool is the ScaleIO Storage Pool associated + with the protection domain. + type: string + system: + description: system is the name of the storage system + as configured in ScaleIO. + type: string + volumeName: + description: |- + volumeName is the name of a volume already created in the ScaleIO system + that is associated with this volume source. + type: string + required: + - gateway + - secretRef + - system + type: object + secret: + description: |- + secret represents a secret that should populate this volume. 
+ More info: https://kubernetes.io/docs/concepts/storage/volumes#secret + properties: + defaultMode: + description: |- + defaultMode is Optional: mode bits used to set permissions on created files by default. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values + for mode bits. Defaults to 0644. + Directories within the path are not affected by this setting. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + items: + description: |- + items If unspecified, each key-value pair in the Data field of the referenced + Secret will be projected into the volume as a file whose name is the + key and content is the value. If specified, the listed keys will be + projected into the specified paths, and unlisted keys will not be + present. If a key is specified which is not present in the Secret, + the volume setup will error unless it is marked optional. Paths must be + relative and may not contain the '..' path or start with '..'. + items: + description: Maps a string key to a path within a volume. + properties: + key: + description: key is the key to project. + type: string + mode: + description: |- + mode is Optional: mode bits used to set permissions on this file. + Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. + YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. + If not specified, the volume defaultMode will be used. + This might be in conflict with other options that affect the file + mode, like fsGroup, and the result can be other mode bits set. + format: int32 + type: integer + path: + description: |- + path is the relative path of the file to map the key to. + May not be an absolute path. + May not contain the path element '..'. + May not start with the string '..'. 
+ type: string + required: + - key + - path + type: object + type: array + x-kubernetes-list-type: atomic + optional: + description: optional field specify whether the Secret + or its keys must be defined + type: boolean + secretName: + description: |- + secretName is the name of the secret in the pod's namespace to use. + More info: https://kubernetes.io/docs/concepts/storage/volumes#secret + type: string + type: object + storageos: + description: storageOS represents a StorageOS volume attached + and mounted on Kubernetes nodes. + properties: + fsType: + description: |- + fsType is the filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + readOnly: + description: |- + readOnly defaults to false (read/write). ReadOnly here will force + the ReadOnly setting in VolumeMounts. + type: boolean + secretRef: + description: |- + secretRef specifies the secret to use for obtaining the StorageOS API + credentials. If not specified, default values will be attempted. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + volumeName: + description: |- + volumeName is the human-readable name of the StorageOS volume. Volume + names are only unique within a namespace. + type: string + volumeNamespace: + description: |- + volumeNamespace specifies the scope of the volume within StorageOS. If no + namespace is specified then the Pod's namespace will be used. This allows the + Kubernetes name scoping to be mirrored within StorageOS for tighter integration. 
+ Set VolumeName to any name to override the default behaviour. + Set to "default" if you are not using namespaces within StorageOS. + Namespaces that do not pre-exist within StorageOS will be created. + type: string + type: object + vsphereVolume: + description: vsphereVolume represents a vSphere volume attached + and mounted on kubelets host machine + properties: + fsType: + description: |- + fsType is filesystem type to mount. + Must be a filesystem type supported by the host operating system. + Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. + type: string + storagePolicyID: + description: storagePolicyID is the storage Policy Based + Management (SPBM) profile ID associated with the StoragePolicyName. + type: string + storagePolicyName: + description: storagePolicyName is the storage Policy Based + Management (SPBM) profile name. + type: string + volumePath: + description: volumePath is the path that identifies vSphere + volume vmdk + type: string + required: + - volumePath + type: object + required: + - name + type: object + type: array + required: + - container + type: object + imageRepoSecret: + description: |- + ImageRepoSecret is an optional secret that is used to pull both the module loader and the device plugin, and + to push the resulting image from the module loader build, if enabled. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + moduleLoader: + description: |- + ModuleLoader allows overriding some properties of the container that loads the kernel module on the node. + Name and image are ignored and are set automatically by the KMM Operator. 
+ properties: + container: + description: Container holds the properties for the module loader + container that runs modprobe. + properties: + build: + description: Build contains build instructions. + properties: + baseImageRegistryTLS: + description: BaseImageRegistryTLS contains settings determining + how to access registries of the base images in the build-process' + Dockerfile. + properties: + insecure: + description: If Insecure is true, the operator will + be able to access a registry in an insecure (plain + HTTP) protocol. + type: boolean + insecureSkipTLSVerify: + description: If InsecureSkipTLSVerify, the operator + will accept any certificate provided by the registry. + type: boolean + type: object + buildArgs: + description: BuildArgs is an array of build variables that + are provided to the image building backend. + items: + description: BuildArg represents a build argument used + when building a container image. + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + dockerfileConfigMap: + description: ConfigMap that holds Dockerfile contents + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + kanikoParams: + description: KanikoParams is used to customize the building + process of the image. + properties: + tag: + description: Kaniko image tag to use when creating the + build Pod + type: string + type: object + secrets: + description: |- + Secrets is an optional list of secrets to be made available to the build system. + Those secrets should be used for private resources such as a private Github repo. 
+ For container registries auth use module.spec.imagePullSecret instead. + items: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: array + selector: + additionalProperties: + type: string + description: Selector describes on which nodes will run + the building process. + type: object + required: + - dockerfileConfigMap + type: object + containerImage: + description: ContainerImage is a top-level field + type: string + imagePullPolicy: + description: |- + Image pull policy. + One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. + Cannot be updated. + More info: https://kubernetes.io/docs/concepts/containers/images#updating-images + type: string + inTreeModuleToRemove: + description: |- + Deprecated: please use InTreeModulesToRemove. + InTreeModuleToRemove specifies one in-tree kernel module that should be removed (if present) + before loading the kernel module from the ContainerImage + type: string + inTreeModulesToRemove: + description: |- + InTreeModulesToRemove specifies any number of in-tree kernel modules that should be removed (if present) + before loading the kernel module from the ContainerImage + items: + type: string + type: array + kernelMappings: + description: |- + KernelMappings is a list of kernel mappings. 
+ When a node's labels match Selector, then the KMM Operator will look for the first mapping that matches its + kernel version, and use the corresponding container image to run the DriverContainer. + items: + description: |- + KernelMapping pairs kernel versions with a DriverContainer image. + Kernel versions can be matched literally or using a regular expression. + properties: + build: + description: Build enables in-cluster builds for this + mapping and allows overriding the Module's build settings. + properties: + baseImageRegistryTLS: + description: BaseImageRegistryTLS contains settings + determining how to access registries of the base + images in the build-process' Dockerfile. + properties: + insecure: + description: If Insecure is true, the operator + will be able to access a registry in an insecure + (plain HTTP) protocol. + type: boolean + insecureSkipTLSVerify: + description: If InsecureSkipTLSVerify, the operator + will accept any certificate provided by the + registry. + type: boolean + type: object + buildArgs: + description: BuildArgs is an array of build variables + that are provided to the image building backend. + items: + description: BuildArg represents a build argument + used when building a container image. + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + dockerfileConfigMap: + description: ConfigMap that holds Dockerfile contents + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + kanikoParams: + description: KanikoParams is used to customize the + building process of the image. 
+ properties: + tag: + description: Kaniko image tag to use when creating + the build Pod + type: string + type: object + secrets: + description: |- + Secrets is an optional list of secrets to be made available to the build system. + Those secrets should be used for private resources such as a private Github repo. + For container registries auth use module.spec.imagePullSecret instead. + items: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: array + selector: + additionalProperties: + type: string + description: Selector describes on which nodes will + run the building process. + type: object + required: + - dockerfileConfigMap + type: object + containerImage: + description: ContainerImage is the name of the DriverContainer + image that should be used to deploy the module. + type: string + inTreeModuleToRemove: + description: |- + Deprecated: please use InTreeModulesToRemove. + InTreeModuleToRemove specifies one in-tree kernel module that should be removed (if present) + before loading the kernel module from the ContainerImage + type: string + inTreeModulesToRemove: + description: |- + InTreeModulesToRemove specifies any number of in-tree kernel modules that should be removed (if present) + before loading the kernel module from the ContainerImage + items: + type: string + type: array + literal: + description: Literal defines a literal target kernel version + to be matched exactly against node kernels. 
+ type: string + regexp: + description: Regexp is a regular expression to be match + against node kernels. + type: string + registryTLS: + description: RegistryTLS set the TLS configs for accessing + the registry of the module-loader's image. + properties: + insecure: + description: If Insecure is true, the operator will + be able to access a registry in an insecure (plain + HTTP) protocol. + type: boolean + insecureSkipTLSVerify: + description: If InsecureSkipTLSVerify, the operator + will accept any certificate provided by the registry. + type: boolean + type: object + sign: + description: Sign enables in-cluster signing for this + mapping + properties: + certSecret: + description: a secret containing the public key used + to sign kernel modules for secureboot + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + filesToSign: + description: paths inside the image for the kernel + modules to sign (if ommited all kmods are signed) + items: + type: string + type: array + keySecret: + description: a secret containing the private key used + to sign kernel modules for secureboot + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + unsignedImage: + description: Image to sign, ignored if a Build is + present, required otherwise + type: string + unsignedImageRegistryTLS: + description: UnsignedImageRegistryTLS contains settings + determining how to access registries of the unsigned + image. + properties: + insecure: + description: If Insecure is true, the operator + will be able to access a registry in an insecure + (plain HTTP) protocol. + type: boolean + insecureSkipTLSVerify: + description: If InsecureSkipTLSVerify, the operator + will accept any certificate provided by the + registry. + type: boolean + type: object + required: + - certSecret + - keySecret + type: object + required: + - containerImage + type: object + minItems: 1 + type: array + modprobe: + description: Modprobe is a set of properties to customize which + module modprobe loads and with which properties. + properties: + args: + description: |- + Args is an optional list of arguments to be passed to modprobe before the name of the kernel module. + The resulting commands will be: `modprobe ${Args} module_name`. + properties: + load: + description: Load is an optional list of arguments to + be used when loading the kernel module. + items: + type: string + minItems: 1 + type: array + unload: + description: Unload is an optional list of arguments + to be used when unloading the kernel module. + items: + type: string + minItems: 1 + type: array + type: object + dirName: + default: /opt + description: |- + DirName is the root directory for modules. + It adds `-d ${DirName}` to the modprobe command-line. + type: string + firmwarePath: + description: |- + FirmwarePath is the path of the firmware(s). + The firmware(s) will be copied to the host for the kernel to find them. + type: string + moduleName: + description: |- + ModuleName is the name of the Module to be loaded. 
+ This field can only be unset if rawArgs is set. + type: string + modulesLoadingOrder: + description: |- + ModulesLoadingOrder defines the dependency between kernel modules loading, in case + it was not created by depmod (independent kernel modules). + The list order should be: upmost module, then the module it depends on and so on. + Example: if moduleA depends on first loading moduleB, and moduleB depends on first loading moduleC + the entry should look: + ModulesLoadingOrder: + - moduleA + - moduleB + - moduleC + In order to load all 3 modules, moduleA shoud be defined in the ModuleName parameter of this struct + items: + type: string + type: array + parameters: + description: |- + Parameters is an optional list of kernel module parameters to be provided to modprobe. + They should be in the form of key=value and will be separated by spaces in the modprobe command. + The resulting loading command will be: `modprobe module_name ${Parameters}`. + items: + type: string + type: array + rawArgs: + description: |- + If RawArgs are specified, they are passed straight to the modprobe binary; all other properties in this + object are ignored. + The resulting commands will be: `modprobe ${RawArgs}`. + properties: + load: + description: Load is an optional list of arguments to + be used when loading the kernel module. + items: + type: string + minItems: 1 + type: array + unload: + description: Unload is an optional list of arguments + to be used when unloading the kernel module. + items: + type: string + minItems: 1 + type: array + type: object + type: object + registryTLS: + description: RegistryTLS set the TLS configs for accessing the + registry of the module-loader's image. + properties: + insecure: + description: If Insecure is true, the operator will be able + to access a registry in an insecure (plain HTTP) protocol. + type: boolean + insecureSkipTLSVerify: + description: If InsecureSkipTLSVerify, the operator will + accept any certificate provided by the registry. 
+ type: boolean + type: object + sign: + description: Sign provides default kmod signing settings + properties: + certSecret: + description: a secret containing the public key used to + sign kernel modules for secureboot + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + filesToSign: + description: paths inside the image for the kernel modules + to sign (if ommited all kmods are signed) + items: + type: string + type: array + keySecret: + description: a secret containing the private key used to + sign kernel modules for secureboot + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + unsignedImage: + description: Image to sign, ignored if a Build is present, + required otherwise + type: string + unsignedImageRegistryTLS: + description: UnsignedImageRegistryTLS contains settings + determining how to access registries of the unsigned image. + properties: + insecure: + description: If Insecure is true, the operator will + be able to access a registry in an insecure (plain + HTTP) protocol. + type: boolean + insecureSkipTLSVerify: + description: If InsecureSkipTLSVerify, the operator + will accept any certificate provided by the registry. 
+ type: boolean + type: object + required: + - certSecret + - keySecret + type: object + version: + description: |- + Version defines the current version of the kernel module being used + Used for upgrading the currently loaded kernel module to a new version + type: string + required: + - kernelMappings + - modprobe + type: object + serviceAccountName: + description: |- + ServiceAccountName is the name of the ServiceAccount to use to run this pod. + More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ + type: string + required: + - container + type: object + selector: + additionalProperties: + type: string + description: Selector describes on which nodes the Module should be + loaded and optionally built. + type: object + tolerations: + description: If specified, the pod's tolerations. + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). 
Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + required: + - moduleLoader + - selector + type: object + status: + description: ModuleStatus defines the observed state of Module. + properties: + devicePlugin: + description: |- + DevicePlugin contains the status of the Device Plugin daemonset + if it was deployed during reconciliation + properties: + availableNumber: + description: number of the actually deployed and running pods + format: int32 + type: integer + desiredNumber: + description: number of the pods that should be deployed for daemonset + format: int32 + type: integer + nodesMatchingSelectorNumber: + description: number of nodes that are targeted by the module selector + format: int32 + type: integer + type: object + moduleLoader: + description: ModuleLoader contains the status of the ModuleLoader daemonset + properties: + availableNumber: + description: number of the actually deployed and running pods + format: int32 + type: integer + desiredNumber: + description: number of the pods that should be deployed for daemonset + format: int32 + type: integer + nodesMatchingSelectorNumber: + description: number of nodes that are targeted by the module selector + format: int32 + type: integer + type: object + required: + - moduleLoader + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/charts/kmm/crds/nodemodulesconfig-crd.yaml b/charts/kmm/crds/nodemodulesconfig-crd.yaml new file mode 100644 index 0000000..85977e6 --- /dev/null +++ b/charts/kmm/crds/nodemodulesconfig-crd.yaml @@ -0,0 +1,449 @@ +--- +# Source: 
kmm/templates/nodemodulesconfig-crd.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: nodemodulesconfigs.kmm.sigs.x-k8s.io + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + labels: + app.kubernetes.io/component: kmm + app.kubernetes.io/part-of: kmm + helm.sh/chart: kmm-v1.0.0 + app.kubernetes.io/name: kmm + app.kubernetes.io/instance: amd-gpu + app.kubernetes.io/version: "v20240618-v2.1.1" + app.kubernetes.io/managed-by: Helm +spec: + group: kmm.sigs.x-k8s.io + names: + kind: NodeModulesConfig + listKind: NodeModulesConfigList + plural: nodemodulesconfigs + shortNames: + - nmc + singular: nodemodulesconfig + scope: Cluster + versions: + - name: v1beta1 + schema: + openAPIV3Schema: + description: NodeModulesConfig keeps spec and state of the KMM modules on a + node. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + NodeModulesConfigSpec describes the desired state of modules on the node + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status + properties: + modules: + description: |- + Modules list the spec of all the modules that need to be executed + on the node + items: + properties: + config: + properties: + containerImage: + type: string + imagePullPolicy: + default: IfNotPresent + description: PullPolicy describes a policy for if/when to + pull a container image + type: string + inTreeModuleToRemove: + type: string + inTreeModulesToRemove: + items: + type: string + type: array + insecurePull: + description: When InsecurePull is true, the container image + can be pulled without TLS. + type: boolean + kernelVersion: + type: string + modprobe: + properties: + args: + description: |- + Args is an optional list of arguments to be passed to modprobe before the name of the kernel module. + The resulting commands will be: `modprobe ${Args} module_name`. + properties: + load: + description: Load is an optional list of arguments + to be used when loading the kernel module. + items: + type: string + minItems: 1 + type: array + unload: + description: Unload is an optional list of arguments + to be used when unloading the kernel module. + items: + type: string + minItems: 1 + type: array + type: object + dirName: + default: /opt + description: |- + DirName is the root directory for modules. + It adds `-d ${DirName}` to the modprobe command-line. + type: string + firmwarePath: + description: |- + FirmwarePath is the path of the firmware(s). + The firmware(s) will be copied to the host for the kernel to find them. + type: string + moduleName: + description: |- + ModuleName is the name of the Module to be loaded. 
+ This field can only be unset if rawArgs is set. + type: string + modulesLoadingOrder: + description: |- + ModulesLoadingOrder defines the dependency between kernel modules loading, in case + it was not created by depmod (independent kernel modules). + The list order should be: upmost module, then the module it depends on and so on. + Example: if moduleA depends on first loading moduleB, and moduleB depends on first loading moduleC + the entry should look: + ModulesLoadingOrder: + - moduleA + - moduleB + - moduleC + In order to load all 3 modules, moduleA shoud be defined in the ModuleName parameter of this struct + items: + type: string + type: array + parameters: + description: |- + Parameters is an optional list of kernel module parameters to be provided to modprobe. + They should be in the form of key=value and will be separated by spaces in the modprobe command. + The resulting loading command will be: `modprobe module_name ${Parameters}`. + items: + type: string + type: array + rawArgs: + description: |- + If RawArgs are specified, they are passed straight to the modprobe binary; all other properties in this + object are ignored. + The resulting commands will be: `modprobe ${RawArgs}`. + properties: + load: + description: Load is an optional list of arguments + to be used when loading the kernel module. + items: + type: string + minItems: 1 + type: array + unload: + description: Unload is an optional list of arguments + to be used when unloading the kernel module. + items: + type: string + minItems: 1 + type: array + type: object + type: object + required: + - containerImage + - imagePullPolicy + - insecurePull + - kernelVersion + - modprobe + type: object + imageRepoSecret: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. 
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + name: + type: string + namespace: + type: string + serviceAccountName: + type: string + tolerations: + description: tolerations define which tolerations should be added + for every load/unload pod running on the node + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. 
+ type: string + type: object + type: array + required: + - config + - name + - namespace + - serviceAccountName + type: object + type: array + type: object + status: + description: |- + NodeModuleConfigStatus is the most recently observed status of the KMM modules on node. + It is populated by the system and is read-only. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status + properties: + modules: + description: Modules contain observations about each Module's node state + status + items: + properties: + bootId: + type: string + config: + properties: + containerImage: + type: string + imagePullPolicy: + default: IfNotPresent + description: PullPolicy describes a policy for if/when to + pull a container image + type: string + inTreeModuleToRemove: + type: string + inTreeModulesToRemove: + items: + type: string + type: array + insecurePull: + description: When InsecurePull is true, the container image + can be pulled without TLS. + type: boolean + kernelVersion: + type: string + modprobe: + properties: + args: + description: |- + Args is an optional list of arguments to be passed to modprobe before the name of the kernel module. + The resulting commands will be: `modprobe ${Args} module_name`. + properties: + load: + description: Load is an optional list of arguments + to be used when loading the kernel module. + items: + type: string + minItems: 1 + type: array + unload: + description: Unload is an optional list of arguments + to be used when unloading the kernel module. + items: + type: string + minItems: 1 + type: array + type: object + dirName: + default: /opt + description: |- + DirName is the root directory for modules. + It adds `-d ${DirName}` to the modprobe command-line. + type: string + firmwarePath: + description: |- + FirmwarePath is the path of the firmware(s). + The firmware(s) will be copied to the host for the kernel to find them. 
+ type: string + moduleName: + description: |- + ModuleName is the name of the Module to be loaded. + This field can only be unset if rawArgs is set. + type: string + modulesLoadingOrder: + description: |- + ModulesLoadingOrder defines the dependency between kernel modules loading, in case + it was not created by depmod (independent kernel modules). + The list order should be: upmost module, then the module it depends on and so on. + Example: if moduleA depends on first loading moduleB, and moduleB depends on first loading moduleC + the entry should look: + ModulesLoadingOrder: + - moduleA + - moduleB + - moduleC + In order to load all 3 modules, moduleA shoud be defined in the ModuleName parameter of this struct + items: + type: string + type: array + parameters: + description: |- + Parameters is an optional list of kernel module parameters to be provided to modprobe. + They should be in the form of key=value and will be separated by spaces in the modprobe command. + The resulting loading command will be: `modprobe module_name ${Parameters}`. + items: + type: string + type: array + rawArgs: + description: |- + If RawArgs are specified, they are passed straight to the modprobe binary; all other properties in this + object are ignored. + The resulting commands will be: `modprobe ${RawArgs}`. + properties: + load: + description: Load is an optional list of arguments + to be used when loading the kernel module. + items: + type: string + minItems: 1 + type: array + unload: + description: Unload is an optional list of arguments + to be used when unloading the kernel module. + items: + type: string + minItems: 1 + type: array + type: object + type: object + required: + - containerImage + - imagePullPolicy + - insecurePull + - kernelVersion + - modprobe + type: object + imageRepoSecret: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. 
+ properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + lastTransitionTime: + format: date-time + type: string + name: + type: string + namespace: + type: string + serviceAccountName: + type: string + tolerations: + description: tolerations define which tolerations should be added + for every load/unload pod running on the node + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. 
+                            format: int64
+                            type: integer
+                          value:
+                            description: |-
+                              Value is the taint value the toleration matches to.
+                              If the operator is Exists, the value should be empty, otherwise just a regular string.
+                            type: string
+                        type: object
+                      type: array
+                  required:
+                  - name
+                  - namespace
+                  - serviceAccountName
+                  type: object
+                type: array
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources:
+      status: {}
+status:
+  acceptedNames:
+    kind: ""
+    plural: ""
+  conditions: []
+  storedVersions: []
diff --git a/charts/kmm/templates/_helpers.tpl b/charts/kmm/templates/_helpers.tpl
new file mode 100644
index 0000000..1826415
--- /dev/null
+++ b/charts/kmm/templates/_helpers.tpl
@@ -0,0 +1,62 @@
+{{/*
+Expand the name of the chart: .Values.nameOverride when set, otherwise .Chart.Name, cut to 63 characters with any trailing "-" removed.
+*/}}
+{{- define "kmm.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully qualified app name; .Values.fullnameOverride, when set, is used instead (after the same truncation).
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name; otherwise the result is "<release name>-<name>".
+*/}}
+{{- define "kmm.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create chart name and version as used by the chart label ("+" is replaced by "_", then the result is cut to 63 characters).
+*/}}
+{{- define "kmm.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels: the helm.sh/chart label, the selector labels, the app version (only when set) and the managing service.
+*/}}
+{{- define "kmm.labels" -}}
+helm.sh/chart: {{ include "kmm.chart" . }}
+{{ include "kmm.selectorLabels" .
}}
+{{- if .Chart.AppVersion }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels: the name/instance pair that the Deployments' matchLabels and the metrics Service selector are built from.
+*/}}
+{{- define "kmm.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "kmm.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Create the name of the service account to use: .Values.serviceAccount.name, falling back to the fullname when serviceAccount.create is set and to "default" otherwise.
+*/}}
+{{- define "kmm.serviceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- default (include "kmm.fullname" .) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
diff --git a/charts/kmm/templates/controller-metrics-service.yaml b/charts/kmm/templates/controller-metrics-service.yaml
new file mode 100644
index 0000000..4f17b47
--- /dev/null
+++ b/charts/kmm/templates/controller-metrics-service.yaml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Service  # exposes the controller pods; type and ports come entirely from .Values.controllerMetricsService
+metadata:
+  name: {{ include "kmm.fullname" . }}-controller-metrics-service
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+    control-plane: controller
+  {{- include "kmm.labels" . | nindent 4 }}
+spec:
+  type: {{ .Values.controllerMetricsService.type }}
+  selector:  # same fixed labels plus selector labels that the controller Deployment sets on its pod template
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+    control-plane: controller
+  {{- include "kmm.selectorLabels" . | nindent 4 }}
+  ports:  # rendered verbatim from the values list via toYaml
+  {{- .Values.controllerMetricsService.ports | toYaml | nindent 2 }}
diff --git a/charts/kmm/templates/deployment.yaml b/charts/kmm/templates/deployment.yaml
new file mode 100644
index 0000000..c7be70b
--- /dev/null
+++ b/charts/kmm/templates/deployment.yaml
@@ -0,0 +1,203 @@
+apiVersion: apps/v1
+kind: Deployment  # first of two documents in this file: the KMM controller (manager)
+metadata:
+  name: {{ include "kmm.fullname" . }}-controller
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+    control-plane: controller
+  {{- include "kmm.labels" .
| nindent 4 }}
+spec:  # controller Deployment: replica count, pod labels, scheduling hints and the manager container's environment
+  replicas: {{ .Values.controller.replicas }}
+  selector:
+    matchLabels:  # must stay identical to template.metadata.labels below
+      app.kubernetes.io/component: kmm
+      app.kubernetes.io/part-of: kmm
+      control-plane: controller
+    {{- include "kmm.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: kmm
+        app.kubernetes.io/part-of: kmm
+        control-plane: controller
+      {{- include "kmm.selectorLabels" . | nindent 8 }}
+      annotations:
+        kubectl.kubernetes.io/default-container: manager
+    spec:
+      {{- with .Values.controller.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      nodeSelector: {{- toYaml .Values.controller.nodeSelector | nindent 8 }}
+      containers:
+      - args: {{- toYaml .Values.controller.manager.args | nindent 8 }}
+        env:  # the *_PULL_SECRET entries and the proxy entries are emitted only when the corresponding value is set
+        - name: RELATED_IMAGE_WORKER
+          value: {{ quote .Values.controller.manager.env.relatedImageWorker }}
+        - name: OPERATOR_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        - name: RELATED_IMAGE_BUILD
+          value: {{ quote .Values.controller.manager.env.relatedImageBuild }}
+        - name: RELATED_IMAGE_SIGN
+          value: {{ quote .Values.controller.manager.env.relatedImageSign }}
+        - name: KUBERNETES_CLUSTER_DOMAIN
+          value: {{ quote .Values.kubernetesClusterDomain }}
+        {{- if .Values.controller.manager.env.relatedImageBuildPullSecret }}
+        - name: RELATED_IMAGE_BUILD_PULL_SECRET
+          value: {{ quote .Values.controller.manager.env.relatedImageBuildPullSecret }}  # quoted like its siblings: env values must be strings
+        {{- end }}
+        {{- if .Values.controller.manager.env.relatedImageSignPullSecret }}
+        - name: RELATED_IMAGE_SIGN_PULL_SECRET
+          value: {{ quote .Values.controller.manager.env.relatedImageSignPullSecret }}
+        {{- end }}
+        {{- if .Values.controller.manager.env.relatedImageWorkerPullSecret }}
+        - name: RELATED_IMAGE_WORKER_PULL_SECRET
+          value: {{ quote .Values.controller.manager.env.relatedImageWorkerPullSecret }}
+        {{- end }}
+        {{- if .Values.global.proxy.env | default dict }}
+        {{- range $key, $value := .Values.global.proxy.env }}
+        - name: {{ $key }}
+          value: {{ $value | quote }}
+        {{- end
}}
+        {{- end }}
+        # Tag falls back to .Chart.AppVersion; the action stays on one line because Go templates before 1.16 reject newlines inside an action.
+        image: {{ .Values.controller.manager.image.repository }}:{{ .Values.controller.manager.image.tag | default .Chart.AppVersion }}
+        imagePullPolicy: {{ .Values.controller.manager.imagePullPolicy }}
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8081
+          initialDelaySeconds: 15
+          periodSeconds: 20
+        name: manager
+        ports:
+        - containerPort: 8443
+          name: metrics
+          protocol: TCP
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: 8081
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        resources: {{- toYaml .Values.controller.manager.resources | nindent 10 }}
+        # Container-level security context from values; single-line action for the same parser-compatibility reason as the image line.
+        securityContext: {{- toYaml .Values.controller.manager.containerSecurityContext | nindent 10 }}
+        volumeMounts:
+        - mountPath: /controller_config.yaml
+          name: manager-config
+          subPath: controller_config.yaml
+      {{- if .Values.controller.manager.imagePullSecrets }}
+      imagePullSecrets:  # the value is used as a single secret name, quoted so it always renders as a string
+      - name: {{ quote .Values.controller.manager.imagePullSecrets }}
+      {{- end }}
+      securityContext:
+        runAsNonRoot: true
+      serviceAccountName: {{ include "kmm.fullname" . }}-controller
+      terminationGracePeriodSeconds: 10
+      {{- with .Values.controller.manager.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      volumes:
+      - configMap:
+          name: {{ include "kmm.fullname" . }}-manager-config
+        name: manager-config
+---
+apiVersion: apps/v1
+kind: Deployment  # second document: the webhook server
+metadata:
+  name: {{ include "kmm.fullname" . }}-webhook-server
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+    control-plane: webhook-server
+  {{- include "kmm.labels" . | nindent 4 }}
+spec:
+  replicas: {{ .Values.webhookServer.replicas }}
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: kmm
+      app.kubernetes.io/part-of: kmm
+      control-plane: webhook-server
+    {{- include "kmm.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: kmm
+        app.kubernetes.io/part-of: kmm
+        control-plane: webhook-server
+      {{- include "kmm.selectorLabels" .
| nindent 8 }}
+      annotations:
+        kubectl.kubernetes.io/default-container: webhook-server
+    spec:  # webhook-server pod: one container serving on 9443 with probes on 8081
+      {{- with .Values.webhookServer.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      nodeSelector: {{- toYaml .Values.webhookServer.nodeSelector | nindent 8 }}
+      containers:
+      - args: {{- toYaml .Values.webhookServer.webhookServer.args | nindent 8 }}
+        env:  # proxy entries are emitted only when .Values.global.proxy.env is non-empty
+        - name: KUBERNETES_CLUSTER_DOMAIN
+          value: {{ quote .Values.kubernetesClusterDomain }}
+        {{- if .Values.global.proxy.env | default dict }}
+        {{- range $key, $value := .Values.global.proxy.env }}
+        - name: {{ $key }}
+          value: {{ $value | quote }}
+        {{- end }}
+        {{- end }}
+        # Tag falls back to .Chart.AppVersion; the action stays on one line because Go templates before 1.16 reject newlines inside an action.
+        image: {{ .Values.webhookServer.webhookServer.image.repository }}:{{ .Values.webhookServer.webhookServer.image.tag | default .Chart.AppVersion }}
+        imagePullPolicy: {{ .Values.webhookServer.webhookServer.imagePullPolicy }}
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8081
+          initialDelaySeconds: 15
+          periodSeconds: 20
+        name: webhook-server
+        ports:
+        - containerPort: 9443
+          name: webhook-server
+          protocol: TCP
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: 8081
+          initialDelaySeconds: 5
+          periodSeconds: 10
+        # Resource requests/limits from values; single-line action for the same parser-compatibility reason as the image line.
+        resources: {{- toYaml .Values.webhookServer.webhookServer.resources | nindent 10 }}
+        # Container-level security context from values; single-line action as above.
+        securityContext: {{- toYaml .Values.webhookServer.webhookServer.containerSecurityContext | nindent 10 }}
+        volumeMounts:
+        - mountPath: /tmp/k8s-webhook-server/serving-certs
+          name: cert
+          readOnly: true
+        - mountPath: /controller_config.yaml
+          name: manager-config
+          subPath: controller_config.yaml
+      {{- if .Values.webhookServer.webhookServer.imagePullSecrets }}
+      imagePullSecrets:  # the value is used as a single secret name, quoted so it always renders as a string
+      - name: {{ quote .Values.webhookServer.webhookServer.imagePullSecrets }}
+      {{- end }}
+      securityContext:
+        runAsNonRoot: true
+      serviceAccountName: {{ include "kmm.fullname" . }}-controller  # same service account as the controller Deployment
+      terminationGracePeriodSeconds: 10
+      {{- with .Values.webhookServer.webhookServer.tolerations }}
+      tolerations:
+        {{- toYaml .
| nindent 8 }}
+      {{- end }}
+      volumes:
+      - name: cert
+        secret:
+          defaultMode: 420  # decimal form of octal 0644
+          secretName: kmm-operator-webhook-server-cert  # NOTE(review): fixed name, not derived from the fullname helper - confirm it matches the secret the serving certificate writes
+      - configMap:
+          name: {{ include "kmm.fullname" . }}-manager-config
+        name: manager-config
diff --git a/charts/kmm/templates/event-recorder-clusterrole-rbac.yaml b/charts/kmm/templates/event-recorder-clusterrole-rbac.yaml
new file mode 100644
index 0000000..6d86d62
--- /dev/null
+++ b/charts/kmm/templates/event-recorder-clusterrole-rbac.yaml
@@ -0,0 +1,16 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # grants only what is needed to record Events: create and patch
+metadata:
+  name: {{ include "kmm.fullname" . }}-event-recorder-clusterrole
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - ""  # core API group
+  resources:
+  - events
+  verbs:
+  - create
+  - patch
diff --git a/charts/kmm/templates/event-recorder-clusterrolebinding-rbac.yaml b/charts/kmm/templates/event-recorder-clusterrolebinding-rbac.yaml
new file mode 100644
index 0000000..2136610
--- /dev/null
+++ b/charts/kmm/templates/event-recorder-clusterrolebinding-rbac.yaml
@@ -0,0 +1,16 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # binds the event-recorder ClusterRole to the controller service account
+metadata:
+  name: {{ include "kmm.fullname" . }}-event-recorder-clusterrolebinding
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: '{{ include "kmm.fullname" . }}-event-recorder-clusterrole'
+subjects:
+- kind: ServiceAccount
+  name: '{{ include "kmm.fullname" .
}}-controller'
+  namespace: '{{ .Release.Namespace }}'
diff --git a/charts/kmm/templates/leader-election-rbac.yaml b/charts/kmm/templates/leader-election-rbac.yaml
new file mode 100644
index 0000000..d4b7df6
--- /dev/null
+++ b/charts/kmm/templates/leader-election-rbac.yaml
@@ -0,0 +1,50 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role  # namespaced: full access to ConfigMaps and Leases in the release namespace
+metadata:
+  name: {{ include "kmm.fullname" . }}-leader-election-role
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - ""  # core API group
+  resources:
+  - configmaps
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+- apiGroups:
+  - coordination.k8s.io
+  resources:
+  - leases
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding  # binds the leader-election Role to the controller service account
+metadata:
+  name: {{ include "kmm.fullname" . }}-leader-election-rolebinding
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: '{{ include "kmm.fullname" . }}-leader-election-role'
+subjects:
+- kind: ServiceAccount
+  name: '{{ include "kmm.fullname" . }}-controller'
+  namespace: '{{ .Release.Namespace }}'
diff --git a/charts/kmm/templates/manager-config.yaml b/charts/kmm/templates/manager-config.yaml
new file mode 100644
index 0000000..27f3a71
--- /dev/null
+++ b/charts/kmm/templates/manager-config.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: ConfigMap  # holds controller_config.yaml, mounted by both Deployments
+metadata:
+  name: {{ include "kmm.fullname" . }}-manager-config
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" .
| nindent 4 }}
+data:
+  # Mounted into both Deployments via subPath; the action stays on one line because Go templates before 1.16 reject newlines inside an action.
+  controller_config.yaml: {{ .Values.managerConfig.controllerConfigYaml | toYaml | indent 1 }}
diff --git a/charts/kmm/templates/manager-rbac.yaml b/charts/kmm/templates/manager-rbac.yaml
new file mode 100644
index 0000000..677acd6
--- /dev/null
+++ b/charts/kmm/templates/manager-rbac.yaml
@@ -0,0 +1,135 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # cluster-wide permissions of the controller; one rule per resource group below
+metadata:
+  name: {{ include "kmm.fullname" . }}-manager-role
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" . | nindent 4 }}
+rules:
+- apiGroups:
+  - apps
+  resources:
+  - daemonsets
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - watch
+- apiGroups:
+  - cluster.open-cluster-management.io
+  resources:
+  - clusterclaims
+  verbs:
+  - create
+  - get
+  - list
+  - watch
+- apiGroups:
+  - cluster.open-cluster-management.io
+  resourceNames:  # mutating verbs are limited to this single named ClusterClaim
+  - kernel-versions.kmm.node.kubernetes.io
+  resources:
+  - clusterclaims
+  verbs:
+  - delete
+  - patch
+  - update
+- apiGroups:
+  - ""  # core API group: read-only access
+  resources:
+  - configmaps
+  - secrets
+  - serviceaccounts
+  verbs:
+  - get
+  - list
+  - watch
+- apiGroups:
+  - ""
+  resources:
+  - namespaces
+  - nodes
+  verbs:
+  - get
+  - list
+  - patch
+  - watch
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - watch
+- apiGroups:
+  - kmm.sigs.x-k8s.io
+  resources:
+  - modules
+  verbs:
+  - get
+  - list
+  - patch
+  - update
+  - watch
+- apiGroups:
+  - kmm.sigs.x-k8s.io
+  resources:
+  - modules/status
+  - preflightvalidations/status
+  verbs:
+  - get
+  - patch
+  - update
+- apiGroups:
+  - kmm.sigs.x-k8s.io
+  resources:
+  - nodemodulesconfigs
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - watch
+- apiGroups:
+  - kmm.sigs.x-k8s.io
+  resources:
+  - nodemodulesconfigs/status
+  verbs:
+  - patch
+- apiGroups:
+  - kmm.sigs.x-k8s.io
+  resources:
+  - preflightvalidations
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding  # binds the manager ClusterRole to the controller service account
+metadata:
+  name: {{ include "kmm.fullname" . }}-manager-rolebinding
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: '{{ include "kmm.fullname" . }}-manager-role'
+subjects:
+- kind: ServiceAccount
+  name: '{{ include "kmm.fullname" . }}-controller'
+  namespace: '{{ .Release.Namespace }}'
diff --git a/charts/kmm/templates/metrics-reader-rbac.yaml b/charts/kmm/templates/metrics-reader-rbac.yaml
new file mode 100644
index 0000000..2acb712
--- /dev/null
+++ b/charts/kmm/templates/metrics-reader-rbac.yaml
@@ -0,0 +1,13 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # allows GET on the /metrics non-resource URL; no binding for it appears in this file
+metadata:
+  name: {{ include "kmm.fullname" . }}-metrics-reader
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" . | nindent 4 }}
+rules:
+- nonResourceURLs:
+  - /metrics
+  verbs:
+  - get
diff --git a/charts/kmm/templates/preflightvalidation-crd.yaml b/charts/kmm/templates/preflightvalidation-crd.yaml
new file mode 100644
index 0000000..6da0c1c
--- /dev/null
+++ b/charts/kmm/templates/preflightvalidation-crd.yaml
@@ -0,0 +1,243 @@
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: preflightvalidations.kmm.sigs.x-k8s.io
+  annotations:
+    # namespace/name reference read by cert-manager's CA injector; kept on one line because Go templates before 1.16 reject newlines inside an action.
+    cert-manager.io/inject-ca-from: '{{ .Release.Namespace }}/{{ include "kmm.fullname" . }}-serving-cert'
+    controller-gen.kubebuilder.io/version: v0.16.1
+  labels:
+    app.kubernetes.io/component: kmm
+    app.kubernetes.io/part-of: kmm
+  {{- include "kmm.labels" . | nindent 4 }}
+spec:
+  conversion:
+    strategy: Webhook
+    webhook:
+      clientConfig:
+        service:
+          name: '{{ include "kmm.fullname" .
}}-webhook-service'
+          namespace: '{{ .Release.Namespace }}'
+          path: /convert  # conversion requests are sent to this path on the webhook service named above
+      conversionReviewVersions:
+      - v1beta2
+      - v1beta1
+  group: kmm.sigs.x-k8s.io
+  names:
+    kind: PreflightValidation
+    listKind: PreflightValidationList
+    plural: preflightvalidations
+    shortNames:
+    - pfv
+    singular: preflightvalidation
+  scope: Cluster  # PreflightValidation objects are not namespaced
+  versions:
+  - deprecated: true  # v1beta1 is still served but is not the storage version (see served/storage at the end of this version entry)
+    name: v1beta1
+    schema:
+      openAPIV3Schema:
+        description: PreflightValidation initiates a preflight validations for all Modules
+          on the current Kubernetes cluster.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: |-
+              PreflightValidationSpec describes the desired state of the resource, such as the kernel version
+              that Module CRs need to be verified against as well as the debug configuration of the logs
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status
+            properties:
+              kernelVersion:
+                description: KernelVersion describes the kernel image that all Modules
+                  need to be checked against.
+                type: string
+              pushBuiltImage:
+                description: |-
+                  Boolean flag that determines whether images build during preflight must also
+                  be pushed to a defined repository
+                type: boolean
+            required:  # kernelVersion is the only mandatory spec field
+            - kernelVersion
+            type: object
+          status:
+            description: |-
+              PreflightValidationStatus is the most recently observed status of the PreflightValidation.
+              It is populated by the system and is read-only.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status
+            properties:
+              crStatuses:
+                additionalProperties:  # crStatuses is a map of status objects; what the keys identify is not stated in this schema
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        LastTransitionTime is the last time the CR status transitioned from one status to another.
+                        This should be when the underlying status changed.  If that is not known, then using the time when the API field changed is acceptable.
+                      format: date-time
+                      type: string
+                    statusReason:
+                      description: StatusReason contains a string describing the status
+                        source.
+                      type: string
+                    verificationStage:
+                      description: |-
+                        Current stage of the verification process:
+                        image (image existence verification), build(build process verification)
+                      enum:
+                      - Image
+                      - Build
+                      - Sign
+                      - Requeued
+                      - Done
+                      type: string
+                    verificationStatus:
+                      description: |-
+                        Status of Module CR verification: true (verified), false (verification failed),
+                        error (error during verification process), unknown (verification has not started yet)
+                      enum:  # NOTE(review): the description also lists error/unknown, but only these two values are accepted - confirm against upstream
+                      - "True"
+                      - "False"
+                      type: string
+                  required:
+                  - lastTransitionTime
+                  - verificationStage
+                  - verificationStatus
+                  type: object
+                description: CRStatuses contain observations about each Module's preflight
+                  upgradability validation
+                type: object
+            type: object
+        required:
+        - spec
+        type: object
+    served: true
+    storage: false  # v1beta1 is served only; the entry that follows (v1beta2) is the one marked storage: true
+    subresources:
+      status: {}
+  - name: v1beta2
+    schema:
+      openAPIV3Schema:
+        description: PreflightValidation initiates a preflight validations for all Modules
+          on the current Kubernetes cluster.
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:  # same two fields as the v1beta1 spec: kernelVersion (required) and pushBuiltImage
+            description: |-
+              PreflightValidationSpec describes the desired state of the resource, such as the kernel version
+              that Module CRs need to be verified against as well as the debug configuration of the logs
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status
+            properties:
+              kernelVersion:
+                description: KernelVersion describes the kernel image that all Modules
+                  need to be checked against.
+                type: string
+              pushBuiltImage:
+                description: |-
+                  Boolean flag that determines whether images build during preflight must also
+                  be pushed to a defined repository
+                type: boolean
+            required:
+            - kernelVersion
+            type: object
+          status:
+            description: |-
+              PreflightValidationStatus is the most recently observed status of the PreflightValidation.
+              It is populated by the system and is read-only.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status
+            properties:
+              modules:  # a list of per-Module entries here, where v1beta1 used the crStatuses map
+                description: Modules contain observations about each Module's preflight
+                  upgradability validation
+                items:
+                  properties:
+                    lastTransitionTime:
+                      description: |-
+                        LastTransitionTime is the last time the CR status transitioned from one status to another.
+ This should be when the underlying status changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + name: + description: Name is the name of the Module resource. + type: string + namespace: + description: Namespace is the namespace of the Module resource. + type: string + statusReason: + description: StatusReason contains a string describing the status + source. + type: string + verificationStage: + description: |- + Current stage of the verification process: + image (image existence verification), build(build process verification) + enum: + - Image + - Build + - Sign + - Requeued + - Done + type: string + verificationStatus: + description: |- + Status of Module CR verification: true (verified), false (verification failed), + error (error during verification process), unknown (verification has not started yet) + enum: + - "True" + - "False" + type: string + required: + - lastTransitionTime + - name + - namespace + - verificationStage + - verificationStatus + type: object + type: array + x-kubernetes-list-map-keys: + - namespace + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] \ No newline at end of file diff --git a/charts/kmm/templates/proxy-rbac.yaml b/charts/kmm/templates/proxy-rbac.yaml new file mode 100644 index 0000000..6cc30bb --- /dev/null +++ b/charts/kmm/templates/proxy-rbac.yaml @@ -0,0 +1,38 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "kmm.fullname" . }}-proxy-role + labels: + app.kubernetes.io/component: kmm + app.kubernetes.io/part-of: kmm + {{- include "kmm.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "kmm.fullname" . }}-proxy-rolebinding + labels: + app.kubernetes.io/component: kmm + app.kubernetes.io/part-of: kmm + {{- include "kmm.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "kmm.fullname" . }}-proxy-role' +subjects: +- kind: ServiceAccount + name: '{{ include "kmm.fullname" . }}-controller' + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/charts/kmm/templates/selfsigned-issuer.yaml b/charts/kmm/templates/selfsigned-issuer.yaml new file mode 100644 index 0000000..f3c128e --- /dev/null +++ b/charts/kmm/templates/selfsigned-issuer.yaml @@ -0,0 +1,8 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: {{ include "kmm.fullname" . }}-selfsigned-issuer + labels: + {{- include "kmm.labels" . | nindent 4 }} +spec: + selfSigned: {} \ No newline at end of file diff --git a/charts/kmm/templates/serviceaccount.yaml b/charts/kmm/templates/serviceaccount.yaml new file mode 100644 index 0000000..f581e45 --- /dev/null +++ b/charts/kmm/templates/serviceaccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "kmm.fullname" . }}-controller + labels: + app.kubernetes.io/component: kmm + app.kubernetes.io/part-of: kmm + {{- include "kmm.labels" . 
| nindent 4 }} + annotations: + {{- toYaml .Values.controller.serviceAccount.annotations | nindent 4 }} \ No newline at end of file diff --git a/charts/kmm/templates/serving-cert.yaml b/charts/kmm/templates/serving-cert.yaml new file mode 100644 index 0000000..fed75c4 --- /dev/null +++ b/charts/kmm/templates/serving-cert.yaml @@ -0,0 +1,15 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: {{ include "kmm.fullname" . }}-serving-cert + labels: + {{- include "kmm.labels" . | nindent 4 }} +spec: + dnsNames: + - '{{ include "kmm.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc' + - '{{ include "kmm.fullname" . }}-webhook-service.{{ .Release.Namespace }}.svc.{{ + .Values.kubernetesClusterDomain }}' + issuerRef: + kind: Issuer + name: '{{ include "kmm.fullname" . }}-selfsigned-issuer' + secretName: kmm-operator-webhook-server-cert \ No newline at end of file diff --git a/charts/kmm/templates/validating-webhook-configuration.yaml b/charts/kmm/templates/validating-webhook-configuration.yaml new file mode 100644 index 0000000..10e02d4 --- /dev/null +++ b/charts/kmm/templates/validating-webhook-configuration.yaml @@ -0,0 +1,51 @@ +apiVersion: admissionregistration.k8s.io/v1 +kind: ValidatingWebhookConfiguration +metadata: + name: {{ include "kmm.fullname" . }}-validating-webhook-configuration + annotations: + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ include "kmm.fullname" . }}-serving-cert + labels: + {{- include "kmm.labels" . | nindent 4 }} +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: '{{ include "kmm.fullname" . 
}}-webhook-service' + namespace: '{{ .Release.Namespace }}' + path: /validate--v1-namespace + failurePolicy: Fail + name: namespace-deletion.kmm.sigs.k8s.io + namespaceSelector: + matchLabels: + kmm.node.k8s.io/contains-modules: "" + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - DELETE + resources: + - namespaces + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: '{{ include "kmm.fullname" . }}-webhook-service' + namespace: '{{ .Release.Namespace }}' + path: /validate-kmm-sigs-x-k8s-io-v1beta1-module + failurePolicy: Fail + name: vmodule.kb.io + rules: + - apiGroups: + - kmm.sigs.x-k8s.io + apiVersions: + - v1beta1 + operations: + - CREATE + - UPDATE + resources: + - modules + sideEffects: None \ No newline at end of file diff --git a/charts/kmm/templates/webhook-service.yaml b/charts/kmm/templates/webhook-service.yaml new file mode 100644 index 0000000..90c7547 --- /dev/null +++ b/charts/kmm/templates/webhook-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "kmm.fullname" . }}-webhook-service + labels: + app.kubernetes.io/component: kmm + app.kubernetes.io/created-by: kernel-module-management + app.kubernetes.io/part-of: kmm + {{- include "kmm.labels" . | nindent 4 }} +spec: + type: {{ .Values.webhookService.type }} + selector: + app.kubernetes.io/component: kmm + app.kubernetes.io/part-of: kmm + control-plane: webhook-server + {{- include "kmm.selectorLabels" . 
| nindent 4 }} + ports: + {{- .Values.webhookService.ports | toYaml | nindent 2 }} diff --git a/charts/kmm/values.yaml b/charts/kmm/values.yaml new file mode 100644 index 0000000..0e5fc15 --- /dev/null +++ b/charts/kmm/values.yaml @@ -0,0 +1,133 @@ +controller: + manager: + args: + - --config=controller_config.yaml + containerSecurityContext: + allowPrivilegeEscalation: false + env: + # -- KMM kaniko builder image for building driver image within cluster + relatedImageBuild: gcr.io/kaniko-project/executor:v1.23.2 + # -- KMM signer image for signing driver image's kernel module with given key pairs within cluster + relatedImageSign: docker.io/rocm/kernel-module-management-signimage:v1.4.0 + # -- KMM worker image for loading / unloading driver kernel module on worker nodes + relatedImageWorker: docker.io/rocm/kernel-module-management-worker:v1.4.0 + # -- Image pull secret name for pulling KMM kaniko builder image if registry needs credential to pull image + relatedImageBuildPullSecret: "" + # -- Image pull secret name for pulling KMM signer image if registry needs credential to pull image + relatedImageSignPullSecret: "" + # -- Image pull secret name for pulling KMM worker image if registry needs credential to pull image + relatedImageWorkerPullSecret: "" + image: + # -- KMM controller manager image repository + repository: docker.io/rocm/kernel-module-management-operator + # -- KMM controller manager image tag + tag: v1.4.0 + # -- Image pull policy for KMM controller manager pod + imagePullPolicy: Always + # -- Image pull secret name for pulling KMM controller manager image if registry needs credential to pull image + imagePullSecrets: "" + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + resources: + limits: + cpu: 500m + memory: 384Mi + requests: + cpu: 10m + memory: 64Mi + # -- Node selector for the 
KMM controller manager deployment + nodeSelector: {} + # -- Affinity for the KMM controller manager deployment + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + replicas: 1 + serviceAccount: + annotations: {} +controllerMetricsService: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + type: ClusterIP +kubernetesClusterDomain: cluster.local +managerConfig: + controllerConfigYaml: |- + healthProbeBindAddress: :8081 + webhookPort: 9443 + leaderElection: + enabled: true + resourceID: kmm.sigs.x-k8s.io + metrics: + enableAuthnAuthz: true + bindAddress: 0.0.0.0:8443 + secureServing: true + worker: + runAsUser: 0 + seLinuxType: spc_t + firmwareHostPath: /var/lib/firmware +webhookServer: + replicas: 1 + # -- KMM webhook's deployment node selector + nodeSelector: {} + # -- KMM webhook's deployment affinity configs + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + webhookServer: + args: + - --config=controller_config.yaml + - --enable-module + - --enable-namespace + - --enable-preflightvalidation + containerSecurityContext: + allowPrivilegeEscalation: false + image: + # -- KMM webhook image repository + repository: docker.io/rocm/kernel-module-management-webhook-server + # -- KMM webhook image tag + tag: v1.4.0 + # -- Image pull policy for KMM webhook pod + imagePullPolicy: Always + # -- Image pull secret name for pulling KMM webhook image if registry needs credential to pull image + imagePullSecrets: "" + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + resources: + limits: + cpu: 500m + memory: 
384Mi + requests: + cpu: 10m + memory: 64Mi +webhookService: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + type: ClusterIP diff --git a/charts/node-feature-discovery/.helmignore b/charts/node-feature-discovery/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/node-feature-discovery/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/node-feature-discovery/Chart.yaml b/charts/node-feature-discovery/Chart.yaml new file mode 100644 index 0000000..771e3fe --- /dev/null +++ b/charts/node-feature-discovery/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +appVersion: v0.16.1 +description: 'Detects hardware features available on each node in a Kubernetes cluster, + and advertises those features using node labels. ' +home: https://github.com/kubernetes-sigs/node-feature-discovery +keywords: +- feature-discovery +- feature-detection +- node-labels +name: node-feature-discovery +sources: +- https://github.com/kubernetes-sigs/node-feature-discovery +type: application +version: 0.16.1 diff --git a/charts/node-feature-discovery/README.md b/charts/node-feature-discovery/README.md new file mode 100644 index 0000000..93734f8 --- /dev/null +++ b/charts/node-feature-discovery/README.md @@ -0,0 +1,10 @@ +# Node Feature Discovery + +Node Feature Discovery (NFD) is a Kubernetes add-on for detecting hardware +features and system configuration. Detected features are advertised as node +labels. NFD provides flexible configuration and extension points for a wide +range of vendor and application specific node labeling needs. 
+ +See +[NFD documentation](https://kubernetes-sigs.github.io/node-feature-discovery/v0.16/deployment/helm.html) +for deployment instructions. diff --git a/charts/node-feature-discovery/crds/nfd-api-crds.yaml b/charts/node-feature-discovery/crds/nfd-api-crds.yaml new file mode 100644 index 0000000..0a73c5d --- /dev/null +++ b/charts/node-feature-discovery/crds/nfd-api-crds.yaml @@ -0,0 +1,710 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodefeatures.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeature + listKind: NodeFeatureList + plural: nodefeatures + singular: nodefeature + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + NodeFeature resource holds the features discovered for one node in the + cluster. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Specification of the NodeFeature, containing features discovered + for a node. + properties: + features: + description: Features is the full "raw" features data that has been + discovered. + properties: + attributes: + additionalProperties: + description: AttributeFeatureSet is a set of features having + string value. 
+ properties: + elements: + additionalProperties: + type: string + description: Individual features of the feature set. + type: object + required: + - elements + type: object + description: Attributes contains all the attribute-type features + of the node. + type: object + flags: + additionalProperties: + description: FlagFeatureSet is a set of simple features only + containing names without values. + properties: + elements: + additionalProperties: + description: Nil is a dummy empty struct for protobuf + compatibility + type: object + description: Individual features of the feature set. + type: object + required: + - elements + type: object + description: Flags contains all the flag-type features of the + node. + type: object + instances: + additionalProperties: + description: InstanceFeatureSet is a set of features each of + which is an instance having multiple attributes. + properties: + elements: + description: Individual features of the feature set. + items: + description: InstanceFeature represents one instance of + a complex features, e.g. a device. + properties: + attributes: + additionalProperties: + type: string + description: Attributes of the instance feature. + type: object + required: + - attributes + type: object + type: array + required: + - elements + type: object + description: Instances contains all the instance-type features + of the node. + type: object + type: object + labels: + additionalProperties: + type: string + description: Labels is the set of node labels that are requested to + be created. 
+ type: object + type: object + required: + - spec + type: object + served: true + storage: true +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodefeaturegroups.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeatureGroup + listKind: NodeFeatureGroupList + plural: nodefeaturegroups + shortNames: + - nfg + singular: nodefeaturegroup + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeFeatureGroup resource holds Node pools by featureGroup + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the rules to be evaluated. + properties: + featureGroupRules: + description: List of rules to evaluate to determine nodes that belong + in this group. + items: + description: GroupRule defines a rule for nodegroup filtering. + properties: + matchAny: + description: MatchAny specifies a list of matchers one of which + must match. + items: + description: MatchAnyElem specifies one sub-matcher of MatchAny. + properties: + matchFeatures: + description: MatchFeatures specifies a set of matcher + terms all of which must match. 
+ items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature + set to match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. 
Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + required: + - matchFeatures + type: object + type: array + matchFeatures: + description: MatchFeatures specifies a set of matcher terms + all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature set to + match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. 
+ type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + name: + description: Name of the rule. + type: string + required: + - name + type: object + type: array + required: + - featureGroupRules + type: object + status: + description: |- + Status of the NodeFeatureGroup after the most recent evaluation of the + specification. + properties: + nodes: + description: Nodes is a list of FeatureGroupNode in the cluster that + match the featureGroupRules + items: + properties: + name: + description: Name of the node. 
+ type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: nodefeaturerules.nfd.k8s-sigs.io +spec: + group: nfd.k8s-sigs.io + names: + kind: NodeFeatureRule + listKind: NodeFeatureRuleList + plural: nodefeaturerules + shortNames: + - nfr + singular: nodefeaturerule + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + NodeFeatureRule resource specifies a configuration for feature-based + customization of node objects, such as node labeling. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Spec defines the rules to be evaluated. + properties: + rules: + description: Rules is a list of node customization rules. + items: + description: Rule defines a rule for node customization such as + labeling. + properties: + annotations: + additionalProperties: + type: string + description: Annotations to create if the rule matches. 
+ type: object + extendedResources: + additionalProperties: + type: string + description: ExtendedResources to create if the rule matches. + type: object + labels: + additionalProperties: + type: string + description: Labels to create if the rule matches. + type: object + labelsTemplate: + description: |- + LabelsTemplate specifies a template to expand for dynamically generating + multiple labels. Data (after template expansion) must be keys with an + optional value ([=]) separated by newlines. + type: string + matchAny: + description: MatchAny specifies a list of matchers one of which + must match. + items: + description: MatchAnyElem specifies one sub-matcher of MatchAny. + properties: + matchFeatures: + description: MatchFeatures specifies a set of matcher + terms all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature + set to match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. 
+ items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + required: + - matchFeatures + type: object + type: array + matchFeatures: + description: MatchFeatures specifies a set of matcher terms + all of which must match. + items: + description: |- + FeatureMatcherTerm defines requirements against one feature set. All + requirements (specified as MatchExpressions) are evaluated against each + element in the feature set. + properties: + feature: + description: Feature is the name of the feature set to + match against. + type: string + matchExpressions: + additionalProperties: + description: |- + MatchExpression specifies an expression to evaluate against a set of input + values. It contains an operator that is applied when matching the input and + an array of values that the operator evaluates the input against. + properties: + op: + description: Op is the operator to be applied. 
+ enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + description: |- + MatchExpressions is the set of per-element expressions evaluated. These + match against the value of the specified elements. + type: object + matchName: + description: |- + MatchName in an expression that is matched against the name of each + element in the feature set. + properties: + op: + description: Op is the operator to be applied. + enum: + - In + - NotIn + - InRegexp + - Exists + - DoesNotExist + - Gt + - Lt + - GtLt + - IsTrue + - IsFalse + type: string + value: + description: |- + Value is the list of values that the operand evaluates the input + against. Value should be empty if the operator is Exists, DoesNotExist, + IsTrue or IsFalse. Value should contain exactly one element if the + operator is Gt or Lt and exactly two elements if the operator is GtLt. + In other cases Value should contain at least one element. + items: + type: string + type: array + required: + - op + type: object + required: + - feature + type: object + type: array + name: + description: Name of the rule. + type: string + taints: + description: Taints to create if the rule matches. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. 
+ type: string + key: + description: Required. The taint key to be applied to + a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint + key. + type: string + required: + - effect + - key + type: object + type: array + vars: + additionalProperties: + type: string + description: |- + Vars is the variables to store if the rule matches. Variables do not + directly inflict any changes in the node object. However, they can be + referenced from other rules enabling more complex rule hierarchies, + without exposing intermediary output values as labels. + type: object + varsTemplate: + description: |- + VarsTemplate specifies a template to expand for dynamically generating + multiple variables. Data (after template expansion) must be keys with an + optional value ([=]) separated by newlines. + type: string + required: + - name + type: object + type: array + required: + - rules + type: object + required: + - spec + type: object + served: true + storage: true diff --git a/charts/node-feature-discovery/templates/_helpers.tpl b/charts/node-feature-discovery/templates/_helpers.tpl new file mode 100644 index 0000000..928ece7 --- /dev/null +++ b/charts/node-feature-discovery/templates/_helpers.tpl @@ -0,0 +1,107 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "node-feature-discovery.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "node-feature-discovery.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Allow the release namespace to be overridden for multi-namespace deployments in combined charts +*/}} +{{- define "node-feature-discovery.namespace" -}} + {{- if .Values.namespaceOverride -}} + {{- .Values.namespaceOverride -}} + {{- else -}} + {{- .Release.Namespace -}} + {{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "node-feature-discovery.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Common labels +*/}} +{{- define "node-feature-discovery.labels" -}} +helm.sh/chart: {{ include "node-feature-discovery.chart" . }} +{{ include "node-feature-discovery.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "node-feature-discovery.selectorLabels" -}} +app.kubernetes.io/name: {{ include "node-feature-discovery.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* +Create the name of the service account which the nfd master will use +*/}} +{{- define "node-feature-discovery.master.serviceAccountName" -}} +{{- if .Values.master.serviceAccount.create -}} + {{ default (include "node-feature-discovery.fullname" .) 
.Values.master.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.master.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +Create the name of the service account which the nfd worker will use +*/}} +{{- define "node-feature-discovery.worker.serviceAccountName" -}} +{{- if .Values.worker.serviceAccount.create -}} + {{ default (printf "%s-worker" (include "node-feature-discovery.fullname" .)) .Values.worker.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.worker.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +Create the name of the service account which topologyUpdater will use +*/}} +{{- define "node-feature-discovery.topologyUpdater.serviceAccountName" -}} +{{- if .Values.topologyUpdater.serviceAccount.create -}} + {{ default (printf "%s-topology-updater" (include "node-feature-discovery.fullname" .)) .Values.topologyUpdater.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.topologyUpdater.serviceAccount.name }} +{{- end -}} +{{- end -}} + +{{/* +Create the name of the service account which nfd-gc will use +*/}} +{{- define "node-feature-discovery.gc.serviceAccountName" -}} +{{- if .Values.gc.serviceAccount.create -}} + {{ default (printf "%s-gc" (include "node-feature-discovery.fullname" .)) .Values.gc.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.gc.serviceAccount.name }} +{{- end -}} +{{- end -}} diff --git a/charts/node-feature-discovery/templates/cert-manager-certs.yaml b/charts/node-feature-discovery/templates/cert-manager-certs.yaml new file mode 100644 index 0000000..2d15760 --- /dev/null +++ b/charts/node-feature-discovery/templates/cert-manager-certs.yaml @@ -0,0 +1,80 @@ +{{- if .Values.tls.certManager }} +{{- if .Values.master.enable }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nfd-master-cert + namespace: {{ include "node-feature-discovery.namespace" . 
}} +spec: + secretName: nfd-master-cert + subject: + organizations: + - node-feature-discovery + commonName: nfd-master + dnsNames: + # must match the service name + - {{ include "node-feature-discovery.fullname" . }}-master + # first one is configured for use by the worker; below are for completeness + - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc + - {{ include "node-feature-discovery.fullname" . }}-master.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local + issuerRef: + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} + kind: Issuer + {{- end }} + group: cert-manager.io +{{- end }} +--- +{{- if .Values.worker.enable }} +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nfd-worker-cert + namespace: {{ include "node-feature-discovery.namespace" . }} +spec: + secretName: nfd-worker-cert + subject: + organizations: + - node-feature-discovery + commonName: nfd-worker + dnsNames: + - {{ include "node-feature-discovery.fullname" . }}-worker.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local + issuerRef: + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} + kind: Issuer + {{- end }} + group: cert-manager.io +{{- end }} + +{{- if .Values.topologyUpdater.enable }} +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nfd-topology-updater-cert + namespace: {{ include "node-feature-discovery.namespace" . 
}} +spec: + secretName: nfd-topology-updater-cert + subject: + organizations: + - node-feature-discovery + commonName: nfd-topology-updater + dnsNames: + - {{ include "node-feature-discovery.fullname" . }}-topology-updater.{{ include "node-feature-discovery.namespace" . }}.svc.cluster.local + issuerRef: + name: {{ default "nfd-ca-issuer" .Values.tls.certManagerCertificate.issuerName }} + {{- if and .Values.tls.certManagerCertificate.issuerName .Values.tls.certManagerCertificate.issuerKind }} + kind: {{ .Values.tls.certManagerCertificate.issuerKind }} + {{- else }} + kind: Issuer + {{- end }} + group: cert-manager.io +{{- end }} + +{{- end }} diff --git a/charts/node-feature-discovery/templates/cert-manager-issuer.yaml b/charts/node-feature-discovery/templates/cert-manager-issuer.yaml new file mode 100644 index 0000000..8744689 --- /dev/null +++ b/charts/node-feature-discovery/templates/cert-manager-issuer.yaml @@ -0,0 +1,42 @@ +{{- if and .Values.tls.certManager (not .Values.tls.certManagerCertificate.issuerName ) }} +# See https://cert-manager.io/docs/configuration/selfsigned/#bootstrapping-ca-issuers +# - Create a self signed issuer +# - Use this to create a CA cert +# - Use this to now create a CA issuer +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: nfd-ca-bootstrap + namespace: {{ include "node-feature-discovery.namespace" . }} +spec: + selfSigned: {} + +--- +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: nfd-ca-cert + namespace: {{ include "node-feature-discovery.namespace" . }} +spec: + isCA: true + secretName: nfd-ca-cert + subject: + organizations: + - node-feature-discovery + commonName: nfd-ca-cert + issuerRef: + name: nfd-ca-bootstrap + kind: Issuer + group: cert-manager.io + +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: nfd-ca-issuer + namespace: {{ include "node-feature-discovery.namespace" . 
}} +spec: + ca: + secretName: nfd-ca-cert +{{- end }} diff --git a/charts/node-feature-discovery/templates/clusterrole.yaml b/charts/node-feature-discovery/templates/clusterrole.yaml new file mode 100644 index 0000000..f935cfe --- /dev/null +++ b/charts/node-feature-discovery/templates/clusterrole.yaml @@ -0,0 +1,133 @@ +{{- if and .Values.master.enable .Values.master.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-feature-discovery.fullname" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/status + verbs: + - get + - patch + - update + - list +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + - nodefeaturerules + - nodefeaturegroups + verbs: + - get + - list + - watch +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeaturegroups/status + verbs: + - patch + - update +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create +- apiGroups: + - coordination.k8s.io + resources: + - leases + resourceNames: + - "nfd-master.nfd.kubernetes.io" + verbs: + - get + - update +{{- end }} + +{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater + labels: + {{- include "node-feature-discovery.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get +- apiGroups: + - "" + resources: + - nodes/proxy + verbs: + - get +- apiGroups: + - "" + resources: + - pods + verbs: + - get +- apiGroups: + - topology.node.k8s.io + resources: + - noderesourcetopologies + verbs: + - create + - get + - update +{{- end }} + +{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-gc + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - list + - watch +- apiGroups: + - "" + resources: + - nodes/proxy + verbs: + - get +- apiGroups: + - topology.node.k8s.io + resources: + - noderesourcetopologies + verbs: + - delete + - list +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + verbs: + - delete + - list +{{- end }} diff --git a/charts/node-feature-discovery/templates/clusterrolebinding.yaml b/charts/node-feature-discovery/templates/clusterrolebinding.yaml new file mode 100644 index 0000000..3f71798 --- /dev/null +++ b/charts/node-feature-discovery/templates/clusterrolebinding.yaml @@ -0,0 +1,52 @@ +{{- if and .Values.master.enable .Values.master.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "node-feature-discovery.fullname" . }} +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.master.serviceAccountName" . 
}} + namespace: {{ include "node-feature-discovery.namespace" . }} +{{- end }} + +{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.rbac.create }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} +{{- end }} + +{{- if and .Values.gc.enable .Values.gc.rbac.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-gc + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "node-feature-discovery.fullname" . }}-gc +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.gc.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} +{{- end }} diff --git a/charts/node-feature-discovery/templates/master.yaml b/charts/node-feature-discovery/templates/master.yaml new file mode 100644 index 0000000..3a58420 --- /dev/null +++ b/charts/node-feature-discovery/templates/master.yaml @@ -0,0 +1,151 @@ +{{- if .Values.master.enable }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-master + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . 
| nindent 4 }} + role: master + {{- with .Values.master.deploymentAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + replicas: {{ .Values.master.replicaCount }} + selector: + matchLabels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} + role: master + template: + metadata: + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} + role: master + {{- with .Values.master.annotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "node-feature-discovery.master.serviceAccountName" . }} + enableServiceLinks: false + securityContext: + {{- toYaml .Values.master.podSecurityContext | nindent 8 }} + containers: + - name: master + securityContext: + {{- toYaml .Values.master.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + livenessProbe: + grpc: + port: 8082 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + grpc: + port: 8082 + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 10 + ports: + - containerPort: {{ .Values.master.port | default "8080" }} + name: grpc + - containerPort: {{ .Values.master.metricsPort | default "8081" }} + name: metrics + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - "nfd-master" + resources: + {{- toYaml .Values.master.resources | nindent 12 }} + args: + {{- if .Values.master.instance | empty | not }} + - "-instance={{ .Values.master.instance }}" + {{- end }} + {{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }} + - "-port={{ .Values.master.port | default "8080" }}" + {{- else if gt (int 
.Values.master.replicaCount) 1 }} + - "-enable-leader-election" + {{- end }} + {{- if .Values.master.extraLabelNs | empty | not }} + - "-extra-label-ns={{- join "," .Values.master.extraLabelNs }}" + {{- end }} + {{- if .Values.master.denyLabelNs | empty | not }} + - "-deny-label-ns={{- join "," .Values.master.denyLabelNs }}" + {{- end }} + {{- if .Values.master.resourceLabels | empty | not }} + - "-resource-labels={{- join "," .Values.master.resourceLabels }}" + {{- end }} + {{- if .Values.master.enableTaints }} + - "-enable-taints" + {{- end }} + {{- if .Values.master.crdController | kindIs "invalid" | not }} + - "-crd-controller={{ .Values.master.crdController }}" + {{- else }} + ## By default, disable crd controller for other than the default instances + - "-crd-controller={{ .Values.master.instance | empty }}" + {{- end }} + {{- if .Values.master.featureRulesController | kindIs "invalid" | not }} + - "-featurerules-controller={{ .Values.master.featureRulesController }}" + {{- end }} + {{- if .Values.master.resyncPeriod }} + - "-resync-period={{ .Values.master.resyncPeriod }}" + {{- end }} + {{- if .Values.master.nfdApiParallelism | empty | not }} + - "-nfd-api-parallelism={{ .Values.master.nfdApiParallelism }}" + {{- end }} + {{- if .Values.tls.enable }} + - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" + - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" + - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" + {{- end }} + # Go over featureGates and add the feature-gate flag + {{- range $key, $value := .Values.featureGates }} + - "-feature-gates={{ $key }}={{ $value }}" + {{- end }} + - "-metrics={{ .Values.master.metricsPort | default "8081" }}" + volumeMounts: + {{- if .Values.tls.enable }} + - name: nfd-master-cert + mountPath: "/etc/kubernetes/node-feature-discovery/certs" + readOnly: true + {{- end }} + - name: nfd-master-conf + mountPath: "/etc/kubernetes/node-feature-discovery" + readOnly: true + volumes: 
+        {{- if .Values.tls.enable }}
+        - name: nfd-master-cert
+          secret:
+            secretName: nfd-master-cert
+        {{- end }}
+        - name: nfd-master-conf
+          configMap:
+            name: {{ include "node-feature-discovery.fullname" . }}-master-conf
+            items:
+              - key: nfd-master.conf
+                path: nfd-master.conf
+    {{- with .Values.master.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+    {{- with .Values.master.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+    {{- with .Values.master.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+{{- end }}
diff --git a/charts/node-feature-discovery/templates/nfd-gc.yaml b/charts/node-feature-discovery/templates/nfd-gc.yaml
new file mode 100644
index 0000000..4f4ac76
--- /dev/null
+++ b/charts/node-feature-discovery/templates/nfd-gc.yaml
@@ -0,0 +1,82 @@
+{{- if and .Values.gc.enable (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}-gc
+  namespace: {{ include "node-feature-discovery.namespace" . }}
+  labels:
+    {{- include "node-feature-discovery.labels" . | nindent 4 }}
+    role: gc
+  {{- with .Values.gc.deploymentAnnotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  replicas: {{ .Values.gc.replicaCount | default 1 }}
+  selector:
+    matchLabels:
+      {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }}
+      role: gc
+  template:
+    metadata:
+      labels:
+        {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }}
+        role: gc
+      {{- with .Values.gc.annotations }}
+      annotations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    spec:
+      serviceAccountName: {{ include "node-feature-discovery.gc.serviceAccountName" . }}
+      dnsPolicy: ClusterFirstWithHostNet
+    {{- with .Values.priorityClassName }}
+      priorityClassName: {{ . }}
+    {{- end }}
+    {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+      securityContext:
+        {{- toYaml .Values.gc.podSecurityContext | nindent 8 }}
+      containers:
+      - name: gc
+        image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+        imagePullPolicy: "{{ .Values.image.pullPolicy }}"
+        env:
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        command:
+          - "nfd-gc"
+        args:
+          {{- if .Values.gc.interval | empty | not }}
+          - "-gc-interval={{ .Values.gc.interval }}"
+          {{- end }}
+          # Serve metrics on the same port that is advertised as the "metrics" containerPort below.
+          - "-metrics={{ .Values.gc.metricsPort | default "8081" }}"
+        resources:
+          {{- toYaml .Values.gc.resources | nindent 12 }}
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop: [ "ALL" ]
+          readOnlyRootFilesystem: true
+          runAsNonRoot: true
+        ports:
+          - name: metrics
+            containerPort: {{ .Values.gc.metricsPort | default "8081"}}
+
+    {{- with .Values.gc.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+    {{- with .Values.gc.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+    {{- with .Values.gc.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+    {{- end }}
+{{- end }}
diff --git a/charts/node-feature-discovery/templates/nfd-master-conf.yaml b/charts/node-feature-discovery/templates/nfd-master-conf.yaml
new file mode 100644
index 0000000..9c6e01c
--- /dev/null
+++ b/charts/node-feature-discovery/templates/nfd-master-conf.yaml
@@ -0,0 +1,12 @@
+{{- if .Values.master.enable }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}-master-conf
+  namespace: {{ include "node-feature-discovery.namespace" . }}
+  labels:
+    {{- include "node-feature-discovery.labels" .
| nindent 4 }}
+data:
+  nfd-master.conf: |-
+    {{- .Values.master.config | toYaml | nindent 4 }}
+{{- end }}
diff --git a/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml b/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml
new file mode 100644
index 0000000..8d03aa2
--- /dev/null
+++ b/charts/node-feature-discovery/templates/nfd-topologyupdater-conf.yaml
@@ -0,0 +1,12 @@
+{{- if .Values.topologyUpdater.enable -}}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}-topology-updater-conf
+  namespace: {{ include "node-feature-discovery.namespace" . }}
+  labels:
+    {{- include "node-feature-discovery.labels" . | nindent 4 }}
+data:
+  nfd-topology-updater.conf: |-
+    {{- .Values.topologyUpdater.config | toYaml | nindent 4 }}
+{{- end }}
diff --git a/charts/node-feature-discovery/templates/nfd-worker-conf.yaml b/charts/node-feature-discovery/templates/nfd-worker-conf.yaml
new file mode 100644
index 0000000..a2299de
--- /dev/null
+++ b/charts/node-feature-discovery/templates/nfd-worker-conf.yaml
@@ -0,0 +1,12 @@
+{{- if .Values.worker.enable }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}-worker-conf
+  namespace: {{ include "node-feature-discovery.namespace" . }}
+  labels:
+    {{- include "node-feature-discovery.labels" . | nindent 4 }}
+data:
+  nfd-worker.conf: |-
+    {{- .Values.worker.config | toYaml | nindent 4 }}
+{{- end }}
diff --git a/charts/node-feature-discovery/templates/post-delete-job.yaml b/charts/node-feature-discovery/templates/post-delete-job.yaml
new file mode 100644
index 0000000..4364f1a
--- /dev/null
+++ b/charts/node-feature-discovery/templates/post-delete-job.yaml
@@ -0,0 +1,99 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}-prune
+  namespace: {{ include "node-feature-discovery.namespace" . }}
+  labels:
+    {{- include "node-feature-discovery.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": post-delete
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}-prune
+  labels:
+    {{- include "node-feature-discovery.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": post-delete
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - nodes
+  - nodes/status
+  verbs:
+  - get
+  - patch
+  - update
+  - list
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}-prune
+  labels:
+    {{- include "node-feature-discovery.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": post-delete
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "node-feature-discovery.fullname" . }}-prune
+subjects:
+- kind: ServiceAccount
+  name: {{ include "node-feature-discovery.fullname" . }}-prune
+  namespace: {{ include "node-feature-discovery.namespace" . }}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}-prune
+  namespace: {{ include "node-feature-discovery.namespace" . }}
+  labels:
+    {{- include "node-feature-discovery.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": post-delete
+    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
+spec:
+  template:
+    metadata:
+      labels:
+        {{- include "node-feature-discovery.labels" . | nindent 8 }}
+        role: prune
+    spec:
+      serviceAccountName: {{ include "node-feature-discovery.fullname" . }}-prune
+      # Use the chart-wide pull secrets, as the master and gc Deployments do, so the hook image can be pulled from a private registry.
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: nfd-master
+          securityContext:
+            {{- toYaml .Values.master.securityContext | nindent 12 }}
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          command:
+            - "nfd-master"
+          args:
+            - "-prune"
+            {{- if .Values.master.instance | empty | not }}
+            - "-instance={{ .Values.master.instance }}"
+            {{- end }}
+      restartPolicy: Never
+      {{- with .Values.master.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.master.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.master.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
diff --git a/charts/node-feature-discovery/templates/prometheus.yaml b/charts/node-feature-discovery/templates/prometheus.yaml
new file mode 100644
index 0000000..3d680e2
--- /dev/null
+++ b/charts/node-feature-discovery/templates/prometheus.yaml
@@ -0,0 +1,26 @@
+{{- if .Values.prometheus.enable }}
+# Prometheus Monitor Service (Metrics)
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: {{ include "node-feature-discovery.fullname" . }}
+  labels:
+    {{- include "node-feature-discovery.selectorLabels" . | nindent 4 }}
+    {{- with .Values.prometheus.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  podMetricsEndpoints:
+    - honorLabels: true
+      interval: {{ .Values.prometheus.scrapeInterval }}
+      path: /metrics
+      port: metrics
+      scheme: http
+  namespaceSelector:
+    matchNames:
+    - {{ include "node-feature-discovery.namespace" . }}
+  selector:
+    matchExpressions:
+    - {key: app.kubernetes.io/instance, operator: In, values: ["{{ .Release.Name }}"]}
+    - {key: app.kubernetes.io/name, operator: In, values: ["{{ include "node-feature-discovery.name" .
}}"]} +{{- end }} diff --git a/charts/node-feature-discovery/templates/role.yaml b/charts/node-feature-discovery/templates/role.yaml new file mode 100644 index 0000000..3a872e5 --- /dev/null +++ b/charts/node-feature-discovery/templates/role.yaml @@ -0,0 +1,19 @@ +{{- if and .Values.worker.enable .Values.worker.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-worker + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +rules: +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + verbs: + - create + - get + - update +{{- end }} + diff --git a/charts/node-feature-discovery/templates/rolebinding.yaml b/charts/node-feature-discovery/templates/rolebinding.yaml new file mode 100644 index 0000000..a640d5f --- /dev/null +++ b/charts/node-feature-discovery/templates/rolebinding.yaml @@ -0,0 +1,18 @@ +{{- if and .Values.worker.enable .Values.worker.rbac.create }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-worker + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "node-feature-discovery.fullname" . }}-worker +subjects: +- kind: ServiceAccount + name: {{ include "node-feature-discovery.worker.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . 
}} +{{- end }} + diff --git a/charts/node-feature-discovery/templates/service.yaml b/charts/node-feature-discovery/templates/service.yaml new file mode 100644 index 0000000..7191dca --- /dev/null +++ b/charts/node-feature-discovery/templates/service.yaml @@ -0,0 +1,20 @@ +{{- if and (not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi)) .Values.master.enable }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-master + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + role: master +spec: + type: {{ .Values.master.service.type }} + ports: + - port: {{ .Values.master.service.port | default "8080" }} + targetPort: grpc + protocol: TCP + name: grpc + selector: + {{- include "node-feature-discovery.selectorLabels" . | nindent 4 }} + role: master +{{- end}} diff --git a/charts/node-feature-discovery/templates/serviceaccount.yaml b/charts/node-feature-discovery/templates/serviceaccount.yaml new file mode 100644 index 0000000..59edc5e --- /dev/null +++ b/charts/node-feature-discovery/templates/serviceaccount.yaml @@ -0,0 +1,58 @@ +{{- if and .Values.master.enable .Values.master.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.master.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + {{- with .Values.master.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} + +{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.serviceAccount.create }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . 
}} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + {{- with .Values.topologyUpdater.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} + +{{- if and .Values.gc.enable .Values.gc.serviceAccount.create (or (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) .Values.topologyUpdater.enable) }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.gc.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + {{- with .Values.gc.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} + +{{- if and .Values.worker.enable .Values.worker.serviceAccount.create }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "node-feature-discovery.worker.serviceAccountName" . }} + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + {{- with .Values.worker.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/charts/node-feature-discovery/templates/topologyupdater-crds.yaml b/charts/node-feature-discovery/templates/topologyupdater-crds.yaml new file mode 100644 index 0000000..b6b9196 --- /dev/null +++ b/charts/node-feature-discovery/templates/topologyupdater-crds.yaml @@ -0,0 +1,278 @@ +{{- if and .Values.topologyUpdater.enable .Values.topologyUpdater.createCRDs -}} +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + api-approved.kubernetes.io: https://github.com/kubernetes/enhancements/pull/1870 + controller-gen.kubebuilder.io/version: v0.11.2 + creationTimestamp: null + name: noderesourcetopologies.topology.node.k8s.io +spec: + group: topology.node.k8s.io + names: + kind: NodeResourceTopology + listKind: NodeResourceTopologyList + plural: noderesourcetopologies + shortNames: + - node-res-topo + singular: noderesourcetopology + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: NodeResourceTopology describes node resources and their topology. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + topologyPolicies: + items: + type: string + type: array + zones: + description: ZoneList contains an array of Zone objects. 
+ items: + description: Zone represents a resource topology zone, e.g. socket, + node, die or core. + properties: + attributes: + description: AttributeList contains an array of AttributeInfo objects. + items: + description: AttributeInfo contains one attribute of a Zone. + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + costs: + description: CostList contains an array of CostInfo objects. + items: + description: CostInfo describes the cost (or distance) between + two Zones. + properties: + name: + type: string + value: + format: int64 + type: integer + required: + - name + - value + type: object + type: array + name: + type: string + parent: + type: string + resources: + description: ResourceInfoList contains an array of ResourceInfo + objects. + items: + description: ResourceInfo contains information about one resource + type. + properties: + allocatable: + anyOf: + - type: integer + - type: string + description: Allocatable quantity of the resource, corresponding + to allocatable in node status, i.e. total amount of this + resource available to be used by pods. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + available: + anyOf: + - type: integer + - type: string + description: Available is the amount of this resource currently + available for new (to be scheduled) pods, i.e. Allocatable + minus the resources reserved by currently running pods. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + capacity: + anyOf: + - type: integer + - type: string + description: Capacity of the resource, corresponding to capacity + in node status, i.e. total amount of this resource that + the node has. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + name: + description: Name of the resource. + type: string + required: + - allocatable + - available + - capacity + - name + type: object + type: array + type: + type: string + required: + - name + - type + type: object + type: array + required: + - topologyPolicies + - zones + type: object + served: true + storage: false + - name: v1alpha2 + schema: + openAPIV3Schema: + description: NodeResourceTopology describes node resources and their topology. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + attributes: + description: AttributeList contains an array of AttributeInfo objects. + items: + description: AttributeInfo contains one attribute of a Zone. + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + topologyPolicies: + description: 'DEPRECATED (to be removed in v1beta1): use top level attributes + if needed' + items: + type: string + type: array + zones: + description: ZoneList contains an array of Zone objects. + items: + description: Zone represents a resource topology zone, e.g. socket, + node, die or core. 
+ properties: + attributes: + description: AttributeList contains an array of AttributeInfo objects. + items: + description: AttributeInfo contains one attribute of a Zone. + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + costs: + description: CostList contains an array of CostInfo objects. + items: + description: CostInfo describes the cost (or distance) between + two Zones. + properties: + name: + type: string + value: + format: int64 + type: integer + required: + - name + - value + type: object + type: array + name: + type: string + parent: + type: string + resources: + description: ResourceInfoList contains an array of ResourceInfo + objects. + items: + description: ResourceInfo contains information about one resource + type. + properties: + allocatable: + anyOf: + - type: integer + - type: string + description: Allocatable quantity of the resource, corresponding + to allocatable in node status, i.e. total amount of this + resource available to be used by pods. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + available: + anyOf: + - type: integer + - type: string + description: Available is the amount of this resource currently + available for new (to be scheduled) pods, i.e. Allocatable + minus the resources reserved by currently running pods. + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + capacity: + anyOf: + - type: integer + - type: string + description: Capacity of the resource, corresponding to capacity + in node status, i.e. total amount of this resource that + the node has. 
+ pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + name: + description: Name of the resource. + type: string + required: + - allocatable + - available + - capacity + - name + type: object + type: array + type: + type: string + required: + - name + - type + type: object + type: array + required: + - zones + type: object + served: true + storage: true +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] +{{- end }} diff --git a/charts/node-feature-discovery/templates/topologyupdater.yaml b/charts/node-feature-discovery/templates/topologyupdater.yaml new file mode 100644 index 0000000..1221cfd --- /dev/null +++ b/charts/node-feature-discovery/templates/topologyupdater.yaml @@ -0,0 +1,170 @@ +{{- if .Values.topologyUpdater.enable -}} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + role: topology-updater + {{- with .Values.topologyUpdater.daemonsetAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} + role: topology-updater + template: + metadata: + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} + role: topology-updater + {{- with .Values.topologyUpdater.annotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "node-feature-discovery.topologyUpdater.serviceAccountName" . }} + dnsPolicy: ClusterFirstWithHostNet + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.topologyUpdater.podSecurityContext | nindent 8 }} + containers: + - name: topology-updater + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: "{{ .Values.image.pullPolicy }}" + livenessProbe: + grpc: + port: 8082 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + grpc: + port: 8082 + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 10 + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: NODE_ADDRESS + valueFrom: + fieldRef: + fieldPath: status.hostIP + command: + - "nfd-topology-updater" + args: + - "-podresources-socket=/host-var/lib/kubelet-podresources/kubelet.sock" + {{- if .Values.topologyUpdater.updateInterval | empty | not }} + - "-sleep-interval={{ .Values.topologyUpdater.updateInterval }}" + {{- else }} + - "-sleep-interval=3s" + {{- end }} + {{- if .Values.topologyUpdater.watchNamespace | empty | not }} + - "-watch-namespace={{ .Values.topologyUpdater.watchNamespace }}" + {{- else }} + - "-watch-namespace=*" + {{- end }} + {{- if .Values.tls.enable }} + - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" + - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" + - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" + {{- end }} + {{- if not .Values.topologyUpdater.podSetFingerprint }} + - "-pods-fingerprint=false" + {{- end }} + {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} + - "-kubelet-config-uri=file:///host-var/kubelet-config" + {{- end }} + {{- if .Values.topologyUpdater.kubeletStateDir | empty }} + # Disable kubelet state tracking by giving an empty path + - "-kubelet-state-dir=" + {{- end }} + - -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}} + ports: + - name: metrics + containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}} + volumeMounts: + {{- if 
.Values.topologyUpdater.kubeletConfigPath | empty | not }} + - name: kubelet-config + mountPath: /host-var/kubelet-config + {{- end }} + - name: kubelet-podresources-sock + mountPath: /host-var/lib/kubelet-podresources/kubelet.sock + - name: host-sys + mountPath: /host-sys + {{- if .Values.topologyUpdater.kubeletStateDir | empty | not }} + - name: kubelet-state-files + mountPath: /host-var/lib/kubelet + readOnly: true + {{- end }} + {{- if .Values.tls.enable }} + - name: nfd-topology-updater-cert + mountPath: "/etc/kubernetes/node-feature-discovery/certs" + readOnly: true + {{- end }} + - name: nfd-topology-updater-conf + mountPath: "/etc/kubernetes/node-feature-discovery" + readOnly: true + + resources: + {{- toYaml .Values.topologyUpdater.resources | nindent 12 }} + securityContext: + {{- toYaml .Values.topologyUpdater.securityContext | nindent 12 }} + volumes: + - name: host-sys + hostPath: + path: "/sys" + {{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }} + - name: kubelet-config + hostPath: + path: {{ .Values.topologyUpdater.kubeletConfigPath }} + {{- end }} + - name: kubelet-podresources-sock + hostPath: + {{- if .Values.topologyUpdater.kubeletPodResourcesSockPath | empty | not }} + path: {{ .Values.topologyUpdater.kubeletPodResourcesSockPath }} + {{- else }} + path: /var/lib/kubelet/pod-resources/kubelet.sock + {{- end }} + {{- if .Values.topologyUpdater.kubeletStateDir | empty | not }} + - name: kubelet-state-files + hostPath: + path: {{ .Values.topologyUpdater.kubeletStateDir }} + {{- end }} + - name: nfd-topology-updater-conf + configMap: + name: {{ include "node-feature-discovery.fullname" . }}-topology-updater-conf + items: + - key: nfd-topology-updater.conf + path: nfd-topology-updater.conf + {{- if .Values.tls.enable }} + - name: nfd-topology-updater-cert + secret: + secretName: nfd-topology-updater-cert + {{- end }} + + + {{- with .Values.topologyUpdater.nodeSelector }} + nodeSelector: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.topologyUpdater.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.topologyUpdater.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/charts/node-feature-discovery/templates/worker.yaml b/charts/node-feature-discovery/templates/worker.yaml new file mode 100644 index 0000000..f2a2419 --- /dev/null +++ b/charts/node-feature-discovery/templates/worker.yaml @@ -0,0 +1,185 @@ +{{- if .Values.worker.enable }} +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "node-feature-discovery.fullname" . }}-worker + namespace: {{ include "node-feature-discovery.namespace" . }} + labels: + {{- include "node-feature-discovery.labels" . | nindent 4 }} + role: worker + {{- with .Values.worker.daemonsetAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 6 }} + role: worker + template: + metadata: + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 8 }} + role: worker + {{- with .Values.worker.annotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + dnsPolicy: ClusterFirstWithHostNet + {{- /* Render the chart-wide class only when worker.priorityClassName is unset; that override is emitted at the end of this pod spec and would otherwise duplicate the priorityClassName key. */}}{{- if and .Values.priorityClassName (not .Values.worker.priorityClassName) }} + priorityClassName: {{ .Values.priorityClassName }} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "node-feature-discovery.worker.serviceAccountName" .
}} + securityContext: + {{- toYaml .Values.worker.podSecurityContext | nindent 8 }} + containers: + - name: worker + securityContext: + {{- toYaml .Values.worker.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + livenessProbe: + grpc: + port: 8082 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + grpc: + port: 8082 + initialDelaySeconds: 5 + periodSeconds: 10 + failureThreshold: 10 + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + resources: + {{- toYaml .Values.worker.resources | nindent 12 }} + command: + - "nfd-worker" + args: +{{- if not (and .Values.featureGates.NodeFeatureAPI .Values.enableNodeFeatureApi) }} + - "-server={{ include "node-feature-discovery.fullname" . }}-master:{{ .Values.master.service.port }}" +{{- end }} +{{- if .Values.tls.enable }} + - "-ca-file=/etc/kubernetes/node-feature-discovery/certs/ca.crt" + - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" + - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" +{{- end }} +# Go over featureGate and add the feature-gate flag +{{- range $key, $value := .Values.featureGates }} + - "-feature-gates={{ $key }}={{ $value }}" +{{- end }} + - "-metrics={{ .Values.worker.metricsPort | default "8081"}}" + ports: + - name: metrics + containerPort: {{ .Values.worker.metricsPort | default "8081"}} + volumeMounts: + - name: host-boot + mountPath: "/host-boot" + readOnly: true + - name: host-os-release + mountPath: "/host-etc/os-release" + readOnly: true + - name: host-sys + mountPath: "/host-sys" + readOnly: true + - name: host-usr-lib + mountPath: "/host-usr/lib" + readOnly: true + - name: host-lib + mountPath: "/host-lib" + readOnly: true + - name: host-proc-swaps + 
mountPath: "/host-proc/swaps" + readOnly: true + {{- if .Values.worker.mountUsrSrc }} + - name: host-usr-src + mountPath: "/host-usr/src" + readOnly: true + {{- end }} + - name: source-d + mountPath: "/etc/kubernetes/node-feature-discovery/source.d/" + readOnly: true + - name: features-d + mountPath: "/etc/kubernetes/node-feature-discovery/features.d/" + readOnly: true + - name: nfd-worker-conf + mountPath: "/etc/kubernetes/node-feature-discovery" + readOnly: true +{{- if .Values.tls.enable }} + - name: nfd-worker-cert + mountPath: "/etc/kubernetes/node-feature-discovery/certs" + readOnly: true +{{- end }} + volumes: + - name: host-boot + hostPath: + path: "/boot" + - name: host-os-release + hostPath: + path: "/etc/os-release" + - name: host-sys + hostPath: + path: "/sys" + - name: host-usr-lib + hostPath: + path: "/usr/lib" + - name: host-lib + hostPath: + path: "/lib" + - name: host-proc-swaps + hostPath: + path: "/proc/swaps" + {{- if .Values.worker.mountUsrSrc }} + - name: host-usr-src + hostPath: + path: "/usr/src" + {{- end }} + - name: source-d + hostPath: + path: "/etc/kubernetes/node-feature-discovery/source.d/" + - name: features-d + hostPath: + path: "/etc/kubernetes/node-feature-discovery/features.d/" + - name: nfd-worker-conf + configMap: + name: {{ include "node-feature-discovery.fullname" . }}-worker-conf + items: + - key: nfd-worker.conf + path: nfd-worker.conf +{{- if .Values.tls.enable }} + - name: nfd-worker-cert + secret: + secretName: nfd-worker-cert +{{- end }} + {{- with .Values.worker.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.worker.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.worker.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.worker.priorityClassName }} + priorityClassName: {{ . 
| quote }} + {{- end }} +{{- end }} diff --git a/charts/node-feature-discovery/values.yaml b/charts/node-feature-discovery/values.yaml new file mode 100644 index 0000000..41be039 --- /dev/null +++ b/charts/node-feature-discovery/values.yaml @@ -0,0 +1,558 @@ +image: + repository: registry.k8s.io/nfd/node-feature-discovery + # This should be set to 'IfNotPresent' for released versions + pullPolicy: IfNotPresent + # tag, if defined will use the given image tag, else Chart.AppVersion will be used + # tag +imagePullSecrets: [] + +nameOverride: "" +fullnameOverride: "" +namespaceOverride: "" + +enableNodeFeatureApi: true + +featureGates: + NodeFeatureAPI: true + NodeFeatureGroupAPI: false + +priorityClassName: "" + +master: + enable: true + config: ### + # noPublish: false + # autoDefaultNs: true + # extraLabelNs: ["added.ns.io","added.kubernetes.io"] + # denyLabelNs: ["denied.ns.io","denied.kubernetes.io"] + # resourceLabels: ["vendor-1.com/feature-1","vendor-2.io/feature-2"] + # enableTaints: false + # labelWhiteList: "foo" + # resyncPeriod: "2h" + # klog: + # addDirHeader: false + # alsologtostderr: false + # logBacktraceAt: + # logtostderr: true + # skipHeaders: false + # stderrthreshold: 2 + # v: 0 + # vmodule: + ## NOTE: the following options are not dynamically run-time configurable + ## and require a nfd-master restart to take effect after being changed + # logDir: + # logFile: + # logFileMaxSize: 1800 + # skipLogHeaders: false + # leaderElection: + # leaseDuration: 15s + # # this value has to be lower than leaseDuration and greater than retryPeriod*1.2 + # renewDeadline: 10s + # # this value has to be greater than 0 + # retryPeriod: 2s + # nfdApiParallelism: 10 + ### + # The TCP port that nfd-master listens for incoming requests.
Default: 8080 + # Deprecated this parameter is related to the deprecated gRPC API and will + # be removed with it in a future release + port: 8080 + metricsPort: 8081 + instance: + featureApi: + resyncPeriod: + denyLabelNs: [] + extraLabelNs: [] + resourceLabels: [] + enableTaints: false + crdController: null + featureRulesController: null + nfdApiParallelism: null + deploymentAnnotations: {} + replicaCount: 1 + + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsNonRoot: true + # runAsUser: 1000 + + serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + + rbac: + create: true + + service: + type: ClusterIP + port: 8080 + + resources: + limits: + memory: 4Gi + requests: + cpu: 100m + # You may want to use the same value for `requests.memory` and `limits.memory`. The “requests” value affects scheduling to accommodate pods on nodes. + # If there is a large difference between “requests” and “limits” and nodes experience memory pressure, the kernel may invoke + # the OOM Killer, even if the memory does not exceed the “limits” threshold. This can cause unexpected pod evictions. Memory + # cannot be compressed and once allocated to a pod, it can only be reclaimed by killing the pod. 
+ # Natan Yellin 22/09/2022 https://home.robusta.dev/blog/kubernetes-memory-limit + memory: 128Mi + + nodeSelector: {} + + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + + annotations: {} + + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/master" + operator: In + values: [""] + - weight: 1 + preference: + matchExpressions: + - key: "node-role.kubernetes.io/control-plane" + operator: In + values: [""] + +worker: + enable: true + config: ### + #core: + # labelWhiteList: + # noPublish: false + # sleepInterval: 60s + # featureSources: [all] + # labelSources: [all] + # klog: + # addDirHeader: false + # alsologtostderr: false + # logBacktraceAt: + # logtostderr: true + # skipHeaders: false + # stderrthreshold: 2 + # v: 0 + # vmodule: + ## NOTE: the following options are not dynamically run-time configurable + ## and require a nfd-worker restart to take effect after being changed + # logDir: + # logFile: + # logFileMaxSize: 1800 + # skipLogHeaders: false + #sources: + # cpu: + # cpuid: + ## NOTE: whitelist has priority over blacklist + # attributeBlacklist: + # - "AVX10" + # - "BMI1" + # - "BMI2" + # - "CLMUL" + # - "CMOV" + # - "CX16" + # - "ERMS" + # - "F16C" + # - "HTT" + # - "LZCNT" + # - "MMX" + # - "MMXEXT" + # - "NX" + # - "POPCNT" + # - "RDRAND" + # - "RDSEED" + # - "RDTSCP" + # - "SGX" + # - "SSE" + # - "SSE2" + # - "SSE3" + # - "SSE4" + # - "SSE42" + # - "SSSE3" + # - "TDX_GUEST" + # attributeWhitelist: + # kernel: + # kconfigFile: "/path/to/kconfig" + # configOpts: + # - "NO_HZ" + # - "X86" + # - "DMI" + # pci: + # deviceClassWhitelist: + # - "0200" + # - "03" + # - "12" + # deviceLabelFields: + # - "class" + # - "vendor" + # - "device" + # - "subsystem_vendor" + # - "subsystem_device" 
+ # usb: + # deviceClassWhitelist: + # - "0e" + # - "ef" + # - "fe" + # - "ff" + # deviceLabelFields: + # - "class" + # - "vendor" + # - "device" + # local: + # hooksEnabled: false + # custom: + # # The following feature demonstrates the capabilities of the matchFeatures + # - name: "my custom rule" + # labels: + # "vendor.io/my-ng-feature": "true" + # # matchFeatures implements a logical AND over all matcher terms in the + # # list (i.e. all of the terms, or per-feature matchers, must match) + # matchFeatures: + # - feature: cpu.cpuid + # matchExpressions: + # AVX512F: {op: Exists} + # - feature: cpu.cstate + # matchExpressions: + # enabled: {op: IsTrue} + # - feature: cpu.pstate + # matchExpressions: + # no_turbo: {op: IsFalse} + # scaling_governor: {op: In, value: ["performance"]} + # - feature: cpu.rdt + # matchExpressions: + # RDTL3CA: {op: Exists} + # - feature: cpu.sst + # matchExpressions: + # bf.enabled: {op: IsTrue} + # - feature: cpu.topology + # matchExpressions: + # hardware_multithreading: {op: IsFalse} + # + # - feature: kernel.config + # matchExpressions: + # X86: {op: Exists} + # LSM: {op: InRegexp, value: ["apparmor"]} + # - feature: kernel.loadedmodule + # matchExpressions: + # e1000e: {op: Exists} + # - feature: kernel.selinux + # matchExpressions: + # enabled: {op: IsFalse} + # - feature: kernel.version + # matchExpressions: + # major: {op: In, value: ["5"]} + # minor: {op: Gt, value: ["10"]} + # + # - feature: storage.block + # matchExpressions: + # rotational: {op: In, value: ["0"]} + # dax: {op: In, value: ["0"]} + # + # - feature: network.device + # matchExpressions: + # operstate: {op: In, value: ["up"]} + # speed: {op: Gt, value: ["100"]} + # + # - feature: memory.numa + # matchExpressions: + # node_count: {op: Gt, value: ["2"]} + # - feature: memory.nv + # matchExpressions: + # devtype: {op: In, value: ["nd_dax"]} + # mode: {op: In, value: ["memory"]} + # + # - feature: system.osrelease + # matchExpressions: + # ID: {op: In, value: 
["fedora", "centos"]} + # - feature: system.name + # matchExpressions: + # nodename: {op: InRegexp, value: ["^worker-X"]} + # + # - feature: local.label + # matchExpressions: + # custom-feature-knob: {op: Gt, value: ["100"]} + # + # # The following feature demonstrates the capabilities of the matchAny + # - name: "my matchAny rule" + # labels: + # "vendor.io/my-ng-feature-2": "my-value" + # # matchAny implements a logical OR over all elements (sub-matchers) in + # # the list (i.e. at least one feature matcher must match) + # matchAny: + # - matchFeatures: + # - feature: kernel.loadedmodule + # matchExpressions: + # driver-module-X: {op: Exists} + # - feature: pci.device + # matchExpressions: + # vendor: {op: In, value: ["8086"]} + # class: {op: In, value: ["0200"]} + # - matchFeatures: + # - feature: kernel.loadedmodule + # matchExpressions: + # driver-module-Y: {op: Exists} + # - feature: usb.device + # matchExpressions: + # vendor: {op: In, value: ["8086"]} + # class: {op: In, value: ["02"]} + # + # - name: "avx wildcard rule" + # labels: + # "my-avx-feature": "true" + # matchFeatures: + # - feature: cpu.cpuid + # matchName: {op: InRegexp, value: ["^AVX512"]} + # + # # The following features demonstrate label templating capabilities + # - name: "my template rule" + # labelsTemplate: | + # {{ range .system.osrelease }}vendor.io/my-system-feature.{{ .Name }}={{ .Value }} + # {{ end }} + # matchFeatures: + # - feature: system.osrelease + # matchExpressions: + # ID: {op: InRegexp, value: ["^open.*"]} + # VERSION_ID.major: {op: In, value: ["13", "15"]} + # + # - name: "my template rule 2" + # labelsTemplate: | + # {{ range .pci.device }}vendor.io/my-pci-device.{{ .class }}-{{ .device }}=with-cpuid + # {{ end }} + # matchFeatures: + # - feature: pci.device + # matchExpressions: + # class: {op: InRegexp, value: ["^06"]} + # vendor: ["8086"] + # - feature: cpu.cpuid + # matchExpressions: + # AVX: {op: Exists} + # + # # The following examples demonstrate vars field and
back-referencing + # # previous labels and vars + # - name: "my dummy kernel rule" + # labels: + # "vendor.io/my.kernel.feature": "true" + # matchFeatures: + # - feature: kernel.version + # matchExpressions: + # major: {op: Gt, value: ["2"]} + # + # - name: "my dummy rule with no labels" + # vars: + # "my.dummy.var": "1" + # matchFeatures: + # - feature: cpu.cpuid + # matchExpressions: {} + # + # - name: "my rule using backrefs" + # labels: + # "vendor.io/my.backref.feature": "true" + # matchFeatures: + # - feature: rule.matched + # matchExpressions: + # vendor.io/my.kernel.feature: {op: IsTrue} + # my.dummy.var: {op: Gt, value: ["0"]} + # + # - name: "kconfig template rule" + # labelsTemplate: | + # {{ range .kernel.config }}kconfig-{{ .Name }}={{ .Value }} + # {{ end }} + # matchFeatures: + # - feature: kernel.config + # matchName: {op: In, value: ["SWAP", "X86", "ARM"]} +### + + metricsPort: 8081 + daemonsetAnnotations: {} + podSecurityContext: {} + # fsGroup: 2000 + + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsNonRoot: true + # runAsUser: 1000 + + # livenessProbe: {} + ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation. + # grpc: + # port: 8082 + # initialDelaySeconds: 10 + # periodSeconds: 10 + # readinessProbe: {} + ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation. + # grpc: + # port: 8082 + # initialDelaySeconds: 5 + # periodSeconds: 10 + # failureThreshold: 10 + + serviceAccount: + # Specifies whether a service account should be created. + # We create this by default to make it easier for downstream users to apply PodSecurityPolicies. + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: + + rbac: + create: true + + # Allow users to mount the hostPath /usr/src, useful for RHCOS on s390x + # Does not work on systems without /usr/src AND a read-only /usr, such as Talos + mountUsrSrc: false + + resources: + limits: + memory: 512Mi + requests: + cpu: 5m + memory: 64Mi + + nodeSelector: {} + + tolerations: [] + + annotations: {} + + affinity: {} + + priorityClassName: "" + +topologyUpdater: + config: ### + ## key = node name, value = list of resources to be excluded. + ## use "*" (quoted, since a bare * is a YAML alias indicator) to exclude from all nodes. + ## an example of how the exclude list should look + #excludeList: + # node1: [cpu] + # node2: [memory, example/deviceA] + # "*": [hugepages-2Mi] +### + + enable: false + createCRDs: false + + serviceAccount: + create: true + annotations: {} + name: + rbac: + create: true + + metricsPort: 8081 + kubeletConfigPath: + kubeletPodResourcesSockPath: + updateInterval: 60s + watchNamespace: "*" + kubeletStateDir: /var/lib/kubelet + + podSecurityContext: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: [ "ALL" ] + readOnlyRootFilesystem: true + runAsUser: 0 + + # livenessProbe: {} + ## NOTE: Currently not configurable, defaults are provided for the sake of extra documentation.
+ # grpc: + # port: 8082 + # initialDelaySeconds: 5 + # periodSeconds: 10 + # failureThreshold: 10 + + resources: + limits: + memory: 60Mi + requests: + cpu: 50m + memory: 40Mi + + nodeSelector: {} + tolerations: [] + annotations: {} + daemonsetAnnotations: {} + affinity: {} + podSetFingerprint: true + +gc: + enable: true + replicaCount: 1 + + serviceAccount: + create: true + annotations: {} + name: + rbac: + create: true + + interval: 1h + + podSecurityContext: {} + + resources: + limits: + memory: 1Gi + requests: + cpu: 10m + memory: 128Mi + + metricsPort: 8081 + + nodeSelector: {} + tolerations: [] + annotations: {} + deploymentAnnotations: {} + affinity: {} + +# Optionally use encryption for worker <--> master comms +# TODO: verify hostname is not yet supported +# +# If you do not enable certManager (and have it installed) you will +# need to manually, or otherwise, provision the TLS certs as secrets +tls: + enable: false + certManager: false + certManagerCertificate: + issuerKind: + issuerName: + +prometheus: + enable: false + scrapeInterval: 10s + labels: {} diff --git a/crds/deviceconfig-crd.yaml b/crds/deviceconfig-crd.yaml new file mode 100644 index 0000000..6ecf7be --- /dev/null +++ b/crds/deviceconfig-crd.yaml @@ -0,0 +1,1604 @@ +--- +# Source: gpu-operator-charts/templates/deviceconfig-crd.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: deviceconfigs.amd.com + annotations: + controller-gen.kubebuilder.io/version: v0.17.0 + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + helm.sh/chart: gpu-operator-charts-v1.4.0 + app.kubernetes.io/name: gpu-operator-charts + app.kubernetes.io/instance: amd-gpu + app.kubernetes.io/version: "v1.4.0" + app.kubernetes.io/managed-by: Helm +spec: + group: amd.com + names: + kind: DeviceConfig + listKind: DeviceConfigList + plural: deviceconfigs + shortNames: + - gpue + singular: deviceconfig + scope: Namespaced + versions: + - name: v1alpha1 + 
schema: + openAPIV3Schema: + description: DeviceConfig describes how to enable AMD GPU device + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: DeviceConfigSpec describes how the AMD GPU operator should + enable AMD GPU device for customer's use. + properties: + commonConfig: + description: common config + properties: + initContainerImage: + description: InitContainerImage is being used for the operands pods, + i.e. metrics exporter, test runner, device plugin, device config + manager and node labeller + type: string + utilsContainer: + description: UtilsContainer contains parameters to configure operator's + utils container + properties: + image: + description: Image is the image of utils container + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + type: string + imagePullPolicy: + description: image pull policy for utils container + enum: + - Always + - IfNotPresent + - Never + type: string + imageRegistrySecret: + description: secret used for pull utils container image + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. 
Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: object + type: object + configManager: + description: config manager + properties: + config: + description: config map to customize the config for config manager, + if not specified default config will be applied + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + configManagerTolerations: + description: tolerations for the device config manager DaemonSet + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. 
+ type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + enable: + description: enable config manager, disabled by default + type: boolean + image: + description: config manager image + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + type: string + imagePullPolicy: + description: image pull policy for config manager + enum: + - Always + - IfNotPresent + - Never + type: string + imageRegistrySecret: + description: config manager image registry secret used to pull/push + images + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + selector: + additionalProperties: + type: string + description: Selector describes on which nodes to enable config + manager + type: object + upgradePolicy: + description: upgrade policy for config manager daemonset + properties: + maxUnavailable: + default: 1 + description: MaxUnavailable specifies the maximum number of + Pods that can be unavailable during the update process. Applicable + for RollingUpdate only. Default value is 1. 
+ format: int32 + type: integer + upgradeStrategy: + description: UpgradeStrategy specifies the type of the DaemonSet + update. Valid values are "RollingUpdate" (default) or "OnDelete". + enum: + - RollingUpdate + - OnDelete + type: string + type: object + type: object + devicePlugin: + description: device plugin + properties: + devicePluginArguments: + additionalProperties: + type: string + description: |- + device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset + supported flag values: {"resource_naming_strategy": {"single", "mixed"}} + type: object + devicePluginImage: + description: device plugin image + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + type: string + devicePluginImagePullPolicy: + description: image pull policy for device plugin + enum: + - Always + - IfNotPresent + - Never + type: string + devicePluginTolerations: + description: tolerations for the device plugin DaemonSet + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. 
+ type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + enableNodeLabeller: + default: true + description: enable or disable the node labeller + type: boolean + imageRegistrySecret: + description: node labeller image registry secret used to pull/push + images + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + nodeLabellerArguments: + description: |- + node labeller arguments is used to pass supported labels while starting node labeller daemonset + some flags are enabled by default as they are applicable and bare minimum for all setups and are supported in all versions of node labeller + default flags: {"vram", "cu-count", "simd-count", "device-id", "family", "product-name", "driver-version"} + supported flags: {"compute-memory-partition", "compute-partitioning-supported", "memory-partitioning-supported"} + items: + type: string + type: array + nodeLabellerImage: + description: node labeller image + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + type: string + nodeLabellerImagePullPolicy: + description: image pull policy for node labeller + enum: + - Always + - IfNotPresent + - Never + type: string + nodeLabellerTolerations: + description: tolerations for the node labeller DaemonSet + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. 
+ type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + upgradePolicy: + description: upgrade policy for device plugin and node labeller + daemons + properties: + maxUnavailable: + default: 1 + description: MaxUnavailable specifies the maximum number of + Pods that can be unavailable during the update process. Applicable + for RollingUpdate only. Default value is 1. + format: int32 + type: integer + upgradeStrategy: + description: UpgradeStrategy specifies the type of the DaemonSet + update. Valid values are "RollingUpdate" (default) or "OnDelete". + enum: + - RollingUpdate + - OnDelete + type: string + type: object + type: object + driver: + description: driver + properties: + amdgpuInstallerRepoURL: + description: |- + radeon repo URL for fetching amdgpu installer if building driver image on the fly + installer URL is https://repo.radeon.com/amdgpu-install by default + type: string + blacklist: + description: |- + blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes. + Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
+ Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module + type: boolean + driverType: + default: container + description: |- + specify the type of driver (container/vf-passthrough/pf-passthrough) to install on the worker node. default value is container. + container: normal amdgpu-dkms driver for Bare Metal GPU nodes or guest VM. + vf-passthrough: MxGPU GIM driver on the host machine to generate VF, then mount VF to vfio-pci + pf-passthrough: directly mount PF device to vfio-pci + enum: + - container + - vf-passthrough + - pf-passthrough + type: string + enable: + default: true + description: |- + enable driver install. default value is true. + disable is for skipping driver install/uninstall for dryrun or using in-tree amdgpu kernel module + type: boolean + image: + description: |- + defines image that includes drivers and firmware blobs, don't include tag since it will be fully managed by operator + for vanilla k8s the default value is image-registry:5000/$MOD_NAMESPACE/amdgpu_kmod + for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod + image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version> + example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3 + NOTE: Updating the driver image repository is not supported.
Please delete the existing DeviceConfig and create a new one with the updated image repository + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + type: string + imageBuild: + description: image build configs + properties: + baseImageRegistry: + default: docker.io + description: |- + image registry to fetch base image for building driver image, default value is docker.io, the builder will search for corresponding OS base image from given registry + e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04 + NOTE: this field won't apply for OpenShift since OpenShift is using its own DriverToolKit image to build driver image + type: string + baseImageRegistryTLS: + description: TLS settings for fetching base image + properties: + insecure: + description: If true, check if the container image already + exists using plain HTTP. + type: boolean + insecureSkipTLSVerify: + description: If true, skip any TLS server certificate validation + type: boolean + type: object + type: object + imageRegistrySecret: + description: secrets used for pull/push images from/to private registry + specified in driversImage + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + imageRegistryTLS: + description: driver image registry TLS setting for the container + image + properties: + insecure: + description: If true, check if the container image already exists + using plain HTTP. 
+ type: boolean + insecureSkipTLSVerify: + description: If true, skip any TLS server certificate validation + type: boolean + type: object + imageSign: + description: |- + image signing config to sign the driver image when building driver image on the fly + image signing is required for installing driver on secure boot enabled system + properties: + certSecret: + description: |- + ImageSignCertSecret the public key used to sign kernel modules within image + necessary for secure boot enabled system + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + keySecret: + description: |- + ImageSignKeySecret the private key used to sign kernel modules within image + necessary for secure boot enabled system + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: object + kernelModuleConfig: + description: advanced arguments, parameters and more configs to + manage the driver + properties: + loadArgs: + description: LoadArg are the arguments when modprobe is executed + to load the kernel module. The command will be `modprobe ${Args} + module_name`. + items: + type: string + type: array + parameters: + description: Parameters is being used for modprobe commands. + The command will be `modprobe ${Args} module_name ${Parameters}`.
+ items: + type: string + type: array + unloadArgs: + description: UnloadArg are the arguments when modprobe is executed + to unload the kernel module. The command will be `modprobe + -r ${Args} module_name`. + items: + type: string + type: array + type: object + tolerations: + description: tolerations for kmm module object + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. 
+ type: string + type: object + type: array + upgradePolicy: + description: policy to upgrade the drivers + properties: + enable: + description: |- + enable upgrade policy, disabled by default + If disabled, user has to manually upgrade all the nodes. + type: boolean + maxParallelUpgrades: + default: 1 + description: |- + MaxParallelUpgrades indicates how many nodes can be upgraded in parallel + 0 means no limit, all nodes will be upgraded in parallel + minimum: 0 + type: integer + maxUnavailableNodes: + anyOf: + - type: integer + - type: string + default: 25% + description: |- + MaxUnavailableNodes indicates maximum number of nodes that can be in a failed upgrade state beyond which upgrades will stop to keep cluster at a minimal healthy state + Value can be an integer (ex: 2) which would mean at most 2 nodes can be in failed state after which new upgrades will not start. Or it can be a percentage string (ex: "50%") from which absolute number will be calculated and rounded up + x-kubernetes-int-or-string: true + nodeDrainPolicy: + description: Node draining policy + properties: + force: + default: false + description: Force indicates if force draining is allowed + type: boolean + gracePeriodSeconds: + default: -1 + description: GracePeriodSeconds indicates the time kubernetes + waits for a pod to shut down gracefully after receiving + a termination signal + type: integer + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time + in seconds to wait before giving up drain, zero means + infinite + minimum: 0 + type: integer + type: object + podDeletionPolicy: + description: Pod Deletion policy. If both NodeDrainPolicy and + PodDeletionPolicy config is available, NodeDrainPolicy(if + enabled) will take precedence.
+ properties: + force: + default: false + description: Force indicates if force deletion is allowed + type: boolean + gracePeriodSeconds: + default: -1 + description: GracePeriodSeconds indicates the time kubernetes + waits for a pod to shut down gracefully after receiving + a termination signal + type: integer + timeoutSeconds: + default: 300 + description: TimeoutSecond specifies the length of time + in seconds to wait before giving up on pod deletion, zero + means infinite + minimum: 0 + type: integer + type: object + rebootRequired: + default: true + description: reboot between driver upgrades, enabled by default, + if enabled spec.commonConfig.utilsContainer will be used to + perform reboot on worker nodes + type: boolean + type: object + version: + description: |- + version of the drivers source code, can be used as part of image of dockerfile source image + default value for different OS is: ubuntu: 6.1.3, coreOS: 6.2.2 + type: string + vfioConfig: + description: |- + vfio config + specify the specific configs for binding PCI devices to vfio-pci kernel module, applies for driver type vf-passthrough and pf-passthrough + properties: + deviceIDs: + description: list of PCI device IDs to load into vfio-pci driver. + default is the list of AMD GPU PF/VF PCI device IDs based + on driver type vf-passthrough/pf-passthrough. 
+ items: + type: string + type: array + type: object + type: object + metricsExporter: + description: metrics exporter + properties: + config: + description: optional configuration for metrics + properties: + name: + description: |- + Name of the configMap that defines the list of metrics + default list:[] + type: string + type: object + enable: + description: enable metrics exporter, disabled by default + type: boolean + image: + description: metrics exporter image + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + type: string + imagePullPolicy: + description: image pull policy for metrics exporter + enum: + - Always + - IfNotPresent + - Never + type: string + imageRegistrySecret: + description: metrics exporter image registry secret used to pull/push + images + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + nodePort: + description: NodePort is the external port for pulling metrics from + outside the cluster, in the range 30000-32767 (assigned automatically + by default) + format: int32 + maximum: 32767 + minimum: 30000 + type: integer + podResourceAPISocketPath: + default: /var/lib/kubelet/pod-resources + description: |- + Set the host path for pod-resource kubelet.socket, + vanilla kubernetes path is /var/lib/kubelet/pod-resources + microk8s path is /var/snap/microk8s/common/var/lib/kubelet/pod-resources/ + path is an absolute unix path that allows a trailing slash + pattern: ^(/[^/\0]+)*(/)?$ + type: string + port: + default: 5000 + description: Port is the internal port used for in-cluster and node + access to pull metrics from the metrics-exporter (default 5000). + format: int32 + type: integer + prometheus: + description: Prometheus configuration for metrics exporter + properties: + serviceMonitor: + description: ServiceMonitor configuration for Prometheus integration + properties: + attachMetadata: + description: AttachMetadata defines if Prometheus should + attach node metadata to the target + properties: + node: + description: |- + When set to true, Prometheus attaches node metadata to the discovered + targets. + + The Prometheus service account must have the `list` and `watch` + permissions on the `Nodes` objects. + type: boolean + type: object + authorization: + description: Optional Prometheus authorization configuration + for accessing the endpoint + properties: + credentials: + description: Selects a key of a Secret in the namespace + that contains the credentials for authentication. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent.
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: + description: |- + Defines the authentication type. The value is case-insensitive. + + "Basic" is not a supported value. + + Default: "Bearer" + type: string + type: object + bearerTokenFile: + description: |- + Path to bearer token file to be used by Prometheus (e.g., service account token path) + Deprecated: Use Authorization instead. This field is kept for backward compatibility. + type: string + enable: + description: Enable or disable ServiceMonitor creation (default + false) + type: boolean + honorLabels: + default: true + description: HonorLabels chooses the metric's labels on + collisions with target labels (default true) + type: boolean + honorTimestamps: + description: HonorTimestamps controls whether the scrape + endpoints honor timestamps (default false) + type: boolean + interval: + description: 'How frequently to scrape metrics. Accepts + values with time unit suffix: "30s", "1m", "2h", "500ms"' + pattern: ^([0-9]+)(ms|s|m|h)$ + type: string + labels: + additionalProperties: + type: string + description: 'Additional labels to add to the ServiceMonitor + (default release: prometheus)' + type: object + metricRelabelings: + description: Relabeling rules applied to individual scraped + metrics + items: + description: |- + RelabelConfig allows dynamic rewriting of the label set for targets, alerts, + scraped samples and remote write samples. 
+ + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + properties: + action: + default: replace + description: |- + Action to perform based on the regex matching. + + `Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0. + `DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0. + + Default: "Replace" + enum: + - replace + - Replace + - keep + - Keep + - drop + - Drop + - hashmod + - HashMod + - labelmap + - LabelMap + - labeldrop + - LabelDrop + - labelkeep + - LabelKeep + - lowercase + - Lowercase + - uppercase + - Uppercase + - keepequal + - KeepEqual + - dropequal + - DropEqual + type: string + modulus: + description: |- + Modulus to take of the hash of the source label values. + + Only applicable when the action is `HashMod`. + format: int64 + type: integer + regex: + description: Regular expression against which the + extracted value is matched. + type: string + replacement: + description: |- + Replacement value against which a Replace action is performed if the + regular expression matches. + + Regex capture groups are available. + type: string + separator: + description: Separator is the string between concatenated + SourceLabels. + type: string + sourceLabels: + description: |- + The source labels select values from existing labels. Their content is + concatenated using the configured Separator and matched against the + configured regular expression. + items: + description: |- + LabelName is a valid Prometheus label name which may only contain ASCII + letters, numbers, as well as underscores. + pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$ + type: string + type: array + targetLabel: + description: |- + Label to which the resulting string is written in a replacement. + + It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`, + `KeepEqual` and `DropEqual` actions. + + Regex capture groups are available. 
+ type: string + type: object + type: array + relabelings: + description: RelabelConfigs to apply to samples before ingestion + items: + description: |- + RelabelConfig allows dynamic rewriting of the label set for targets, alerts, + scraped samples and remote write samples. + + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + properties: + action: + default: replace + description: |- + Action to perform based on the regex matching. + + `Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0. + `DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0. + + Default: "Replace" + enum: + - replace + - Replace + - keep + - Keep + - drop + - Drop + - hashmod + - HashMod + - labelmap + - LabelMap + - labeldrop + - LabelDrop + - labelkeep + - LabelKeep + - lowercase + - Lowercase + - uppercase + - Uppercase + - keepequal + - KeepEqual + - dropequal + - DropEqual + type: string + modulus: + description: |- + Modulus to take of the hash of the source label values. + + Only applicable when the action is `HashMod`. + format: int64 + type: integer + regex: + description: Regular expression against which the + extracted value is matched. + type: string + replacement: + description: |- + Replacement value against which a Replace action is performed if the + regular expression matches. + + Regex capture groups are available. + type: string + separator: + description: Separator is the string between concatenated + SourceLabels. + type: string + sourceLabels: + description: |- + The source labels select values from existing labels. Their content is + concatenated using the configured Separator and matched against the + configured regular expression. + items: + description: |- + LabelName is a valid Prometheus label name which may only contain ASCII + letters, numbers, as well as underscores. 
+ pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$ + type: string + type: array + targetLabel: + description: |- + Label to which the resulting string is written in a replacement. + + It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`, + `KeepEqual` and `DropEqual` actions. + + Regex capture groups are available. + type: string + type: object + type: array + tlsConfig: + description: TLS settings used by Prometheus to connect + to the metrics endpoint + properties: + ca: + description: Certificate authority used when verifying + server certificates. + properties: + configMap: + description: ConfigMap containing data to use for + the targets. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or + its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + secret: + description: Secret containing data to use for the + targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + caFile: + description: Path to the CA cert in the Prometheus container + to use for the targets. + type: string + cert: + description: Client certificate to present when doing + client-authentication. + properties: + configMap: + description: ConfigMap containing data to use for + the targets. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or + its key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + secret: + description: Secret containing data to use for the + targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + certFile: + description: Path to the client cert file in the Prometheus + container for the targets. 
+ type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: Path to the client key file in the Prometheus + container for the targets. + type: string + keySecret: + description: Secret containing the client key file for + the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + maxVersion: + description: |- + Maximum acceptable TLS version. + + It requires Prometheus >= v2.41.0. + enum: + - TLS10 + - TLS11 + - TLS12 + - TLS13 + type: string + minVersion: + description: |- + Minimum acceptable TLS version. + + It requires Prometheus >= v2.35.0. + enum: + - TLS10 + - TLS11 + - TLS12 + - TLS13 + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + type: object + type: object + rbacConfig: + description: optional kube-rbac-proxy config to provide rbac services + properties: + clientCAConfigMap: + description: 'Reference to a configmap containing the client + CA (key: ca.crt) for mTLS client validation' + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + disableHttps: + description: disable https protecting the proxy endpoint + type: boolean + enable: + description: enable kube-rbac-proxy, disabled by default + type: boolean + image: + description: kube-rbac-proxy image + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + type: string + secret: + description: certificate secret to mount in kube-rbac container + for TLS, self signed certificates will be generated by default + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + staticAuthorization: + description: Optional static RBAC rules based on client certificate + Common Name (CN) + properties: + clientName: + description: Expected CN (Common Name) from client cert + (e.g., Prometheus SA identity) + type: string + enable: + description: Enables static authorization using client certificate + CN + type: boolean + type: object + type: object + selector: + additionalProperties: + type: string + description: Selector describes on which nodes to enable metrics + exporter + type: object + serviceType: + default: ClusterIP + description: ServiceType service type for metrics, clusterIP/NodePort, + clusterIP by default + enum: + - ClusterIP + - NodePort + type: string + tolerations: + description: tolerations for metrics exporter + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple <key,value,effect> using the matching operator <operator>.
+ properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + upgradePolicy: + description: upgrade policy for metrics exporter daemons + properties: + maxUnavailable: + default: 1 + description: MaxUnavailable specifies the maximum number of + Pods that can be unavailable during the update process. Applicable + for RollingUpdate only. Default value is 1. + format: int32 + type: integer + upgradeStrategy: + description: UpgradeStrategy specifies the type of the DaemonSet + update. Valid values are "RollingUpdate" (default) or "OnDelete". 
+ enum: + - RollingUpdate + - OnDelete + type: string + type: object + type: object + selector: + additionalProperties: + type: string + description: Selector describes on which nodes the GPU Operator should + enable the GPU device. + type: object + testRunner: + description: test runner + properties: + config: + description: config map to customize the config for test runner, + if not specified default test config will be applied + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + enable: + description: enable test runner, disabled by default + type: boolean + image: + description: test runner image + pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$ + type: string + imagePullPolicy: + description: image pull policy for test runner + enum: + - Always + - IfNotPresent + - Never + type: string + imageRegistrySecret: + description: test runner image registry secret used to pull/push + images + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong.
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + logsLocation: + description: captures logs location and export config for test runner + logs + properties: + hostPath: + default: /var/log/amd-test-runner + description: host path to store test runner internal status + db in order to persist test running status + type: string + logsExportSecrets: + description: LogsExportSecrets is a list of secrets that contain + connectivity info to multiple cloud providers + items: + description: |- + LocalObjectReference contains enough information to let you locate the + referenced object inside the same namespace. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + type: array + mountPath: + default: /var/log/amd-test-runner + description: volume mount destination within test runner container + type: string + type: object + selector: + additionalProperties: + type: string + description: Selector describes on which nodes to enable test runner + type: object + tolerations: + description: tolerations for test runner + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple <key,value,effect> using the matching operator <operator>. + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys.
+ If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. + type: string + type: object + type: array + upgradePolicy: + description: upgrade policy for test runner daemonset + properties: + maxUnavailable: + default: 1 + description: MaxUnavailable specifies the maximum number of + Pods that can be unavailable during the update process. Applicable + for RollingUpdate only. Default value is 1. + format: int32 + type: integer + upgradeStrategy: + description: UpgradeStrategy specifies the type of the DaemonSet + update. Valid values are "RollingUpdate" (default) or "OnDelete". + enum: + - RollingUpdate + - OnDelete + type: string + type: object + type: object + type: object + status: + description: DeviceConfigStatus defines the observed state of DeviceConfig. + properties: + conditions: + description: Conditions list the current status of the DeviceConfig + object + items: + description: Condition contains details for one aspect of the current + state of this API Resource.
+ properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + configManager: + description: ConfigManager contains the status of the ConfigManager + deployment + properties: + availableNumber: + description: number of the actually deployed and running pods + format: int32 + type: integer + desiredNumber: + description: number of the pods that should be deployed for daemonset + format: int32 + type: integer + nodesMatchingSelectorNumber: + description: number of nodes that are targeted by the DeviceConfig + selector + format: int32 + type: integer + type: object + devicePlugin: + description: DevicePlugin contains the status of the Device Plugin deployment + properties: + availableNumber: + description: number of the actually deployed and running pods + format: int32 + type: integer + desiredNumber: + description: number of the pods that should be deployed for daemonset + format: int32 + type: integer + nodesMatchingSelectorNumber: + description: number of nodes that are targeted by the DeviceConfig + selector + format: int32 + type: integer + type: object + driver: + description: Driver contains the status of the Drivers deployment + properties: + availableNumber: + description: number of the actually deployed and running pods + format: int32 + type: integer + desiredNumber: + description: number of the pods that should be deployed for daemonset + format: int32 + type: integer + nodesMatchingSelectorNumber: + description: number of nodes that are targeted by the DeviceConfig + selector + format: int32 + type: integer + type: object + metricsExporter: + description: MetricsExporter contains the status of the MetricsExporter + deployment + properties: + availableNumber: + description: number of the actually deployed and running pods + format: int32 + type: integer + 
desiredNumber: + description: number of the pods that should be deployed for daemonset + format: int32 + type: integer + nodesMatchingSelectorNumber: + description: number of nodes that are targeted by the DeviceConfig + selector + format: int32 + type: integer + type: object + nodeModuleStatus: + additionalProperties: + description: ModuleStatus contains the status of driver module installed + by operator on the node + properties: + bootId: + type: string + containerImage: + type: string + kernelVersion: + type: string + lastTransitionTime: + type: string + status: + description: UpgradeState captures the state of the upgrade process + on a node + type: string + upgradeStartTime: + type: string + type: object + description: NodeModuleStatus contains per node status of driver module + installation + type: object + observedGeneration: + description: ObservedGeneration is the latest spec generation successfully + processed by the controller + format: int64 + type: integer + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/templates/_helpers.tpl b/templates/_helpers.tpl new file mode 100644 index 0000000..10e2464 --- /dev/null +++ b/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "helm-charts-k8s.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "helm-charts-k8s.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "helm-charts-k8s.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "helm-charts-k8s.labels" -}} +helm.sh/chart: {{ include "helm-charts-k8s.chart" . }} +{{ include "helm-charts-k8s.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "helm-charts-k8s.selectorLabels" -}} +app.kubernetes.io/name: {{ include "helm-charts-k8s.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "helm-charts-k8s.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "helm-charts-k8s.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/templates/config-manager-rbac.yaml b/templates/config-manager-rbac.yaml new file mode 100644 index 0000000..212deda --- /dev/null +++ b/templates/config-manager-rbac.yaml @@ -0,0 +1,74 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-config-manager + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - get + - list + - update +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - update +- apiGroups: + - "apps" + resources: + - daemonsets + verbs: + - get + - list + - watch + - delete + - create + - update +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch + - delete + - create + - update +- apiGroups: + - kmm.sigs.x-k8s.io + resources: + - nodemodulesconfigs + - nodemodulesconfigs/status + verbs: + - delete + - get +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-config-manager + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-config-manager' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-config-manager + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/templates/default-deviceconfig.yaml b/templates/default-deviceconfig.yaml new file mode 100644 index 0000000..a5cbaf7 --- /dev/null +++ b/templates/default-deviceconfig.yaml @@ -0,0 +1,393 @@ + +{{- if or (and .Release.IsInstall .Values.crds.defaultCR.install) (and .Release.IsUpgrade .Values.crds.defaultCR.upgrade) }} +{{- if and (hasKey .Values "deviceConfig") (hasKey .Values.deviceConfig "spec") }} +apiVersion: amd.com/v1alpha1 +kind: DeviceConfig +metadata: + name: default + # the default CR cleanup is handled by pre-delete hook + # add this annotation so that helm won't try to delete the default DeviceConfig twice + annotations: + "helm.sh/resource-policy": keep +spec: + {{- with .Values.deviceConfig.spec.selector }} + selector: + {{- toYaml . 
| nindent 4 }} + {{- end }} + + {{- with .Values.deviceConfig.spec.driver }} + driver: + {{- if (hasKey . "enable") }} + enable: {{ .enable }} + {{- end }} + + {{- if (hasKey . "blacklist") }} + blacklist: {{ .blacklist }} + {{- end }} + + {{- with .driverType }} + driverType: {{ . }} + {{- end }} + + {{- with .vfioConfig }} + vfioConfig: + {{- with .deviceIDs }} + deviceIDs: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + + {{- with .kernelModuleConfig }} + kernelModuleConfig: + {{- with .loadArgs }} + loadArgs: + {{- toYaml . | nindent 8 }} + {{- end }} + + {{- with .unloadArgs }} + unloadArgs: + {{- toYaml . | nindent 8 }} + {{- end }} + + {{- with .parameters }} + parameters: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + + {{- with .image }} + image: {{ . }} + {{- end }} + + {{- with .imageRegistrySecret }} + imageRegistrySecret: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .imageRegistryTLS }} + imageRegistryTLS: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .version }} + version: {{ quote . }} + {{- end }} + + {{- with .imageSign }} + imageSign: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .imageBuild }} + imageBuild: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .tolerations }} + tolerations: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .upgradePolicy }} + upgradePolicy: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- end }} + + {{- with .Values.deviceConfig.spec.commonConfig }} + commonConfig: + {{- with .initContainerImage }} + initContainerImage: {{ . }} + {{- end }} + + {{- with .utilsContainer }} + utilsContainer: + {{- with .image }} + image: {{ . }} + {{- end }} + + {{- with .imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + + {{- with .imageRegistrySecret }} + imageRegistrySecret: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .Values.deviceConfig.spec.devicePlugin }} + devicePlugin: + {{- with .devicePluginImage }} + devicePluginImage: {{ . }} + {{- end }} + + {{- with .devicePluginImagePullPolicy }} + devicePluginImagePullPolicy: {{ . }} + {{- end }} + + {{- with .devicePluginTolerations }} + devicePluginTolerations: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .devicePluginArguments }} + devicePluginArguments: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- if (hasKey . "enableNodeLabeller") }} + enableNodeLabeller: {{ .enableNodeLabeller }} + {{- end }} + + {{- with .nodeLabellerImage }} + nodeLabellerImage: {{ . }} + {{- end }} + + {{- with .nodeLabellerImagePullPolicy }} + nodeLabellerImagePullPolicy: {{ . }} + {{- end }} + + {{- with .nodeLabellerTolerations }} + nodeLabellerTolerations: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .nodeLabellerArguments }} + nodeLabellerArguments: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .imageRegistrySecret }} + imageRegistrySecret: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .upgradePolicy }} + upgradePolicy: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- end }} + + {{- with .Values.deviceConfig.spec.metricsExporter }} + metricsExporter: + {{- if (hasKey . "enable") }} + enable: {{ .enable }} + {{- end }} + + {{- with .serviceType }} + serviceType: {{ . }} + {{- end }} + + {{- if (hasKey . "port") }} + port: {{ .port }} + {{- end }} + + {{- if (hasKey . "nodePort") }} + nodePort: {{ .nodePort }} + {{- end }} + + {{- with .image }} + image: {{ . }} + {{- end }} + + {{- with .imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + + {{- with .config }} + config: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .tolerations }} + tolerations: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .imageRegistrySecret }} + imageRegistrySecret: + {{- toYaml . 
| nindent 6 }} + {{- end }} + + {{- with .selector }} + selector: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .upgradePolicy }} + upgradePolicy: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .rbacConfig }} + rbacConfig: + {{- if (hasKey . "enable") }} + enable: {{ .enable }} + {{- end }} + + {{- with .image }} + image: {{ . }} + {{- end }} + + {{- if (hasKey . "disableHttps")}} + disableHttps: {{ .disableHttps }} + {{- end }} + + {{- with .secret }} + secret: + {{- toYaml . | nindent 8 }} + {{- end }} + + {{- with .clientCAConfigMap }} + clientCAConfigMap: + {{- toYaml . | nindent 8 }} + {{- end }} + + {{- with .staticAuthorization }} + staticAuthorization: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + + {{- with .prometheus }} + prometheus: + {{- with .serviceMonitor }} + serviceMonitor: + {{- if (hasKey . "enable") }} + enable: {{ .enable }} + {{- end }} + + {{- if (hasKey . "interval") }} + interval: {{ .interval }} + {{- end }} + + {{- with .attachMetadata }} + attachMetadata: + {{- toYaml . | nindent 10 }} + {{- end }} + + {{- if (hasKey . "honorLabels") }} + honorLabels: {{ .honorLabels }} + {{- end }} + + {{- if (hasKey . "honorTimestamps") }} + honorTimestamps: {{ .honorTimestamps }} + {{- end }} + + {{- with .labels }} + labels: + {{- toYaml . | nindent 10 }} + {{- end }} + + {{- with .relabelings }} + relabelings: + {{- toYaml . | nindent 10 }} + {{- end }} + + {{- with .metricRelabelings }} + metricRelabelings: + {{- toYaml . | nindent 10 }} + {{- end }} + + {{- with .authorization }} + authorization: + {{- toYaml . | nindent 10 }} + {{- end }} + + {{- with .tlsConfig }} + tlsConfig: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .Values.deviceConfig.spec.testRunner }} + testRunner: + {{- if (hasKey . "enable") }} + enable: {{ .enable }} + {{- end }} + + {{- with .image }} + image: {{ . }} + {{- end }} + + {{- with .imagePullPolicy }} + imagePullPolicy: {{ . 
}} + {{- end }} + + {{- with .config }} + config: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .logsLocation }} + logsLocation: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .upgradePolicy }} + upgradePolicy: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .tolerations }} + tolerations: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .imageRegistrySecret }} + imageRegistrySecret: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .selector }} + selector: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- end }} + + {{- with .Values.deviceConfig.spec.configManager }} + configManager: + {{- if (hasKey . "enable") }} + enable: {{ .enable }} + {{- end }} + + {{- with .image }} + image: {{ . }} + {{- end }} + + {{- with .imagePullPolicy }} + imagePullPolicy: {{ . }} + {{- end }} + + {{- with .imageRegistrySecret }} + imageRegistrySecret: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .config }} + config: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .selector }} + selector: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .upgradePolicy }} + upgradePolicy: + {{- toYaml . | nindent 6 }} + {{- end }} + + {{- with .configManagerTolerations }} + configManagerTolerations: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- end }} + +{{- end }} +{{- end }} diff --git a/templates/deployment.yaml b/templates/deployment.yaml new file mode 100644 index 0000000..6397c24 --- /dev/null +++ b/templates/deployment.yaml @@ -0,0 +1,83 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-controller-manager + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + control-plane: controller-manager + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} +spec: + replicas: {{ .Values.controllerManager.replicas }} + selector: + matchLabels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + control-plane: controller-manager + {{- include "helm-charts-k8s.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + control-plane: controller-manager + {{- include "helm-charts-k8s.selectorLabels" . | nindent 8 }} + annotations: + kubectl.kubernetes.io/default-container: manager + spec: + {{- with .Values.controllerManager.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + nodeSelector: {{- toYaml .Values.controllerManager.nodeSelector | nindent 8 }} + containers: + - args: {{- toYaml .Values.controllerManager.manager.args | nindent 8 }} + env: + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: KUBERNETES_CLUSTER_DOMAIN + value: {{ quote .Values.kubernetesClusterDomain }} + - name: SIM_ENABLE + value: {{ quote .Values.controllerManager.env.simEnable }} + image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag + | default .Chart.AppVersion }} + imagePullPolicy: {{ .Values.controllerManager.manager.imagePullPolicy }} + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + name: manager + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: {{- toYaml .Values.controllerManager.manager.resources | nindent 10 + }} + securityContext: {{- toYaml .Values.controllerManager.manager.containerSecurityContext + | nindent 10 }} + volumeMounts: + - mountPath: /controller_manager_config.yaml + name: manager-config + subPath: controller_manager_config.yaml + {{- if .Values.controllerManager.manager.imagePullSecrets }} + imagePullSecrets: + - name: {{ 
.Values.controllerManager.manager.imagePullSecrets }} + {{- end}} + securityContext: + runAsNonRoot: true + serviceAccountName: {{ include "helm-charts-k8s.fullname" . }}-controller-manager + terminationGracePeriodSeconds: 10 + {{- with .Values.controllerManager.manager.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - configMap: + name: {{ include "helm-charts-k8s.fullname" . }}-manager-config + name: manager-config diff --git a/templates/event-recorder-clusterrole-rbac.yaml b/templates/event-recorder-clusterrole-rbac.yaml new file mode 100644 index 0000000..c2e2b41 --- /dev/null +++ b/templates/event-recorder-clusterrole-rbac.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-event-recorder-clusterrole + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch \ No newline at end of file diff --git a/templates/event-recorder-clusterrolebinding-rbac.yaml b/templates/event-recorder-clusterrolebinding-rbac.yaml new file mode 100644 index 0000000..ffc1c06 --- /dev/null +++ b/templates/event-recorder-clusterrolebinding-rbac.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-event-recorder-clusterrolebinding + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-event-recorder-clusterrole' +subjects: +- kind: ServiceAccount + name: '{{ include "helm-charts-k8s.fullname" . 
}}-controller-manager' + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/templates/gpu-nfd-default-rule.yaml b/templates/gpu-nfd-default-rule.yaml new file mode 100644 index 0000000..f626f85 --- /dev/null +++ b/templates/gpu-nfd-default-rule.yaml @@ -0,0 +1,217 @@ +{{- if .Values.installdefaultNFDRule }} +apiVersion: nfd.k8s-sigs.io/v1alpha1 +kind: NodeFeatureRule +metadata: + name: amd-gpu-label-nfd-rule + # the PCI info is from these websites: + # source1: https://admin.pci-ids.ucw.cz/read/PC/1002 + # source2: https://devicehunt.com/view/type/pci/vendor/1002 +spec: + rules: + - name: amd-vgpu + labels: + feature.node.kubernetes.io/amd-vgpu: "true" + matchAny: + # AMD Instinct + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["7410"]} # MI210 VF + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74b5"]} # MI300X VF + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74b9"]} # MI325X VF + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["75b0"]} # MI350X VF + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["75b3"]} # MI355X VF + # AMD Radeon Pro + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["7461"]} # Radeon Pro V710 MxGPU + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["73ae"]} # Radeon Pro V620 MxGPU + - name: amd-gpu + labels: + feature.node.kubernetes.io/amd-gpu: "true" + matchAny: + # AMD Instinct + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device:
{op: In, value: ["75a3"]} # MI355X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["75a0"]} # MI350X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74a5"]} # MI325X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74a2"]} # MI308X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74b6"]} # MI308X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74a8"]} # MI308X HF + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74a0"]} # MI300A + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74a1"]} # MI300X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74a9"]} # MI300X HF + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74bd"]} # MI300X HF + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["740f"]} # MI210 + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["7408"]} # MI250X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["740c"]} # MI250/MI250X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["738c"]} # MI100 + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: 
In, value: ["1002"]} + device: {op: In, value: ["738e"]} # MI100 + # AMD Radeon Pro + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["7460"]} # V710 + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["7448"]} # W7900 + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["744a"]} # W7900 Dual Slot + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["7449"]} # W7800 48GB + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["745e"]} # W7800 + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["73a2"]} # W6900X + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["73a3"]} # W6800 GL-XL + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["73ab"]} # W6800X / W6800X Duo + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["73a1"]} # V620 + # AMD Radeon + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["7550"]} # RX 9070 / 9070 XT + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["744c"]} # RX 7900 XT / 7900 XTX / 7900 GRE / 7900M + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["73af"]} # RX 6900 XT + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} 
+ device: {op: In, value: ["73bf"]} # RX 6800 / 6800 XT / 6900 XT + - name: amd-gpu-mi210 + labels: + feature.node.kubernetes.io/amd-gpu-mi210: "true" + matchAny: + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["740f"]} # MI210 + - name: amd-gpu-mi300x + labels: + feature.node.kubernetes.io/amd-gpu-mi300x: "true" + matchAny: + - matchFeatures: + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1002"]} + device: {op: In, value: ["74a1"]} # MI300X +{{- end }} diff --git a/templates/leader-election-rbac.yaml b/templates/leader-election-rbac.yaml new file mode 100644 index 0000000..b852c75 --- /dev/null +++ b/templates/leader-election-rbac.yaml @@ -0,0 +1,50 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-leader-election-role + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - list + - watch + - create + - update + - patch + - delete +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-leader-election-rolebinding + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: '{{ include "helm-charts-k8s.fullname" . }}-leader-election-role' +subjects: +- kind: ServiceAccount + name: '{{ include "helm-charts-k8s.fullname" . 
}}-controller-manager' + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/templates/manager-config.yaml b/templates/manager-config.yaml new file mode 100644 index 0000000..3166446 --- /dev/null +++ b/templates/manager-config.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-manager-config + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +data: + controller_manager_config.yaml: {{ .Values.managerConfig.controllerManagerConfigYaml + | toYaml | indent 1 }} \ No newline at end of file diff --git a/templates/manager-rbac.yaml b/templates/manager-rbac.yaml new file mode 100644 index 0000000..a09a119 --- /dev/null +++ b/templates/manager-rbac.yaml @@ -0,0 +1,216 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-manager-role + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + - services + verbs: + - create + - delete + - get + - list + - patch + - watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - patch + - watch +- apiGroups: + - "" + resources: + - nodes/finalizers + - nodes/status + verbs: + - get + - update + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - create + - delete + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods/eviction + verbs: + - create + - delete + - get + - list +- apiGroups: + - "" + resources: + - pods/finalizers + - pods/status + verbs: + - delete + - get + - list + - watch +- apiGroups: + - "" + resources: + - services/finalizers + verbs: + - create + - get + - update + - watch +- apiGroups: + - amd.com + resources: + - deviceconfigs + verbs: + - create + - get + - list + - patch + - update + - watch +- apiGroups: + - amd.com + resources: + - deviceconfigs/finalizers + verbs: + - update +- apiGroups: + - amd.com + resources: + - deviceconfigs/status + verbs: + - get + - patch + - update +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - delete + - get + - list + - watch +- apiGroups: + - apps + resources: + - daemonsets + - daemonsets/status + verbs: + - create + - delete + - get + - list + - patch + - watch +- apiGroups: + - apps + resources: + - daemonsets/finalizers + verbs: + - create + - get + - update + - watch +- apiGroups: + - kmm.sigs.x-k8s.io + resources: + - modules + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - kmm.sigs.x-k8s.io + resources: + - modules/finalizers + - nodemodulesconfigs/finalizers + verbs: + - get + - update + - watch +- apiGroups: + - kmm.sigs.x-k8s.io + resources: + - modules/status + verbs: + - get + - patch + - update +- apiGroups: + - kmm.sigs.x-k8s.io + resources: + - nodemodulesconfigs + - nodemodulesconfigs/status + verbs: + - get + - 
list + - watch +- apiGroups: + - monitoring.coreos.com + resources: + - servicemonitors + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - nfd.openshift.io + resources: + - nodefeaturediscoveries + verbs: + - delete + - get + - list +- apiGroups: + - nfd.openshift.io + resources: + - nodefeaturediscoveries/finalizers + - nodefeaturediscoveries/status + verbs: + - get + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-manager-rolebinding + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-manager-role' +subjects: +- kind: ServiceAccount + name: '{{ include "helm-charts-k8s.fullname" . }}-controller-manager' + namespace: '{{ .Release.Namespace }}' \ No newline at end of file diff --git a/templates/metrics-exporter-rbac-proxy-rbac.yaml b/templates/metrics-exporter-rbac-proxy-rbac.yaml new file mode 100644 index 0000000..ae52b5b --- /dev/null +++ b/templates/metrics-exporter-rbac-proxy-rbac.yaml @@ -0,0 +1,55 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-metrics-exporter-rbac-proxy + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - watch + - get + - list + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-metrics-exporter-rbac-proxy + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-metrics-exporter-rbac-proxy' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-metrics-exporter-rbac-proxy + namespace: '{{ .Release.Namespace }}' diff --git a/templates/metrics-exporter-rbac.yaml b/templates/metrics-exporter-rbac.yaml new file mode 100644 index 0000000..f241cbc --- /dev/null +++ b/templates/metrics-exporter-rbac.yaml @@ -0,0 +1,43 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-metrics-exporter + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - watch + - get + - list + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-metrics-exporter + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-metrics-exporter' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-metrics-exporter + namespace: '{{ .Release.Namespace }}' diff --git a/templates/nic-nfd-default-rule.yaml b/templates/nic-nfd-default-rule.yaml new file mode 100644 index 0000000..da4251d --- /dev/null +++ b/templates/nic-nfd-default-rule.yaml @@ -0,0 +1,39 @@ +{{- if .Values.installdefaultNFDRule }} +apiVersion: nfd.k8s-sigs.io/v1alpha1 +kind: NodeFeatureRule +metadata: + name: amd-nic-label-nfd-rule + # the PCI info is from these websites: + # source1: https://admin.pci-ids.ucw.cz/read/PC/1dd8 + # source2: https://devicehunt.com/view/type/pci/vendor/1dd8 +spec: + rules: + - name: amd-vnic + labels: + feature.node.kubernetes.io/amd-vnic: "true" + matchAny: + - matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + ionic: {op: Exists} + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1dd8"]} # AMD Pensando Systems + device: {op: In, value: ["1003"]} # DSC Ethernet Controller VF + subsystem_vendor: {op: In, value: ["1dd8"]} + subsystem_device: {op: In, value: ["5201"]} # POLLARA-1Q400 100/200/400G 1-port Card + - name: amd-nic + labels: + feature.node.kubernetes.io/amd-nic: "true" + matchAny: + - matchFeatures: + - feature: kernel.loadedmodule + matchExpressions: + ionic: {op: Exists} + - feature: pci.device + matchExpressions: + vendor: {op: In, value: ["1dd8"]} # AMD Pensando Systems + device: {op: In, value: ["1002"]} # DSC Ethernet Controller + subsystem_vendor: {op: In, value: ["1dd8"]} + subsystem_device: {op: In, value: ["5201"]} # POLLARA-1Q400 100/200/400G 1-port Card +{{- end }} diff --git a/templates/node-labeller-rbac.yaml b/templates/node-labeller-rbac.yaml new file mode 100644 index 0000000..7802407 --- /dev/null +++ b/templates/node-labeller-rbac.yaml @@ -0,0 +1,35 @@ +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-node-labeller + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - nodes + verbs: + - watch + - get + - list + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-node-labeller + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-node-labeller' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-node-labeller + namespace: '{{ .Release.Namespace }}' diff --git a/templates/post-delete-hook.yaml b/templates/post-delete-hook.yaml new file mode 100644 index 0000000..ad54a95 --- /dev/null +++ b/templates/post-delete-hook.yaml @@ -0,0 +1,117 @@ +# Run helm uninstall with --no-hooks to bypass the post-delete hook +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-prune + namespace: {{ .Release.Namespace }} + labels: + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "0" + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-prune + labels: + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} + annotations: + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "0" + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - delete + - get + - list +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-prune + labels: + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "1" + "helm.sh/hook": post-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "helm-charts-k8s.fullname" . }}-prune +subjects: +- kind: ServiceAccount + name: {{ include "helm-charts-k8s.fullname" . }}-prune + namespace: {{ .Release.Namespace }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: delete-custom-resource-definitions + namespace: {{ .Release.Namespace }} + labels: + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "2" + # hook will be executed after helm uninstall + "helm.sh/hook": post-delete + # Helm removes a previous hook Job before re-creating it and removes it once it succeeds (hook-failed is not set, so Helm itself keeps a failed Job) + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded +spec: + backoffLimit: 0 # once the job finished first run, don't retry to create another pod + ttlSecondsAfterFinished: 60 # job info will be kept for 1 min then deleted + template: + spec: + serviceAccountName: {{ include "helm-charts-k8s.fullname" .
}}-prune + containers: + - name: delete-custom-resource-definitions + image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag }} + command: + - /bin/sh + - -c + - | + if kubectl get crds deviceconfigs.amd.com > /dev/null 2>&1; then + kubectl delete crds deviceconfigs.amd.com + fi + {{- if index .Values "node-feature-discovery" "enabled" }} + if kubectl get crds nodefeaturegroups.nfd.k8s-sigs.io > /dev/null 2>&1; then + kubectl delete crds nodefeaturegroups.nfd.k8s-sigs.io + fi + if kubectl get crds nodefeaturerules.nfd.k8s-sigs.io > /dev/null 2>&1; then + kubectl delete crds nodefeaturerules.nfd.k8s-sigs.io + fi + if kubectl get crds nodefeatures.nfd.k8s-sigs.io > /dev/null 2>&1; then + kubectl delete crds nodefeatures.nfd.k8s-sigs.io + fi + {{- end }} + {{- if .Values.kmm.enabled }} + if kubectl get crds modules.kmm.sigs.x-k8s.io > /dev/null 2>&1; then + kubectl delete crds modules.kmm.sigs.x-k8s.io + fi + if kubectl get crds nodemodulesconfigs.kmm.sigs.x-k8s.io > /dev/null 2>&1; then + kubectl delete crds nodemodulesconfigs.kmm.sigs.x-k8s.io + fi + {{- end }} + {{- if .Values.controllerManager.manager.imagePullSecrets }} + imagePullSecrets: + - name: {{ .Values.controllerManager.manager.imagePullSecrets }} + {{- end }} + {{- with .Values.controllerManager.manager.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controllerManager.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + restartPolicy: Never diff --git a/templates/pre-delete-hook.yaml b/templates/pre-delete-hook.yaml new file mode 100644 index 0000000..381e78d --- /dev/null +++ b/templates/pre-delete-hook.yaml @@ -0,0 +1,101 @@ +# Run helm uninstall with --no-hooks to bypass the pre-delete hook +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-pre-delete + namespace: {{ .Release.Namespace }} + labels: + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} + annotations: + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "0" + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-pre-delete + labels: + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "0" + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +rules: + - apiGroups: + - amd.com + resources: + - deviceconfigs + verbs: + - get + - list + - delete +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-pre-delete + labels: + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "1" + "helm.sh/hook": pre-delete + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "helm-charts-k8s.fullname" . }}-pre-delete +subjects: +- kind: ServiceAccount + name: {{ include "helm-charts-k8s.fullname" . }}-pre-delete + namespace: {{ .Release.Namespace }} +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: delete-leftover-deviceconfigs + namespace: {{ .Release.Namespace }} + labels: + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} + annotations: + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "2" + # hook will be executed before helm uninstall + "helm.sh/hook": pre-delete + # Helm removes a previous hook Job before re-creating it and removes it once it succeeds (hook-failed is not set, so Helm itself keeps a failed Job) + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + backoffLimit: 0 # once the job finished first run, don't retry to create another pod + ttlSecondsAfterFinished: 60 # job info will be kept for 1 min then deleted + template: + spec: + serviceAccountName: {{ include "helm-charts-k8s.fullname" . }}-pre-delete + containers: + - name: delete-leftover-deviceconfigs + image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag }} + command: + - /bin/sh + - -c + - | + installed=$(kubectl api-resources -owide | grep -i amd.com | grep -i deviceconfig) + if [ -z "${installed}" ] ; then + exit 0 + fi + # Delete all existing DeviceConfig custom resources (only reached when the DeviceConfig API is registered) + kubectl delete deviceconfigs.amd.com --all -A + {{- if .Values.controllerManager.manager.imagePullSecrets }} + imagePullSecrets: + - name: {{ .Values.controllerManager.manager.imagePullSecrets }} + {{- end}} + {{- with .Values.controllerManager.manager.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controllerManager.affinity }} + affinity: + {{- toYaml .
| nindent 8 }} + {{- end }} + restartPolicy: Never diff --git a/templates/pre-upgrade-hook.yaml b/templates/pre-upgrade-hook.yaml new file mode 100644 index 0000000..5a30e9d --- /dev/null +++ b/templates/pre-upgrade-hook.yaml @@ -0,0 +1,229 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: pre-upgrade-check-sa + annotations: + # hook will be executed before helm upgrade + "helm.sh/hook": pre-upgrade,pre-rollback + # don't cleanup the job on hook failure + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "0" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: pre-upgrade-check-cluster-role + annotations: + # hook will be executed before helm upgrade + "helm.sh/hook": pre-upgrade,pre-rollback + # don't cleanup the job on hook failure + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "0" +rules: + - apiGroups: + - amd.com + resources: + - deviceconfigs + verbs: + - list + - get +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pre-upgrade-check-cluster-role-binding + annotations: + # hook will be executed before helm upgrade + "helm.sh/hook": pre-upgrade,pre-rollback + # don't cleanup the job on hook failure + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "1" +subjects: + - kind: ServiceAccount + name: pre-upgrade-check-sa + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: pre-upgrade-check-cluster-role + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: pre-upgrade-check + namespace: {{ .Release.Namespace }} + labels: + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} + annotations: + # hook will be executed before helm upgrade + "helm.sh/hook": pre-upgrade,pre-rollback + # don't cleanup the job on hook failure + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "2" +spec: + backoffLimit: 0 # once the job finished first run, don't retry to create another pod + ttlSecondsAfterFinished: 60 # job info will be kept for 1 min then deleted + template: + spec: + serviceAccountName: pre-upgrade-check-sa + containers: + - name: pre-upgrade-check + image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag }} + command: + - /bin/sh + - -c + - | + # Ignore the lack of CRDs, probably haven't actually been installed yet + # this provides idempotency when "things" don't understand the difference between + # install and upgrade. E.g. Argo turns pre-upgrade hook into its PreSync hook + installed=$(kubectl api-resources -owide | grep -i amd.com | grep -i deviceconfig) + if [ -z "${installed}" ] ; then + exit 0 + fi + + # List all DeviceConfig CRs + deviceconfigs=$(kubectl get deviceconfigs -n {{ .Release.Namespace }} -o json) + + echo "DeviceConfigs JSON:" + echo "$deviceconfigs" | jq . + + # Check if any UpgradeState is in the blocked states; per-DeviceConfig results are collected into one array because jq -e only reflects the last output value + blocked_states='["Upgrade-Not-Started", "Upgrade-Started", "Install-In-Progress", "Upgrade-In-Progress"]' + if echo "$deviceconfigs" | jq --argjson blocked_states "$blocked_states" -e ' + [.items[] | + .status.nodeModuleStatus // {} | + to_entries | + any(.value.status as $state | ($blocked_states | index($state)))] | any' > /dev/null; then + echo "Upgrade blocked: Some DeviceConfigs are in a disallowed UpgradeState." + exit 1 + else + echo "All DeviceConfigs are in an allowed state. Proceeding with upgrade."
+ exit 0 + fi + {{- if .Values.controllerManager.manager.imagePullSecrets }} + imagePullSecrets: + - name: {{ .Values.controllerManager.manager.imagePullSecrets }} + {{- end }} + {{- with .Values.controllerManager.manager.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controllerManager.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + restartPolicy: Never +{{- if .Values.upgradeCRD }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: upgrade-crd-hook-sa + annotations: + # hook will be executed before helm upgrade + "helm.sh/hook": pre-upgrade,pre-rollback + # don't cleanup the job on hook failure + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "1" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: upgrade-crd-hook-cluster-role + annotations: + # hook will be executed before helm upgrade + "helm.sh/hook": pre-upgrade,pre-rollback + # don't cleanup the job on hook failure + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "1" +rules: + - apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - create + - get + - list + - watch + - patch + - update +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: upgrade-crd-hook-cluster-role-binding + annotations: + # hook will be executed before helm upgrade + "helm.sh/hook": pre-upgrade,pre-rollback + # don't cleanup the job on hook failure + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "2" +subjects: + - kind: ServiceAccount + name: upgrade-crd-hook-sa + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: upgrade-crd-hook-cluster-role + 
apiGroup: rbac.authorization.k8s.io +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: upgrade-crd + namespace: {{ .Release.Namespace }} + labels: + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + # hook will be executed before helm upgrade + "helm.sh/hook": pre-upgrade,pre-rollback + # don't cleanup the job on hook failure + "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded + # hook with lower weight value will run firstly + "helm.sh/hook-weight": "3" +spec: + template: + metadata: + name: upgrade-crd + spec: + serviceAccountName: upgrade-crd-hook-sa + {{- if .Values.controllerManager.manager.imagePullSecrets }} + imagePullSecrets: + - name: {{ .Values.controllerManager.manager.imagePullSecrets }} + {{- end }} + {{- with .Values.controllerManager.manager.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.controllerManager.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: upgrade-crd + image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag }} + imagePullPolicy: {{ .Values.controllerManager.manager.imagePullPolicy }} + command: + - /bin/sh + - -c + - | + kubectl apply -f /opt/helm-charts-crds-k8s/deviceconfig-crd.yaml + {{- if index .Values "node-feature-discovery" "enabled" }} + kubectl apply -f /opt/helm-charts-crds-k8s/nfd-api-crds.yaml + {{- end }} + {{- if .Values.kmm.enabled }} + kubectl apply -f /opt/helm-charts-crds-k8s/module-crd.yaml + kubectl apply -f /opt/helm-charts-crds-k8s/nodemodulesconfig-crd.yaml + {{- end }} + restartPolicy: OnFailure +{{- end }} +# Run helm upgrade with --no-hooks to bypass the pre-upgrade hook \ No newline at end of file diff --git a/templates/serviceaccount.yaml b/templates/serviceaccount.yaml new file mode 100644 index 0000000..ae6c0be --- /dev/null +++ b/templates/serviceaccount.yaml @@ -0,0 +1,98 @@ +apiVersion: v1 +kind: ServiceAccount 
+metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-controller-manager + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.controllerManager.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-kmm-device-plugin + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.kmmDevicePlugin.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-kmm-module-loader + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.kmmModuleLoader.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-node-labeller + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.nodeLabeller.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-metrics-exporter + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.metricsExporter.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-metrics-exporter-rbac-proxy + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . 
| nindent 4 }} + annotations: + {{- toYaml .Values.metricsExporter.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-test-runner + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.testRunner.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-config-manager + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.configManager.serviceAccount.annotations | nindent 4 }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: amd-gpu-operator-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} + annotations: + {{- toYaml .Values.utilsContainer.serviceAccount.annotations | nindent 4 }} \ No newline at end of file diff --git a/templates/test-runner-rbac.yaml b/templates/test-runner-rbac.yaml new file mode 100644 index 0000000..21e7b39 --- /dev/null +++ b/templates/test-runner-rbac.yaml @@ -0,0 +1,41 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-test-runner + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - create + - get + - list + - update +- apiGroups: + - "" + resources: + - nodes + verbs: + - patch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . 
}}-test-runner + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . }}-test-runner' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-test-runner + namespace: '{{ .Release.Namespace }}' diff --git a/templates/utils-container-rbac.yaml b/templates/utils-container-rbac.yaml new file mode 100644 index 0000000..99ee6d4 --- /dev/null +++ b/templates/utils-container-rbac.yaml @@ -0,0 +1,34 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +rules: +- apiGroups: + - security.openshift.io + resourceNames: + - privileged + resources: + - securitycontextconstraints + verbs: + - use +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "helm-charts-k8s.fullname" . }}-utils-container + labels: + app.kubernetes.io/component: amd-gpu + app.kubernetes.io/part-of: amd-gpu + {{- include "helm-charts-k8s.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: '{{ include "helm-charts-k8s.fullname" . 
}}-utils-container' +subjects: +- kind: ServiceAccount + name: amd-gpu-operator-utils-container + namespace: '{{ .Release.Namespace }}' diff --git a/values.yaml b/values.yaml new file mode 100644 index 0000000..af6190a --- /dev/null +++ b/values.yaml @@ -0,0 +1,312 @@ +# NFD related configs +# schema reference: https://github.com/kubernetes-sigs/node-feature-discovery/blob/release-0.16/deployment/helm/node-feature-discovery/values.yaml +node-feature-discovery: + # -- Set to true/false to enable/disable the installation of node feature discovery (NFD) operator + enabled: true + worker: + # -- Set tolerations for NFD worker daemonset + tolerations: + - key: "amd-dcm" + operator: "Equal" + value: "up" + effect: "NoExecute" + # -- Set nodeSelector for NFD worker daemonset + nodeSelector: {} +# KMM related configs +kmm: + # -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator + enabled: true +# -- Default NFD rule will detect amd gpu based on pci vendor ID +installdefaultNFDRule: true +# -- CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart +upgradeCRD: true +crds: + defaultCR: + # -- Deploy default DeviceConfig during helm chart installation + install: true + # -- Deploy / Patch default DeviceConfig during helm chart upgrade. Be careful about this option: 1. Your customized change on default DeviceConfig may be overwritten 2. 
Your existing DeviceConfig may conflict with upgraded default DeviceConfig + upgrade: false +deviceConfig: + spec: + # -- Set node selector for the default DeviceConfig + selector: + feature.node.kubernetes.io/amd-gpu: "true" + driver: + # -- enable/disable out-of-tree driver management, set to false to use inbox driver + enable: false + # -- enable/disable putting a blacklist amdgpu entry in modprobe config, which requires node labeller to run + blacklist: false + # -- image repository to store out-of-tree driver image, DO NOT put image tag since operator automatically manage it for users + image: "docker.io/myUserName/driverImage" + # -- image pull secret for pull/push access of the driver image repository, input secret name like {"name": "mysecret"} + imageRegistrySecret: {} + imageRegistryTLS: + # -- set to true to use plain HTTP for driver image repository + insecure: false + # -- set to true to skip TLS validation for driver image repository + insecureSkipTLSVerify: false + # -- specify an out-of-tree driver version to install + version: "6.4" + # -- specify the secrets to sign the out-of-tree kernel module inside driver image for secure boot, e.g. input private / public key secret {"keySecret":{"name":"privateKeySecret"},"certSecret":{"name":"publicKeySecret"}} + imageSign: {} + # -- configure the out-of-tree driver image build within the cluster. e.g. 
{"baseImageRegistry":"docker.io","baseImageRegistryTLS":{"insecure":"false","insecureSkipTLSVerify":"false"}} + imageBuild: {} + # -- configure driver tolerations so that operator can manage out-of-tree drivers on tainted nodes + tolerations: [] + upgradePolicy: + # -- enable/disable automatic driver upgrade feature + enable: true + # -- how many nodes can be upgraded in parallel + maxParallelUpgrades: 3 + # -- maximum number of nodes that can be in a failed upgrade state beyond which upgrades will stop to keep cluster at a minimal healthy state + maxUnavailableNodes: 25% + # -- whether to reboot each worker node during the driver upgrade + rebootRequired: true + nodeDrainPolicy: + # -- whether force draining is allowed or not + force: true + # -- the length of time in seconds to wait before giving up drain, zero means infinite + timeoutSeconds: 300 + # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, a negative value means follow the pod-defined grace period + gracePeriodSeconds: -1 + podDeletionPolicy: + # -- whether force deletion is allowed or not + force: true + # -- the length of time in seconds to wait before giving up on pod deletion, zero means infinite + timeoutSeconds: 300 + # -- the time kubernetes waits for a pod to shut down gracefully after receiving a termination signal, zero means immediate, a negative value means follow the pod-defined grace period + gracePeriodSeconds: -1 + commonConfig: + # -- init container image + initContainerImage: busybox:1.36 + utilsContainer: + # -- gpu operator utility container image + image: docker.io/rocm/gpu-operator-utils:v1.4.0 + # -- utility container image pull policy + imagePullPolicy: IfNotPresent + # -- utility container image pull secret, e.g.
{"name": "mySecretName"} + imageRegistrySecret: {} + devicePlugin: + # -- device plugin image + devicePluginImage: rocm/k8s-device-plugin:latest + # -- device plugin image pull policy + devicePluginImagePullPolicy: IfNotPresent + # -- device plugin tolerations + devicePluginTolerations: [] + # -- pass supported flags and their values while starting device plugin daemonset, e.g. {"resource_naming_strategy": "single"} or {"resource_naming_strategy": "mixed"} + devicePluginArguments: {} + # -- enable / disable node labeller + enableNodeLabeller: true + # -- node labeller image + nodeLabellerImage: rocm/k8s-device-plugin:labeller-latest + # -- node labeller image pull policy + nodeLabellerImagePullPolicy: IfNotPresent + # -- node labeller tolerations + nodeLabellerTolerations: [] + # -- pass supported labels while starting node labeller daemonset, default ["vram", "cu-count", "simd-count", "device-id", "family", "product-name", "driver-version"], also support ["compute-memory-partition", "compute-partitioning-supported", "memory-partitioning-supported"] + nodeLabellerArguments: [] + # -- image pull secret for device plugin and node labeller, e.g. {"name": "mySecretName"} + imageRegistrySecret: {} + upgradePolicy: + # -- the type of daemonset upgrade, RollingUpdate or OnDelete + upgradeStrategy: RollingUpdate + # -- the maximum number of Pods that can be unavailable during the update process + maxUnavailable: 1 + metricsExporter: + # -- enable / disable device metrics exporter + enable: true + # -- type of service for exposing metrics endpoint, ClusterIP or NodePort + serviceType: ClusterIP + # -- internal port used for in-cluster and node access to pull metrics from the metrics-exporter (default 5000). 
+ port: 5000 + # -- external port for pulling metrics from outside the cluster when serviceType is NodePort, in the range 30000-32767 (this chart sets 32500 by default) + nodePort: 32500 + # -- metrics exporter image + image: docker.io/rocm/device-metrics-exporter:v1.4.0 + # -- metrics exporter image pull policy + imagePullPolicy: "IfNotPresent" + # -- name of the metrics exporter config map, e.g. {"name": "metricConfigMapName"} + config: {} + # -- metrics exporter tolerations + tolerations: [] + # -- metrics exporter image pull secret, e.g. {"name": "pullSecretName"} + imageRegistrySecret: {} + # -- metrics exporter node selector, if not specified it will reuse spec.selector + selector: {} + upgradePolicy: + # -- the type of daemonset upgrade, RollingUpdate or OnDelete + upgradeStrategy: RollingUpdate + # -- the maximum number of Pods that can be unavailable during the update process + maxUnavailable: 1 + rbacConfig: + # -- enable/disable kube rbac proxy + enable: false + # -- kube rbac proxy side car container image + image: quay.io/brancz/kube-rbac-proxy:v0.18.1 + # -- disable https protecting the proxy endpoint + disableHttps: false + # -- certificate secret to mount in kube-rbac container for TLS, self signed certificates will be generated by default, e.g. {"name": "secretName"} + secret: {} + # -- reference to a configmap containing the client CA (key: ca.crt) for mTLS client validation, e.g. {"name": "configMapName"} + clientCAConfigMap: {} + staticAuthorization: + # -- enables static authorization using client certificate CN + enable: false + # -- expected CN (Common Name) from client cert (e.g., Prometheus SA identity) + clientName: "" + prometheus: + serviceMonitor: + # -- enable or disable ServiceMonitor creation + enable: false + # -- frequency to scrape metrics. Accepts values with time unit suffix: "30s", "1m", "2h", "500ms" + interval: 30s + # -- define if Prometheus should attach node metadata to the target, e.g.
{"node": "true"} + attachMetadata: {} + # -- choose the metric's labels on collisions with target labels + honorLabels: true + # -- control whether the scrape endpoints honor timestamps + honorTimestamps: false + # -- additional labels to add to the ServiceMonitor + labels: {} + # -- relabelConfigs to apply to samples before ingestion + relabelings: [] + # -- relabeling rules applied to individual scraped metrics + metricRelabelings: [] + # -- optional Prometheus authorization configuration for accessing the endpoint + authorization: {} + # -- TLS settings used by Prometheus to connect to the metrics endpoint + tlsConfig: {} + testRunner: + # -- enable / disable test runner + enable: false + # -- test runner image + image: docker.io/rocm/test-runner:v1.4.0 + # -- test runner image pull policy + imagePullPolicy: "IfNotPresent" + # -- test runner config map, e.g. {"name": "myConfigMap"} + config: {} + logsLocation: + # -- test runner internal mounted directory to save test run logs + mountPath: "/var/log/amd-test-runner" + # -- host directory to save test run logs + hostPath: "/var/log/amd-test-runner" + # -- a list of secrets that contain connectivity info to multiple cloud providers + logsExportSecrets: [] + upgradePolicy: + # -- the type of daemonset upgrade, RollingUpdate or OnDelete + upgradeStrategy: RollingUpdate + # -- the maximum number of Pods that can be unavailable during the update process + maxUnavailable: 1 + # -- test runner tolerations + tolerations: [] + # -- test runner image pull secret + imageRegistrySecret: {} + # -- test runner node selector, if not specified it will reuse spec.selector + selector: {} + configManager: + # -- enable/disable the config manager + enable: false + # -- config manager image + image: docker.io/rocm/device-config-manager:v1.4.0 + # -- image pull policy for config manager image + imagePullPolicy: IfNotPresent + # -- image pull secret for config manager image, e.g. 
{"name": "myPullSecret"} + imageRegistrySecret: {} + # -- config map for config manager, e.g. {"name": "myConfigMap"} + config: {} + # -- node selector for config manager, if not specified it will reuse spec.selector + selector: {} + upgradePolicy: + # -- the type of daemonset upgrade, RollingUpdate or OnDelete + upgradeStrategy: RollingUpdate + # -- the maximum number of Pods that can be unavailable during the update process + maxUnavailable: 1 + # -- config manager tolerations + configManagerTolerations: [] +# AMD GPU operator controller related configs +controllerManager: + manager: + args: + - --config=controller_manager_config.yaml + containerSecurityContext: + allowPrivilegeEscalation: false + image: + # -- AMD GPU operator controller manager image repository + repository: docker.io/rocm/gpu-operator + # -- AMD GPU operator controller manager image tag + tag: v1.4.0 + # -- Image pull policy for AMD GPU operator controller manager pod + imagePullPolicy: Always + # -- Image pull secret name for pulling AMD GPU operator controller manager image if registry needs credential to pull image + imagePullSecrets: "" + tolerations: + - key: "node-role.kubernetes.io/master" + operator: "Equal" + value: "" + effect: "NoSchedule" + - key: "node-role.kubernetes.io/control-plane" + operator: "Equal" + value: "" + effect: "NoSchedule" + resources: + limits: + cpu: 1000m + memory: 1Gi + requests: + cpu: 100m + memory: 256Mi + # -- Node selector for AMD GPU operator controller manager deployment + nodeSelector: {} + # -- Deployment affinity configs for controller manager + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + replicas: 1 + serviceAccount: + annotations: {} + env: + simEnable: false +kmmDevicePlugin: + serviceAccount: + annotations: {} +kmmModuleLoader: + serviceAccount: + annotations: {} +kubernetesClusterDomain: 
cluster.local +managerConfig: + controllerManagerConfigYaml: |- + healthProbeBindAddress: :8081 + metricsBindAddress: 127.0.0.1:8080 + leaderElection: + enabled: true + resourceID: gpu.amd.com +metricsService: + ports: + - name: https + port: 8443 + protocol: TCP + targetPort: https + type: ClusterIP +nodeLabeller: + serviceAccount: + annotations: {} +metricsExporter: + serviceAccount: + annotations: {} +testRunner: + serviceAccount: + annotations: {} +configManager: + serviceAccount: + annotations: {} +utilsContainer: + serviceAccount: + annotations: {} +global: + proxy: + env: {}