Files
gpu-operator-charts/crds/deviceconfig-crd.yaml
2025-12-16 17:56:13 +11:00

1605 lines
86 KiB
YAML

---
# Source: gpu-operator-charts/templates/deviceconfig-crd.yaml
# Header of the CustomResourceDefinition object: API version, kind, and the
# object metadata (name, controller-gen annotation, chart labels).
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: deviceconfigs.amd.com
  annotations:
    controller-gen.kubebuilder.io/version: v0.17.0
  labels:
    app.kubernetes.io/component: amd-gpu
    app.kubernetes.io/part-of: amd-gpu
    helm.sh/chart: gpu-operator-charts-v1.4.0
    app.kubernetes.io/name: gpu-operator-charts
    app.kubernetes.io/instance: amd-gpu
    app.kubernetes.io/version: "v1.4.0"
    app.kubernetes.io/managed-by: Helm
spec:
# API group, resource names, and scope of the DeviceConfig resource.
group: amd.com
names:
  kind: DeviceConfig
  listKind: DeviceConfigList
  plural: deviceconfigs
  shortNames:
  - gpue
  singular: deviceconfig
scope: Namespaced
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
description: DeviceConfig describes how to enable AMD GPU device
properties:
# Schema property for the object's apiVersion field (a string).
apiVersion:
  description: |-
    APIVersion defines the versioned schema of this representation of an object.
    Servers should convert recognized schemas to the latest internal value, and
    may reject unrecognized values.
    More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
  type: string
# Schema property for the object's kind field (a string).
kind:
  description: |-
    Kind is a string value representing the REST resource this object represents.
    Servers may infer this from the endpoint the client submits requests to.
    Cannot be updated.
    In CamelCase.
    More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
  type: string
# Schema property for the object's metadata; only its type is constrained here.
metadata:
  type: object
spec:
description: DeviceConfigSpec describes how the AMD GPU operator should
enable AMD GPU device for customer's use.
properties:
# spec.commonConfig: the init container image used by the operand pods and the
# settings of the operator's utils container (image, pull policy, pull secret).
commonConfig:
  description: common config
  properties:
    initContainerImage:
      description: InitContainerImage is being used for the operands pods,
        i.e. metrics exporter, test runner, device plugin, device config
        manager and node labeller
      type: string
    utilsContainer:
      description: UtilsContainer contains parameters to configure operator's
        utils container
      properties:
        image:
          description: Image is the image of utils container
          pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
          type: string
        imagePullPolicy:
          description: image pull policy for utils container
          enum:
          - Always
          - IfNotPresent
          - Never
          type: string
        imageRegistrySecret:
          description: secret used for pull utils container image
          properties:
            name:
              default: ""
              description: |-
                Name of the referent.
                This field is effectively required, but due to backwards compatibility is
                allowed to be empty. Instances of this type with an empty value here are
                almost certainly wrong.
                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
              type: string
          type: object
          x-kubernetes-map-type: atomic
      type: object
  type: object
# spec.configManager: settings for the device config manager DaemonSet:
# optional ConfigMap reference, tolerations, enable switch, image and pull
# settings, node selector, and the DaemonSet upgrade policy.
configManager:
  description: config manager
  properties:
    config:
      description: config map to customize the config for config manager,
        if not specified default config will be applied
      properties:
        name:
          default: ""
          description: |-
            Name of the referent.
            This field is effectively required, but due to backwards compatibility is
            allowed to be empty. Instances of this type with an empty value here are
            almost certainly wrong.
            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
          type: string
      type: object
      x-kubernetes-map-type: atomic
    # Each item has the toleration fields effect/key/operator/tolerationSeconds/value.
    configManagerTolerations:
      description: tolerations for the device config manager DaemonSet
      items:
        description: |-
          The pod this Toleration is attached to tolerates any taint that matches
          the triple <key,value,effect> using the matching operator <operator>.
        properties:
          effect:
            description: |-
              Effect indicates the taint effect to match. Empty means match all taint effects.
              When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
            type: string
          key:
            description: |-
              Key is the taint key that the toleration applies to. Empty means match all taint keys.
              If the key is empty, operator must be Exists; this combination means to match all values and all keys.
            type: string
          operator:
            description: |-
              Operator represents a key's relationship to the value.
              Valid operators are Exists and Equal. Defaults to Equal.
              Exists is equivalent to wildcard for value, so that a pod can
              tolerate all taints of a particular category.
            type: string
          tolerationSeconds:
            description: |-
              TolerationSeconds represents the period of time the toleration (which must be
              of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
              it is not set, which means tolerate the taint forever (do not evict). Zero and
              negative values will be treated as 0 (evict immediately) by the system.
            format: int64
            type: integer
          value:
            description: |-
              Value is the taint value the toleration matches to.
              If the operator is Exists, the value should be empty, otherwise just a regular string.
            type: string
        type: object
      type: array
    enable:
      description: enable config manager, disabled by default
      type: boolean
    image:
      description: config manager image
      pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
      type: string
    imagePullPolicy:
      description: image pull policy for config manager
      enum:
      - Always
      - IfNotPresent
      - Never
      type: string
    imageRegistrySecret:
      description: config manager image registry secret used to pull/push
        images
      properties:
        name:
          default: ""
          description: |-
            Name of the referent.
            This field is effectively required, but due to backwards compatibility is
            allowed to be empty. Instances of this type with an empty value here are
            almost certainly wrong.
            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
          type: string
      type: object
      x-kubernetes-map-type: atomic
    # Free-form string-to-string map (additionalProperties of type string).
    selector:
      additionalProperties:
        type: string
      description: Selector describes on which nodes to enable config
        manager
      type: object
    upgradePolicy:
      description: upgrade policy for config manager daemonset
      properties:
        maxUnavailable:
          default: 1
          description: MaxUnavailable specifies the maximum number of
            Pods that can be unavailable during the update process. Applicable
            for RollingUpdate only. Default value is 1.
          format: int32
          type: integer
        upgradeStrategy:
          description: UpgradeStrategy specifies the type of the DaemonSet
            update. Valid values are "RollingUpdate" (default) or "OnDelete".
          enum:
          - RollingUpdate
          - OnDelete
          type: string
      type: object
  type: object
# spec.devicePlugin: settings shared by the device plugin and node labeller
# DaemonSets: arguments, images, pull policies, tolerations, the node
# labeller enable switch, pull secret, and the upgrade policy.
devicePlugin:
  description: device plugin
  properties:
    # Free-form string-to-string map of flag name to flag value.
    devicePluginArguments:
      additionalProperties:
        type: string
      description: |-
        device plugin arguments is used to pass supported flags and their values while starting device plugin daemonset
        supported flag values: {"resource_naming_strategy": {"single", "mixed"}}
      type: object
    devicePluginImage:
      description: device plugin image
      pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
      type: string
    devicePluginImagePullPolicy:
      description: image pull policy for device plugin
      enum:
      - Always
      - IfNotPresent
      - Never
      type: string
    devicePluginTolerations:
      description: tolerations for the device plugin DaemonSet
      items:
        description: |-
          The pod this Toleration is attached to tolerates any taint that matches
          the triple <key,value,effect> using the matching operator <operator>.
        properties:
          effect:
            description: |-
              Effect indicates the taint effect to match. Empty means match all taint effects.
              When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
            type: string
          key:
            description: |-
              Key is the taint key that the toleration applies to. Empty means match all taint keys.
              If the key is empty, operator must be Exists; this combination means to match all values and all keys.
            type: string
          operator:
            description: |-
              Operator represents a key's relationship to the value.
              Valid operators are Exists and Equal. Defaults to Equal.
              Exists is equivalent to wildcard for value, so that a pod can
              tolerate all taints of a particular category.
            type: string
          tolerationSeconds:
            description: |-
              TolerationSeconds represents the period of time the toleration (which must be
              of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
              it is not set, which means tolerate the taint forever (do not evict). Zero and
              negative values will be treated as 0 (evict immediately) by the system.
            format: int64
            type: integer
          value:
            description: |-
              Value is the taint value the toleration matches to.
              If the operator is Exists, the value should be empty, otherwise just a regular string.
            type: string
        type: object
      type: array
    enableNodeLabeller:
      default: true
      description: enable or disable the node labeller
      type: boolean
    imageRegistrySecret:
      description: node labeller image registry secret used to pull/push
        images
      properties:
        name:
          default: ""
          description: |-
            Name of the referent.
            This field is effectively required, but due to backwards compatibility is
            allowed to be empty. Instances of this type with an empty value here are
            almost certainly wrong.
            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
          type: string
      type: object
      x-kubernetes-map-type: atomic
    # List of label names (strings) passed to the node labeller.
    nodeLabellerArguments:
      description: |-
        node labeller arguments is used to pass supported labels while starting node labeller daemonset
        some flags are enabled by default as they are applicable and bare minimum for all setups and are supported in all versions of node labeller
        default flags: {"vram", "cu-count", "simd-count", "device-id", "family", "product-name", "driver-version"}
        supported flags: {"compute-memory-partition", "compute-partitioning-supported", "memory-partitioning-supported"}
      items:
        type: string
      type: array
    nodeLabellerImage:
      description: node labeller image
      pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
      type: string
    nodeLabellerImagePullPolicy:
      description: image pull policy for node labeller
      enum:
      - Always
      - IfNotPresent
      - Never
      type: string
    nodeLabellerTolerations:
      description: tolerations for the node labeller DaemonSet
      items:
        description: |-
          The pod this Toleration is attached to tolerates any taint that matches
          the triple <key,value,effect> using the matching operator <operator>.
        properties:
          effect:
            description: |-
              Effect indicates the taint effect to match. Empty means match all taint effects.
              When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
            type: string
          key:
            description: |-
              Key is the taint key that the toleration applies to. Empty means match all taint keys.
              If the key is empty, operator must be Exists; this combination means to match all values and all keys.
            type: string
          operator:
            description: |-
              Operator represents a key's relationship to the value.
              Valid operators are Exists and Equal. Defaults to Equal.
              Exists is equivalent to wildcard for value, so that a pod can
              tolerate all taints of a particular category.
            type: string
          tolerationSeconds:
            description: |-
              TolerationSeconds represents the period of time the toleration (which must be
              of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
              it is not set, which means tolerate the taint forever (do not evict). Zero and
              negative values will be treated as 0 (evict immediately) by the system.
            format: int64
            type: integer
          value:
            description: |-
              Value is the taint value the toleration matches to.
              If the operator is Exists, the value should be empty, otherwise just a regular string.
            type: string
        type: object
      type: array
    upgradePolicy:
      description: upgrade policy for device plugin and node labeller
        daemons
      properties:
        maxUnavailable:
          default: 1
          description: MaxUnavailable specifies the maximum number of
            Pods that can be unavailable during the update process. Applicable
            for RollingUpdate only. Default value is 1.
          format: int32
          type: integer
        upgradeStrategy:
          description: UpgradeStrategy specifies the type of the DaemonSet
            update. Valid values are "RollingUpdate" (default) or "OnDelete".
          enum:
          - RollingUpdate
          - OnDelete
          type: string
      type: object
  type: object
# spec.driver: driver installation settings: installer repo URL, host
# blacklist switch, driver type, enable switch, driver image and its
# build/registry/signing options, kernel module arguments, tolerations,
# the driver upgrade policy, driver version, and vfio binding config.
# NOTE(review): the file carries a controller-gen annotation, so the
# description typo fixes below presumably need mirroring in the generator's
# source comments or regeneration will revert them — confirm.
driver:
  description: driver
  properties:
    amdgpuInstallerRepoURL:
      description: |-
        radeon repo URL for fetching amdgpu installer if building driver image on the fly
        installer URL is https://repo.radeon.com/amdgpu-install by default
      type: string
    blacklist:
      description: |-
        blacklist amdgpu drivers on the host. Node reboot is required to apply the blacklist on the worker nodes.
        Not working for OpenShift cluster. OpenShift users please use the Machine Config Operator (MCO) resource to configure amdgpu blacklist.
        Example MCO resource is available at https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/openshift-olm.html#create-blacklist-for-installing-out-of-tree-kernel-module
      type: boolean
    driverType:
      default: container
      description: |-
        specify the type of driver (container/vf-passthrough/pf-passthrough) to install on the worker node. default value is container.
        container: normal amdgpu-dkms driver for Bare Metal GPU nodes or guest VM.
        vf-passthrough: MxGPU GIM driver on the host machine to generate VF, then mount VF to vfio-pci
        pf-passthrough: directly mount PF device to vfio-pci
      enum:
      - container
      - vf-passthrough
      - pf-passthrough
      type: string
    enable:
      default: true
      description: |-
        enable driver install. default value is true.
        disable is for skipping driver install/uninstall for dryrun or using in-tree amdgpu kernel module
      type: boolean
    # Unlike the other image patterns in this schema, the path segments here
    # also accept "$", "_" and upper-case letters (e.g. $MOD_NAMESPACE).
    image:
      description: |-
        defines image that includes drivers and firmware blobs, don't include tag since it will be fully managed by operator
        for vanilla k8s the default value is image-registry:5000/$MOD_NAMESPACE/amdgpu_kmod
        for OpenShift the default value is image-registry.openshift-image-registry.svc:5000/$MOD_NAMESPACE/amdgpu_kmod
        image tag will be in the format of <linux distro>-<release version>-<kernel version>-<driver version>
        example tag is coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2 and ubuntu-22.04-5.15.0-94-generic-6.1.3
        NOTE: Updating the driver image repository is not supported. Please delete the existing DeviceConfig and create a new one with the updated image repository
      pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[$a-zA-Z0-9_]+(?:[._-][$a-zA-Z0-9_]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
      type: string
    imageBuild:
      description: image build configs
      properties:
        baseImageRegistry:
          default: docker.io
          description: |-
            image registry to fetch base image for building driver image, default value is docker.io, the builder will search for corresponding OS base image from given registry
            e.g. if your worker node is using Ubuntu 22.04, by default the base image would be docker.io/ubuntu:22.04
            NOTE: this field won't apply for OpenShift since OpenShift is using its own DriverToolKit image to build driver image
          type: string
        baseImageRegistryTLS:
          description: TLS settings for fetching base image
          properties:
            insecure:
              description: If true, check if the container image already
                exists using plain HTTP.
              type: boolean
            insecureSkipTLSVerify:
              description: If true, skip any TLS server certificate validation
              type: boolean
          type: object
      type: object
    imageRegistrySecret:
      description: secrets used for pull/push images from/to private registry
        specified in driversImage
      properties:
        name:
          default: ""
          description: |-
            Name of the referent.
            This field is effectively required, but due to backwards compatibility is
            allowed to be empty. Instances of this type with an empty value here are
            almost certainly wrong.
            More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
          type: string
      type: object
      x-kubernetes-map-type: atomic
    imageRegistryTLS:
      description: driver image registry TLS setting for the container
        image
      properties:
        insecure:
          description: If true, check if the container image already exists
            using plain HTTP.
          type: boolean
        insecureSkipTLSVerify:
          description: If true, skip any TLS server certificate validation
          type: boolean
      type: object
    imageSign:
      description: |-
        image signing config to sign the driver image when building driver image on the fly
        image signing is required for installing driver on secure boot enabled system
      properties:
        certSecret:
          description: |-
            ImageSignCertSecret the public key used to sign kernel modules within image
            necessary for secure boot enabled system
          properties:
            name:
              default: ""
              description: |-
                Name of the referent.
                This field is effectively required, but due to backwards compatibility is
                allowed to be empty. Instances of this type with an empty value here are
                almost certainly wrong.
                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
              type: string
          type: object
          x-kubernetes-map-type: atomic
        keySecret:
          description: |-
            ImageSignKeySecret the private key used to sign kernel modules within image
            necessary for secure boot enabled system
          properties:
            name:
              default: ""
              description: |-
                Name of the referent.
                This field is effectively required, but due to backwards compatibility is
                allowed to be empty. Instances of this type with an empty value here are
                almost certainly wrong.
                More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
              type: string
          type: object
          x-kubernetes-map-type: atomic
      type: object
    kernelModuleConfig:
      description: advanced arguments, parameters and more configs to
        manage the driver
      properties:
        loadArgs:
          description: LoadArg are the arguments when modprobe is executed
            to load the kernel module. The command will be `modprobe ${Args}
            module_name`.
          items:
            type: string
          type: array
        parameters:
          description: Parameters is being used for modprobe commands.
            The command will be `modprobe ${Args} module_name ${Parameters}`.
          items:
            type: string
          type: array
        unloadArgs:
          description: UnloadArg are the arguments when modprobe is executed
            to unload the kernel module. The command will be `modprobe
            -r ${Args} module_name`.
          items:
            type: string
          type: array
      type: object
    tolerations:
      description: tolerations for kmm module object
      items:
        description: |-
          The pod this Toleration is attached to tolerates any taint that matches
          the triple <key,value,effect> using the matching operator <operator>.
        properties:
          effect:
            description: |-
              Effect indicates the taint effect to match. Empty means match all taint effects.
              When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
            type: string
          key:
            description: |-
              Key is the taint key that the toleration applies to. Empty means match all taint keys.
              If the key is empty, operator must be Exists; this combination means to match all values and all keys.
            type: string
          operator:
            description: |-
              Operator represents a key's relationship to the value.
              Valid operators are Exists and Equal. Defaults to Equal.
              Exists is equivalent to wildcard for value, so that a pod can
              tolerate all taints of a particular category.
            type: string
          tolerationSeconds:
            description: |-
              TolerationSeconds represents the period of time the toleration (which must be
              of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
              it is not set, which means tolerate the taint forever (do not evict). Zero and
              negative values will be treated as 0 (evict immediately) by the system.
            format: int64
            type: integer
          value:
            description: |-
              Value is the taint value the toleration matches to.
              If the operator is Exists, the value should be empty, otherwise just a regular string.
            type: string
        type: object
      type: array
    upgradePolicy:
      description: policy to upgrade the drivers
      properties:
        enable:
          description: |-
            enable upgrade policy, disabled by default
            If disabled, user has to manually upgrade all the nodes.
          type: boolean
        maxParallelUpgrades:
          default: 1
          description: |-
            MaxParallelUpgrades indicates how many nodes can be upgraded in parallel
            0 means no limit, all nodes will be upgraded in parallel
          minimum: 0
          type: integer
        # Accepts either an integer or a string (anyOf + int-or-string marker).
        maxUnavailableNodes:
          anyOf:
          - type: integer
          - type: string
          default: 25%
          description: |-
            MaxUnavailableNodes indicates maximum number of nodes that can be in a failed upgrade state beyond which upgrades will stop to keep cluster at a minimal healthy state
            Value can be an integer (ex: 2) which would mean at most 2 nodes can be in failed state after which new upgrades will not start. Or it can be a percentage string(ex: "50%") from which absolute number will be calculated and round up
          x-kubernetes-int-or-string: true
        nodeDrainPolicy:
          description: Node draining policy
          properties:
            force:
              default: false
              description: Force indicates if force draining is allowed
              type: boolean
            gracePeriodSeconds:
              default: -1
              description: GracePeriodSeconds indicates the time kubernetes
                waits for a pod to shut down gracefully after receiving
                a termination signal
              type: integer
            timeoutSeconds:
              default: 300
              description: TimeoutSecond specifies the length of time
                in seconds to wait before giving up drain, zero means
                infinite
              minimum: 0
              type: integer
          type: object
        podDeletionPolicy:
          description: Pod Deletion policy. If both NodeDrainPolicy and
            PodDeletionPolicy config is available, NodeDrainPolicy(if
            enabled) will take precedence.
          properties:
            force:
              default: false
              description: Force indicates if force deletion is allowed
              type: boolean
            gracePeriodSeconds:
              default: -1
              description: GracePeriodSeconds indicates the time kubernetes
                waits for a pod to shut down gracefully after receiving
                a termination signal
              type: integer
            timeoutSeconds:
              default: 300
              description: TimeoutSecond specifies the length of time
                in seconds to wait before giving up on pod deletion, zero
                means infinite
              minimum: 0
              type: integer
          type: object
        rebootRequired:
          default: true
          description: reboot between driver upgrades, enabled by default,
            if enabled spec.commonConfig.utilsContainer will be used to
            perform reboot on worker nodes
          type: boolean
      type: object
    version:
      description: |-
        version of the drivers source code, can be used as part of image of dockerfile source image
        default value for different OS is: ubuntu: 6.1.3, coreOS: 6.2.2
      type: string
    vfioConfig:
      description: |-
        vfio config
        specify the specific configs for binding PCI devices to vfio-pci kernel module, applies for driver type vf-passthrough and pf-passthrough
      properties:
        deviceIDs:
          description: list of PCI device IDs to load into vfio-pci driver.
            default is the list of AMD GPU PF/VF PCI device IDs based
            on driver type vf-passthrough/pf-passthrough.
          items:
            type: string
          type: array
      type: object
  type: object
metricsExporter:
description: metrics exporter
properties:
config:
description: optional configuration for metrics
properties:
name:
description: |-
Name of the configMap that defines the list of metrics
default list:[]
type: string
type: object
enable:
description: enable metrics exporter, disabled by default
type: boolean
image:
description: metrics exporter image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imagePullPolicy:
description: image pull policy for metrics exporter
enum:
- Always
- IfNotPresent
- Never
type: string
imageRegistrySecret:
description: metrics exporter image registry secret used to pull/push
images
properties:
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
type: object
x-kubernetes-map-type: atomic
nodePort:
description: NodePort is the external port for pulling metrics from
outside the cluster, in the range 30000-32767 (assigned automatically
by default)
format: int32
maximum: 32767
minimum: 30000
type: integer
podResourceAPISocketPath:
default: /var/lib/kubelet/pod-resources
description: |-
Set the host path for pod-resource kubelet.socket,
vanilla kubernetes path is /var/lib/kubelet/pod-resources
microk8s path is /var/snap/microk8s/common/var/lib/kubelet/pod-resources/
path is an absolute unix path that allows a trailing slash
pattern: ^(/[^/\0]+)*(/)?$
type: string
port:
default: 5000
description: Port is the internal port used for in-cluster and node
access to pull metrics from the metrics-exporter (default 5000).
format: int32
type: integer
prometheus:
description: Prometheus configuration for metrics exporter
properties:
serviceMonitor:
description: ServiceMonitor configuration for Prometheus integration
properties:
attachMetadata:
description: AttachMetadata defines if Prometheus should
attach node metadata to the target
properties:
node:
description: |-
When set to true, Prometheus attaches node metadata to the discovered
targets.
The Prometheus service account must have the `list` and `watch`
permissions on the `Nodes` objects.
type: boolean
type: object
authorization:
description: Optional Prometheus authorization configuration
for accessing the endpoint
properties:
credentials:
description: Selects a key of a Secret in the namespace
that contains the credentials for authentication.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type:
description: |-
Defines the authentication type. The value is case-insensitive.
"Basic" is not a supported value.
Default: "Bearer"
type: string
type: object
bearerTokenFile:
description: |-
Path to bearer token file to be used by Prometheus (e.g., service account token path)
Deprecated: Use Authorization instead. This field is kept for backward compatibility.
type: string
enable:
description: Enable or disable ServiceMonitor creation (default
false)
type: boolean
honorLabels:
default: true
description: HonorLabels chooses the metric's labels on
collisions with target labels (default true)
type: boolean
honorTimestamps:
description: HonorTimestamps controls whether the scrape
endpoints honor timestamps (default false)
type: boolean
interval:
description: 'How frequently to scrape metrics. Accepts
values with time unit suffix: "30s", "1m", "2h", "500ms"'
pattern: ^([0-9]+)(ms|s|m|h)$
type: string
labels:
additionalProperties:
type: string
description: 'Additional labels to add to the ServiceMonitor
(default release: prometheus)'
type: object
metricRelabelings:
description: Relabeling rules applied to individual scraped
metrics
items:
description: |-
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
scraped samples and remote write samples.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
properties:
action:
default: replace
description: |-
Action to perform based on the regex matching.
`Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0.
`DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0.
Default: "Replace"
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: |-
Modulus to take of the hash of the source label values.
Only applicable when the action is `HashMod`.
format: int64
type: integer
regex:
description: Regular expression against which the
extracted value is matched.
type: string
replacement:
description: |-
Replacement value against which a Replace action is performed if the
regular expression matches.
Regex capture groups are available.
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: |-
The source labels select values from existing labels. Their content is
concatenated using the configured Separator and matched against the
configured regular expression.
items:
description: |-
LabelName is a valid Prometheus label name which may only contain ASCII
letters, numbers, as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: |-
Label to which the resulting string is written in a replacement.
It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,
`KeepEqual` and `DropEqual` actions.
Regex capture groups are available.
type: string
type: object
type: array
relabelings:
description: RelabelConfigs to apply to samples before ingestion
items:
description: |-
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
scraped samples and remote write samples.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
properties:
action:
default: replace
description: |-
Action to perform based on the regex matching.
`Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0.
`DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0.
Default: "Replace"
enum:
- replace
- Replace
- keep
- Keep
- drop
- Drop
- hashmod
- HashMod
- labelmap
- LabelMap
- labeldrop
- LabelDrop
- labelkeep
- LabelKeep
- lowercase
- Lowercase
- uppercase
- Uppercase
- keepequal
- KeepEqual
- dropequal
- DropEqual
type: string
modulus:
description: |-
Modulus to take of the hash of the source label values.
Only applicable when the action is `HashMod`.
format: int64
type: integer
regex:
description: Regular expression against which the
extracted value is matched.
type: string
replacement:
description: |-
Replacement value against which a Replace action is performed if the
regular expression matches.
Regex capture groups are available.
type: string
separator:
description: Separator is the string between concatenated
SourceLabels.
type: string
sourceLabels:
description: |-
The source labels select values from existing labels. Their content is
concatenated using the configured Separator and matched against the
configured regular expression.
items:
description: |-
LabelName is a valid Prometheus label name which may only contain ASCII
letters, numbers, as well as underscores.
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
type: string
type: array
targetLabel:
description: |-
Label to which the resulting string is written in a replacement.
It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,
`KeepEqual` and `DropEqual` actions.
Regex capture groups are available.
type: string
type: object
type: array
tlsConfig:
description: TLS settings used by Prometheus to connect
to the metrics endpoint
properties:
ca:
description: Certificate authority used when verifying
server certificates.
properties:
configMap:
description: ConfigMap containing data to use for
the targets.
properties:
key:
description: The key to select.
type: string
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
optional:
description: Specify whether the ConfigMap or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the
targets.
properties:
key:
description: The key of the secret to select
from. Must be a valid secret key.
type: string
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
caFile:
description: Path to the CA cert in the Prometheus container
to use for the targets.
type: string
cert:
description: Client certificate to present when doing
client-authentication.
properties:
configMap:
description: ConfigMap containing data to use for
the targets.
properties:
key:
description: The key to select.
type: string
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
optional:
description: Specify whether the ConfigMap or
its key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
secret:
description: Secret containing data to use for the
targets.
properties:
key:
description: The key of the secret to select
from. Must be a valid secret key.
type: string
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
optional:
description: Specify whether the Secret or its
key must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
type: object
certFile:
description: Path to the client cert file in the Prometheus
container for the targets.
type: string
insecureSkipVerify:
description: Disable target certificate validation.
type: boolean
keyFile:
description: Path to the client key file in the Prometheus
container for the targets.
type: string
keySecret:
description: Secret containing the client key file for
the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
x-kubernetes-map-type: atomic
maxVersion:
description: |-
Maximum acceptable TLS version.
It requires Prometheus >= v2.41.0.
enum:
- TLS10
- TLS11
- TLS12
- TLS13
type: string
minVersion:
description: |-
Minimum acceptable TLS version.
It requires Prometheus >= v2.35.0.
enum:
- TLS10
- TLS11
- TLS12
- TLS13
type: string
serverName:
description: Used to verify the hostname for the targets.
type: string
type: object
type: object
type: object
rbacConfig:
description: optional kube-rbac-proxy config to provide rbac services
properties:
clientCAConfigMap:
description: 'Reference to a configmap containing the client
CA (key: ca.crt) for mTLS client validation'
properties:
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
type: object
x-kubernetes-map-type: atomic
disableHttps:
description: disable https protecting the proxy endpoint
type: boolean
enable:
description: enable kube-rbac-proxy, disabled by default
type: boolean
image:
description: kube-rbac-proxy image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
secret:
description: certificate secret to mount in kube-rbac-proxy container
for TLS; self-signed certificates will be generated by default
properties:
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
type: object
x-kubernetes-map-type: atomic
staticAuthorization:
description: Optional static RBAC rules based on client certificate
Common Name (CN)
properties:
clientName:
description: Expected CN (Common Name) from client cert
(e.g., Prometheus SA identity)
type: string
enable:
description: Enables static authorization using client certificate
CN
type: boolean
type: object
type: object
selector:
additionalProperties:
type: string
description: Selector describes on which nodes to enable metrics
exporter
type: object
serviceType:
default: ClusterIP
description: ServiceType service type for metrics, ClusterIP/NodePort,
ClusterIP by default
enum:
- ClusterIP
- NodePort
type: string
tolerations:
description: tolerations for metrics exporter
items:
description: |-
The pod this Toleration is attached to tolerates any taint that matches
the triple <key,value,effect> using the matching operator <operator>.
properties:
effect:
description: |-
Effect indicates the taint effect to match. Empty means match all taint effects.
When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: |-
Key is the taint key that the toleration applies to. Empty means match all taint keys.
If the key is empty, operator must be Exists; this combination means to match all values and all keys.
type: string
operator:
description: |-
Operator represents a key's relationship to the value.
Valid operators are Exists and Equal. Defaults to Equal.
Exists is equivalent to wildcard for value, so that a pod can
tolerate all taints of a particular category.
type: string
tolerationSeconds:
description: |-
TolerationSeconds represents the period of time the toleration (which must be
of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
it is not set, which means tolerate the taint forever (do not evict). Zero and
negative values will be treated as 0 (evict immediately) by the system.
format: int64
type: integer
value:
description: |-
Value is the taint value the toleration matches to.
If the operator is Exists, the value should be empty, otherwise just a regular string.
type: string
type: object
type: array
upgradePolicy:
description: upgrade policy for metrics exporter daemonset
properties:
maxUnavailable:
default: 1
description: MaxUnavailable specifies the maximum number of
Pods that can be unavailable during the update process. Applicable
for RollingUpdate only. Default value is 1.
format: int32
type: integer
upgradeStrategy:
description: UpgradeStrategy specifies the type of the DaemonSet
update. Valid values are "RollingUpdate" (default) or "OnDelete".
enum:
- RollingUpdate
- OnDelete
type: string
type: object
type: object
selector:
additionalProperties:
type: string
description: Selector describes on which nodes the GPU Operator should
enable the GPU device.
type: object
testRunner:
description: test runner
properties:
config:
description: config map to customize the config for test runner,
if not specified, the default test config will be applied
properties:
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
type: object
x-kubernetes-map-type: atomic
enable:
description: enable test runner, disabled by default
type: boolean
image:
description: test runner image
pattern: ^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$
type: string
imagePullPolicy:
description: image pull policy for test runner
enum:
- Always
- IfNotPresent
- Never
type: string
imageRegistrySecret:
description: test runner image registry secret used to pull/push
images
properties:
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
type: object
x-kubernetes-map-type: atomic
logsLocation:
description: captures logs location and export config for test runner
logs
properties:
hostPath:
default: /var/log/amd-test-runner
description: host path to store test runner internal status
db in order to persist test running status
type: string
logsExportSecrets:
description: LogsExportSecrets is a list of secrets that contain
connectivity info to multiple cloud providers
items:
description: |-
LocalObjectReference contains enough information to let you locate the
referenced object inside the same namespace.
properties:
name:
default: ""
description: |-
Name of the referent.
This field is effectively required, but due to backwards compatibility is
allowed to be empty. Instances of this type with an empty value here are
almost certainly wrong.
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
type: string
type: object
x-kubernetes-map-type: atomic
type: array
mountPath:
default: /var/log/amd-test-runner
description: volume mount destination within test runner container
type: string
type: object
selector:
additionalProperties:
type: string
description: Selector describes on which nodes to enable test runner
type: object
tolerations:
description: tolerations for test runner
items:
description: |-
The pod this Toleration is attached to tolerates any taint that matches
the triple <key,value,effect> using the matching operator <operator>.
properties:
effect:
description: |-
Effect indicates the taint effect to match. Empty means match all taint effects.
When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute.
type: string
key:
description: |-
Key is the taint key that the toleration applies to. Empty means match all taint keys.
If the key is empty, operator must be Exists; this combination means to match all values and all keys.
type: string
operator:
description: |-
Operator represents a key's relationship to the value.
Valid operators are Exists and Equal. Defaults to Equal.
Exists is equivalent to wildcard for value, so that a pod can
tolerate all taints of a particular category.
type: string
tolerationSeconds:
description: |-
TolerationSeconds represents the period of time the toleration (which must be
of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default,
it is not set, which means tolerate the taint forever (do not evict). Zero and
negative values will be treated as 0 (evict immediately) by the system.
format: int64
type: integer
value:
description: |-
Value is the taint value the toleration matches to.
If the operator is Exists, the value should be empty, otherwise just a regular string.
type: string
type: object
type: array
upgradePolicy:
description: upgrade policy for test runner daemonset
properties:
maxUnavailable:
default: 1
description: MaxUnavailable specifies the maximum number of
Pods that can be unavailable during the update process. Applicable
for RollingUpdate only. Default value is 1.
format: int32
type: integer
upgradeStrategy:
description: UpgradeStrategy specifies the type of the DaemonSet
update. Valid values are "RollingUpdate" (default) or "OnDelete".
enum:
- RollingUpdate
- OnDelete
type: string
type: object
type: object
type: object
status:
description: DeviceConfigStatus defines the observed state of DeviceConfig.
properties:
conditions:
description: Conditions list the current status of the DeviceConfig
object
items:
description: Condition contains details for one aspect of the current
state of this API Resource.
properties:
lastTransitionTime:
description: |-
lastTransitionTime is the last time the condition transitioned from one status to another.
This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable.
format: date-time
type: string
message:
description: |-
message is a human readable message indicating details about the transition.
This may be an empty string.
maxLength: 32768
type: string
observedGeneration:
description: |-
observedGeneration represents the .metadata.generation that the condition was set based upon.
For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date
with respect to the current state of the instance.
format: int64
minimum: 0
type: integer
reason:
description: |-
reason contains a programmatic identifier indicating the reason for the condition's last transition.
Producers of specific condition types may define expected values and meanings for this field,
and whether the values are considered a guaranteed API.
The value should be a CamelCase string.
This field may not be empty.
maxLength: 1024
minLength: 1
pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
type: string
status:
description: status of the condition, one of True, False, Unknown.
enum:
- "True"
- "False"
- Unknown
type: string
type:
description: type of condition in CamelCase or in foo.example.com/CamelCase.
maxLength: 316
pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$
type: string
required:
- lastTransitionTime
- message
- reason
- status
- type
type: object
type: array
configManager:
description: ConfigManager contains the status of the ConfigManager
deployment
properties:
availableNumber:
description: number of the actually deployed and running pods
format: int32
type: integer
desiredNumber:
description: number of the pods that should be deployed for daemonset
format: int32
type: integer
nodesMatchingSelectorNumber:
description: number of nodes that are targeted by the DeviceConfig
selector
format: int32
type: integer
type: object
devicePlugin:
description: DevicePlugin contains the status of the Device Plugin deployment
properties:
availableNumber:
description: number of the actually deployed and running pods
format: int32
type: integer
desiredNumber:
description: number of the pods that should be deployed for daemonset
format: int32
type: integer
nodesMatchingSelectorNumber:
description: number of nodes that are targeted by the DeviceConfig
selector
format: int32
type: integer
type: object
driver:
description: Driver contains the status of the Drivers deployment
properties:
availableNumber:
description: number of the actually deployed and running pods
format: int32
type: integer
desiredNumber:
description: number of the pods that should be deployed for daemonset
format: int32
type: integer
nodesMatchingSelectorNumber:
description: number of nodes that are targeted by the DeviceConfig
selector
format: int32
type: integer
type: object
metricsExporter:
description: MetricsExporter contains the status of the MetricsExporter
deployment
properties:
availableNumber:
description: number of the actually deployed and running pods
format: int32
type: integer
desiredNumber:
description: number of the pods that should be deployed for daemonset
format: int32
type: integer
nodesMatchingSelectorNumber:
description: number of nodes that are targeted by the DeviceConfig
selector
format: int32
type: integer
type: object
nodeModuleStatus:
additionalProperties:
description: ModuleStatus contains the status of driver module installed
by operator on the node
properties:
bootId:
type: string
containerImage:
type: string
kernelVersion:
type: string
lastTransitionTime:
type: string
status:
description: UpgradeState captures the state of the upgrade process
on a node
type: string
upgradeStartTime:
type: string
type: object
description: NodeModuleStatus contains per node status of driver module
installation
type: object
observedGeneration:
description: ObservedGeneration is the latest spec generation successfully
processed by the controller
format: int64
type: integer
type: object
type: object
served: true
storage: true
subresources:
status: {}
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []