增强友好性
This commit is contained in:
parent
43ada048c4
commit
58383646e2
@ -296,6 +296,7 @@ def create_or_update_statefulset(apps_v1, v1, namespace_name, statefulset_name,
|
|||||||
"kind": "StatefulSet",
|
"kind": "StatefulSet",
|
||||||
"metadata": {"name": statefulset_name, "namespace": namespace_name},
|
"metadata": {"name": statefulset_name, "namespace": namespace_name},
|
||||||
"spec": {
|
"spec": {
|
||||||
|
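"runtimeClassName": "nvidia", # use the nvidia runtime — NOTE(review): runtimeClassName is a PodSpec field; confirm it should live under spec.template.spec rather than the StatefulSet spec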
|
||||||
"replicas": replicas,
|
"replicas": replicas,
|
||||||
"selector": {"matchLabels": source_selflabel},
|
"selector": {"matchLabels": source_selflabel},
|
||||||
"serviceName": statefulset_name,
|
"serviceName": statefulset_name,
|
||||||
|
|||||||
@ -207,6 +207,7 @@ async def new_cluster_install(params):
|
|||||||
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
|
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
|
||||||
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
|
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
|
||||||
"files/config.toml": "/opt/config.toml",
|
"files/config.toml": "/opt/config.toml",
|
||||||
|
"files/runtimeclass-nvidia.yaml": "/opt/runtimeclass-nvidia.yaml",
|
||||||
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
|
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
|
||||||
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
|
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
|
||||||
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
|
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
|
||||||
|
|||||||
@ -83,7 +83,7 @@ version = 2
|
|||||||
setup_serially = false
|
setup_serially = false
|
||||||
|
|
||||||
[plugins."io.containerd.grpc.v1.cri".containerd]
|
[plugins."io.containerd.grpc.v1.cri".containerd]
|
||||||
default_runtime_name = "nvidia"
|
default_runtime_name = "runc"
|
||||||
disable_snapshot_annotations = true
|
disable_snapshot_annotations = true
|
||||||
discard_unpacked_layers = false
|
discard_unpacked_layers = false
|
||||||
ignore_blockio_not_enabled_errors = false
|
ignore_blockio_not_enabled_errors = false
|
||||||
|
|||||||
@ -28,13 +28,14 @@ spec:
|
|||||||
labels:
|
labels:
|
||||||
name: nvidia-device-plugin-ds
|
name: nvidia-device-plugin-ds
|
||||||
spec:
|
spec:
|
||||||
|
runtimeClassName: nvidia # 指定使用 nvidia runtime
|
||||||
tolerations:
|
tolerations:
|
||||||
- key: nvidia.com/gpu
|
- key: nvidia.com/gpu
|
||||||
operator: Exists
|
operator: Exists
|
||||||
effect: NoSchedule
|
effect: NoSchedule
|
||||||
priorityClassName: "system-node-critical"
|
priorityClassName: "system-node-critical"
|
||||||
containers:
|
containers:
|
||||||
- image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0
|
- image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1
|
||||||
name: nvidia-device-plugin-ctr
|
name: nvidia-device-plugin-ctr
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
|
|||||||
6
files/runtimeclass-nvidia.yaml
Normal file
6
files/runtimeclass-nvidia.yaml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# runtimeclass-nvidia.yaml
|
||||||
|
apiVersion: node.k8s.io/v1
|
||||||
|
kind: RuntimeClass
|
||||||
|
metadata:
|
||||||
|
name: nvidia # 与 Pod 中引用的名称一致
|
||||||
|
handler: nvidia # 对应 containerd 配置中的 runtime 名称
|
||||||
@ -544,10 +544,12 @@ if [ "$1" == "master" ]; then
|
|||||||
# kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
|
# kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
|
||||||
log_info "正在安装nfs-client-provisioner插件"
|
log_info "正在安装nfs-client-provisioner插件"
|
||||||
aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
|
aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
|
||||||
# log_info "正在安装MetricsServer插件"
|
log_info "正在安装MetricsServer插件"
|
||||||
# kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
|
kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
|
||||||
# log_info "正在安装GPU模式必要插件"
|
log_info "正在安装GPU模式必要插件runtimeclass-nvidia.yaml"
|
||||||
# kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
|
kubectl apply -f /opt/runtimeclass-nvidia.yaml || log_error "本地安装GPU模式必要插件runtimeclass-nvidia.yaml失败"
|
||||||
|
log_info "正在安装GPU模式必要插件nvidia-device-plugin.yml"
|
||||||
|
kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
|
||||||
if [ $? -ne 0 ]; then
|
if [ $? -ne 0 ]; then
|
||||||
echo "NFS 服务器端安装失败,请检查网络连接或软件源。"
|
echo "NFS 服务器端安装失败,请检查网络连接或软件源。"
|
||||||
exit 1
|
exit 1
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user