Compare commits

...

3 Commits

Author SHA1 Message Date
ysh
370edc473c Merge pull request 'dev1' (#17) from dev1 into main
Reviewed-on: #17
2025-07-18 18:15:53 +08:00
ysh
58383646e2 增强友好性 2025-07-18 18:15:18 +08:00
ysh
43ada048c4 增强友好性 2025-07-18 17:23:26 +08:00
6 changed files with 17 additions and 6 deletions

View File

@ -296,6 +296,7 @@ def create_or_update_statefulset(apps_v1, v1, namespace_name, statefulset_name,
"kind": "StatefulSet",
"metadata": {"name": statefulset_name, "namespace": namespace_name},
"spec": {
"runtimeClassName": "nvidia", # 指定使用 nvidia runtime"
"replicas": replicas,
"selector": {"matchLabels": source_selflabel},
"serviceName": statefulset_name,

View File

@ -207,6 +207,7 @@ async def new_cluster_install(params):
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
"files/config.toml": "/opt/config.toml",
"files/runtimeclass-nvidia.yaml": "/opt/runtimeclass-nvidia.yaml",
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",

View File

@ -83,7 +83,7 @@ version = 2
setup_serially = false
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia"
default_runtime_name = "runc"
disable_snapshot_annotations = true
discard_unpacked_layers = false
ignore_blockio_not_enabled_errors = false

View File

@ -28,13 +28,14 @@ spec:
labels:
name: nvidia-device-plugin-ds
spec:
runtimeClassName: nvidia # 指定使用 nvidia runtime
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
priorityClassName: "system-node-critical"
containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0
- image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1
name: nvidia-device-plugin-ctr
securityContext:
allowPrivilegeEscalation: false

View File

@ -0,0 +1,6 @@
# runtimeclass-nvidia.yaml
# RuntimeClass that Pods select with `runtimeClassName: nvidia`.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia  # must match the name referenced by Pods
handler: nvidia  # must match the runtime name in the containerd config

View File

@ -540,14 +540,16 @@ if [ "$1" == "master" ]; then
# 安装网络插件
log_info "正在安装网络插件(flannel)"
kubectl apply -f /opt/kube-flannel.yml || log_error "本地安装flannel网络插件失败"
log_info "正在安装MetricsServer插件"
kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
# log_info "正在安装Ingress-nginx-controller插件"
# kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
# log_info "正在安装GPU模式必要插件"
# kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
log_info "正在安装nfs-client-provisioner插件"
aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
log_info "正在安装MetricsServer插件"
kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
log_info "正在安装GPU模式必要插件runtimeclass-nvidia.yaml"
kubectl apply -f /opt/runtimeclass-nvidia.yaml || log_error "本地安装GPU模式必要插件runtimeclass-nvidia.yaml失败"
log_info "正在安装GPU模式必要插件nvidia-device-plugin.yml"
kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
if [ $? -ne 0 ]; then
echo "NFS 服务器端安装失败,请检查网络连接或软件源。"
exit 1