Compare commits


15 Commits
dev1 ... main

Author SHA1 Message Date
c661aea553 bugfix 2025-11-19 18:02:22 +08:00
ysh
8084dbabd4 Merge pull request 'GPU production-ready version' (#19) from dev1 into main
Reviewed-on: #19
2025-07-18 21:32:16 +08:00
ysh
83b869fcf3 Merge pull request 'GPU production-ready version' (#18) from dev1 into main
Reviewed-on: #18
2025-07-18 21:24:26 +08:00
ysh
370edc473c Merge pull request 'dev1' (#17) from dev1 into main
Reviewed-on: #17
2025-07-18 18:15:53 +08:00
ysh
9a4d10e93c Merge pull request 'Improve user-friendliness' (#16) from dev1 into main
Reviewed-on: #16
2025-07-18 17:14:52 +08:00
ysh
10c1f0c268 Merge pull request 'Test async' (#15) from dev1 into main
Reviewed-on: #15
2025-07-18 15:06:12 +08:00
ysh
e317df2fac Merge pull request 'Test async' (#14) from dev1 into main
Reviewed-on: #14
2025-07-18 15:04:15 +08:00
ysh
970c68d201 Merge pull request 'Install async' (#13) from dev1 into main
Reviewed-on: #13
2025-07-18 14:59:24 +08:00
ysh
56900d5b5a Merge pull request 'Fix issues' (#12) from dev1 into main
Reviewed-on: #12
2025-07-18 14:13:12 +08:00
ysh
84e524365c Merge pull request 'Fix issues' (#11) from dev1 into main
Reviewed-on: #11
2025-07-18 11:40:17 +08:00
ysh
7a4e301092 Merge pull request 'dev1' (#10) from dev1 into main
Reviewed-on: #10
2025-07-18 11:16:42 +08:00
ysh
8b92a9b1cf Merge pull request 'Fix issues' (#9) from dev1 into main
Reviewed-on: #9
2025-07-17 18:18:48 +08:00
ysh
567b4ff367 Merge pull request 'Fix issues' (#8) from dev1 into main
Reviewed-on: #8
2025-07-17 17:55:03 +08:00
ysh
e81bbfdaa9 Merge pull request 'Compute-center API asyncification, part 2' (#7) from dev1 into main
Reviewed-on: #7
2025-07-17 14:15:44 +08:00
ysh
3513f1ddc1 Merge pull request 'Compute-center API documentation' (#6) from dev1 into main
Reviewed-on: #6
2025-07-17 13:57:53 +08:00
2 changed files with 348 additions and 0 deletions


@@ -0,0 +1,187 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import json
import csv
import re
import subprocess
from kubernetes import client, config
import pynvml
# ----------------------------
# 1. Initialize Kubernetes
# ----------------------------
try:
    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
    config.load_kube_config(config_file=kubeconfig_path)
except Exception as e:
    print(f"Error loading kubeconfig: {e}")
    sys.exit(1)
v1 = client.CoreV1Api()
crd_api = client.CustomObjectsApi()
# ----------------------------
# 2. Initialize NVIDIA
# ----------------------------
try:
    pynvml.nvmlInit()
    gpu_count = pynvml.nvmlDeviceGetCount()
except Exception as e:
    print(f"No NVIDIA GPU found or driver not installed: {e}")
    gpu_count = 0
# ----------------------------
# 3. Node resources
# ----------------------------
nodes_info = []
gpu_nodes = {}
for node in v1.list_node().items:
    name = node.metadata.name
    # Report "Ready" only when the Ready condition is actually True
    # (picking the first True condition could return e.g. MemoryPressure)
    status = "Ready" if any(c.type == "Ready" and c.status == "True" for c in node.status.conditions) else "NotReady"
    cpu = node.status.allocatable.get('cpu', '0')
    memory = node.status.allocatable.get('memory', '0')
    gpu = node.status.allocatable.get('nvidia.com/gpu', '0')
    nodes_info.append({'name': name, 'status': status, 'cpu_allocatable': cpu, 'memory_allocatable': memory, 'gpu_allocatable': gpu})
    if gpu != '0':
        gpu_nodes[name] = int(gpu)
# ----------------------------
# 4. Pod resources
# ----------------------------
pods_info = []
pod_pid_map = {}  # key=PID, value=(namespace, pod_name)
for pod in v1.list_pod_for_all_namespaces().items:
    pod_name = pod.metadata.name
    namespace = pod.metadata.namespace
    node_name = pod.spec.node_name
    for c in pod.spec.containers:
        cpu_req = c.resources.requests.get('cpu', '0') if c.resources.requests else '0'
        mem_req = c.resources.requests.get('memory', '0') if c.resources.requests else '0'
        gpu_req = c.resources.requests.get('nvidia.com/gpu', '0') if c.resources.requests else '0'
        pods_info.append({
            'namespace': namespace,
            'pod_name': pod_name,
            'node': node_name,
            'container': c.name,
            'cpu_request': cpu_req,
            'memory_request': mem_req,
            'gpu_request': gpu_req
        })
    # Build the container PID mapping (offline environments need nsenter or crictl).
    # Note: crictl inspectp takes pod sandbox IDs, which come from `crictl pods`,
    # not the container IDs returned by `crictl ps`.
    try:
        cmd = f"crictl inspectp $(crictl pods --name {pod_name} -q)"
        out = subprocess.check_output(cmd, shell=True).decode()
        # Simplified handling: just pull any "pid" fields out of the inspect JSON
        pids = re.findall(r'"pid":\s*(\d+)', out)
        for pid in pids:
            pod_pid_map[int(pid)] = (namespace, pod_name)
    except Exception:
        continue
# ----------------------------
# 5. KubeVirt VM resources
# ----------------------------
vms_info = []
vm_pid_map = {}  # key=PID, value=(namespace, vm_name)
namespaces = [ns.metadata.name for ns in v1.list_namespace().items]
for ns in namespaces:
    try:
        vms = crd_api.list_namespaced_custom_object(
            group="kubevirt.io",
            version="v1",
            namespace=ns,
            plural="virtualmachines"
        )
        for vm in vms.get('items', []):
            name = vm['metadata']['name']
            spec = vm.get('spec', {}).get('template', {}).get('spec', {})
            domain = spec.get('domain', {})
            resources = domain.get('resources', {}).get('requests', {})
            cpu = resources.get('cpu', '0')
            memory = resources.get('memory', '0')
            gpus = domain.get('devices', {}).get('gpus', [])
            vms_info.append({
                'namespace': ns,
                'vm_name': name,
                'cpu_request': cpu,
                'memory_request': memory,
                'gpus': gpus
            })
            # Get the PID of the VM's virt-launcher Pod
            try:
                vl_pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm={name}")
                for vl_pod in vl_pods.items:
                    cmd = f"crictl inspectp $(crictl pods --name {vl_pod.metadata.name} -q)"
                    out = subprocess.check_output(cmd, shell=True).decode()
                    pids = re.findall(r'"pid":\s*(\d+)', out)
                    for pid in pids:
                        vm_pid_map[int(pid)] = (ns, name)
            except Exception:
                continue
    except client.exceptions.ApiException:
        continue
# ----------------------------
# 6. Live GPU usage + PID mapping
# ----------------------------
gpu_info = []
if gpu_count > 0:
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        # Newer pynvml returns str, older versions return bytes; handle both
        name = pynvml.nvmlDeviceGetName(handle)
        if isinstance(name, bytes):
            name = name.decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        processes = []
        try:
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                pid = proc.pid
                # usedGpuMemory can be None when the value is unavailable
                used_mem = proc.usedGpuMemory // 1024**2 if proc.usedGpuMemory else 0
                owner = pod_pid_map.get(pid) or vm_pid_map.get(pid) or ("unknown", "unknown")
                processes.append({
                    'pid': pid,
                    'used_memory_MB': used_mem,
                    'owner_type': 'pod' if pid in pod_pid_map else 'vm' if pid in vm_pid_map else 'unknown',
                    'namespace': owner[0],
                    'name': owner[1]
                })
        except pynvml.NVMLError:
            pass
        gpu_info.append({
            'gpu_index': i,
            'name': name,
            'memory_total_MB': mem_info.total // 1024**2,
            'memory_used_MB': mem_info.used // 1024**2,
            'gpu_util_percent': util.gpu,
            'mem_util_percent': util.memory,
            'processes': processes
        })
# ----------------------------
# 7. Write JSON and CSV output
# ----------------------------
output = {'nodes': nodes_info, 'pods': pods_info, 'vms': vms_info, 'gpus': gpu_info}
with open("cluster_resources.json", "w") as f:
    json.dump(output, f, indent=2)
# CSV output helper
def write_csv(filename, data, fieldnames):
    with open(filename, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
write_csv("nodes.csv", nodes_info, ['name','status','cpu_allocatable','memory_allocatable','gpu_allocatable'])
write_csv("pods.csv", pods_info, ['namespace','pod_name','node','container','cpu_request','memory_request','gpu_request'])
write_csv("vms.csv", vms_info, ['namespace','vm_name','cpu_request','memory_request','gpus'])
write_csv("gpus.csv", gpu_info, ['gpu_index','name','memory_total_MB','memory_used_MB','gpu_util_percent','mem_util_percent','processes'])
print("Cluster resource report generated with GPU PID → Pod/VM mapping")
print(" - cluster_resources.json")
print(" - nodes.csv, pods.csv, vms.csv, gpus.csv")

pcapi/vm_buckup_restore.py Normal file

@@ -0,0 +1,161 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Before backing up, shut the VM down or otherwise ensure data consistency.
The VM should be powered off before restoring.
Disks are processed in a loop, so multi-disk VMs are covered; on restore they are mapped to disk0, disk1, ...
Also works for GPU VMs: once the PVCs are restored, GPUs can be reassigned.
"""
import os
import sys
import argparse
import subprocess
import yaml
from kubernetes import client, config
# ----------------------------
# Argument parsing
# ----------------------------
parser = argparse.ArgumentParser(description="KubeVirt VM Backup and Restore (NFS, Multi-disk, GPU)")
parser.add_argument("action", choices=["backup", "restore"], help="operation type")
parser.add_argument("--vm", required=True, help="VM name")
parser.add_argument("--namespace", default="default", help="namespace the VM lives in")
parser.add_argument("--nfs-path", required=True, help="shared NFS path, e.g. /mnt/nfs-backup")
parser.add_argument("--restore-vm", help="new VM name for restore; if omitted, the original VM is overwritten")
args = parser.parse_args()
vm_name = args.vm
namespace = args.namespace
nfs_path = args.nfs_path
restore_vm_name = args.restore_vm or vm_name
# ----------------------------
# Initialize the K8s API
# ----------------------------
try:
    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
    config.load_kube_config(config_file=kubeconfig_path)
except Exception as e:
    print(f"Failed to load kubeconfig: {e}")
    sys.exit(1)
v1 = client.CoreV1Api()
crd_api = client.CustomObjectsApi()
# ----------------------------
# Find the virt-launcher Pod
# ----------------------------
def get_virt_launcher_pod(vm, ns):
    # Assumes virt-launcher Pods carry a vm=<name> label; recent KubeVirt
    # releases use vm.kubevirt.io/name=<name> instead
    pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm={vm}")
    if not pods.items:
        print(f"virt-launcher Pod for VM {vm} not found")
        sys.exit(1)
    return pods.items[0].metadata.name
# ----------------------------
# Get the VM spec (including nodeSelector and GPU config)
# ----------------------------
def get_vm_spec(vm, ns):
    vm_obj = crd_api.get_namespaced_custom_object(
        group="kubevirt.io",
        version="v1",
        namespace=ns,
        plural="virtualmachines",
        name=vm
    )
    return vm_obj.get("spec", {}).get("template", {}).get("spec", {})
# ----------------------------
# Back up a VM
# ----------------------------
def backup_vm():
    pod_name = get_virt_launcher_pod(vm_name, namespace)
    backup_dir = os.path.join(nfs_path, vm_name)
    os.makedirs(backup_dir, exist_ok=True)
    # List the disk paths inside the virt-launcher Pod
    cmd_list_disks = f"kubectl exec -n {namespace} {pod_name} -- ls /var/run/kubevirt-private/vmi-disks/"
    disks = subprocess.check_output(cmd_list_disks, shell=True).decode().splitlines()
    for disk in disks:
        src_path = f"/var/run/kubevirt-private/vmi-disks/{disk}"
        dest_path = os.path.join(backup_dir, f"{disk}.qcow2")
        print(f"[backup] {disk} -> {dest_path}")
        # Convert to a temporary QCOW2 file inside the Pod
        cmd_convert = f"kubectl exec -n {namespace} {pod_name} -- qemu-img convert -O qcow2 {src_path} /tmp/{disk}.qcow2"
        subprocess.run(cmd_convert, shell=True, check=True)
        # Copy it out to NFS
        cmd_cp = f"kubectl cp {namespace}/{pod_name}:/tmp/{disk}.qcow2 {dest_path}"
        subprocess.run(cmd_cp, shell=True, check=True)
        # Clean up the temporary file
        subprocess.run(f"kubectl exec -n {namespace} {pod_name} -- rm /tmp/{disk}.qcow2", shell=True)
    print(f"VM {vm_name} backup complete; stored in {backup_dir}")
# ----------------------------
# Restore a VM
# ----------------------------
def restore_vm():
    backup_dir = os.path.join(nfs_path, vm_name)
    if not os.path.exists(backup_dir):
        print(f"Backup directory {backup_dir} does not exist")
        sys.exit(1)
    # Fetch the original VM spec
    original_spec = get_vm_spec(vm_name, namespace)
    # Delete the original VM (when overwriting)
    if restore_vm_name == vm_name:
        print(f"[restore] Deleting original VM {vm_name}; make sure it is powered off")
        subprocess.run(f"kubectl delete vm {vm_name} -n {namespace}", shell=True, check=True)
    # Create PVCs and upload the QCOW2 images
    # (sorted so disk_file -> disk{i} mapping is deterministic)
    disk_files = sorted(f for f in os.listdir(backup_dir) if f.endswith(".qcow2"))
    for i, disk_file in enumerate(disk_files):
        pvc_name = f"{restore_vm_name}-disk{i}"
        pvc_yaml = {
            "apiVersion": "v1",
            "kind": "PersistentVolumeClaim",
            "metadata": {"name": pvc_name, "namespace": namespace},
            "spec": {
                "accessModes": ["ReadWriteOnce"],
                "resources": {"requests": {"storage": "100Gi"}},
                "storageClassName": "nfs-sc"
            }
        }
        pvc_file_path = f"/tmp/{pvc_name}.yaml"
        with open(pvc_file_path, "w") as f:
            yaml.dump(pvc_yaml, f)
        subprocess.run(f"kubectl apply -f {pvc_file_path}", shell=True, check=True)
    # Create the new VM object
    vm_yaml = {
        "apiVersion": "kubevirt.io/v1",
        "kind": "VirtualMachine",
        "metadata": {"name": restore_vm_name, "namespace": namespace},
        "spec": {"running": True, "template": {"spec": original_spec}}
    }
    vm_file = f"/tmp/{restore_vm_name}.yaml"
    with open(vm_file, "w") as f:
        yaml.dump(vm_yaml, f)
    subprocess.run(f"kubectl apply -f {vm_file}", shell=True, check=True)
    # Copy the QCOW2 images into the virt-launcher Pod
    pod_name = get_virt_launcher_pod(restore_vm_name, namespace)
    for i, disk_file in enumerate(disk_files):
        qcow_path = os.path.join(backup_dir, disk_file)
        pvc_disk_path = f"/var/run/kubevirt-private/vmi-disks/disk{i}"
        print(f"[restore] Uploading {qcow_path} -> {pvc_disk_path}")
        cmd_cp = f"kubectl cp {qcow_path} {namespace}/{pod_name}:{pvc_disk_path}"
        subprocess.run(cmd_cp, shell=True, check=True)
    print(f"VM {restore_vm_name} restore complete; ready to start")
# ----------------------------
# Main
# ----------------------------
if args.action == "backup":
    backup_vm()
elif args.action == "restore":
    restore_vm()