Compare commits
15 Commits
| SHA1 |
|---|
| c661aea553 |
| 8084dbabd4 |
| 83b869fcf3 |
| 370edc473c |
| 9a4d10e93c |
| 10c1f0c268 |
| e317df2fac |
| 970c68d201 |
| 56900d5b5a |
| 84e524365c |
| 7a4e301092 |
| 8b92a9b1cf |
| 567b4ff367 |
| e81bbfdaa9 |
| 3513f1ddc1 |
pcapi/cluster_resource_monitor_with_pid_mapping.py — new file, 187 lines
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import json
import csv
import re
import subprocess

from kubernetes import client, config
import pynvml

# ----------------------------
# 1. Initialize Kubernetes
# ----------------------------
try:
    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
    config.load_kube_config(config_file=kubeconfig_path)
except Exception as e:
    print(f"Error loading kubeconfig: {e}")
    sys.exit(1)

v1 = client.CoreV1Api()
crd_api = client.CustomObjectsApi()

# ----------------------------
# 2. Initialize NVIDIA
# ----------------------------
try:
    pynvml.nvmlInit()
    gpu_count = pynvml.nvmlDeviceGetCount()
except Exception as e:
    print(f"No NVIDIA GPU found or driver not installed: {e}")
    gpu_count = 0

# ----------------------------
# 3. Node resources
# ----------------------------
nodes_info = []
gpu_nodes = {}
for node in v1.list_node().items:
    name = node.metadata.name
    # First condition whose status is "True" (normally "Ready" on a healthy node)
    status = next((cond.type for cond in node.status.conditions if cond.status == "True"), "Unknown")
    cpu = node.status.allocatable.get('cpu', '0')
    memory = node.status.allocatable.get('memory', '0')
    gpu = node.status.allocatable.get('nvidia.com/gpu', '0')
    nodes_info.append({'name': name, 'status': status, 'cpu_allocatable': cpu,
                       'memory_allocatable': memory, 'gpu_allocatable': gpu})
    if gpu != '0':
        gpu_nodes[name] = int(gpu)

# ----------------------------
# 4. Pod resources
# ----------------------------
pods_info = []
pod_pid_map = {}  # key=PID, value=(namespace, pod_name)
for pod in v1.list_pod_for_all_namespaces().items:
    pod_name = pod.metadata.name
    namespace = pod.metadata.namespace
    node_name = pod.spec.node_name
    for c in pod.spec.containers:
        cpu_req = c.resources.requests.get('cpu', '0') if c.resources.requests else '0'
        mem_req = c.resources.requests.get('memory', '0') if c.resources.requests else '0'
        gpu_req = c.resources.requests.get('nvidia.com/gpu', '0') if c.resources.requests else '0'
        pods_info.append({
            'namespace': namespace,
            'pod_name': pod_name,
            'node': node_name,
            'container': c.name,
            'cpu_request': cpu_req,
            'memory_request': mem_req,
            'gpu_request': gpu_req
        })
    # Build the container PID mapping (an offline environment needs nsenter or crictl)
    try:
        cmd = f"crictl inspectp $(crictl ps --name {pod_name} -q)"
        out = subprocess.check_output(cmd, shell=True).decode()
        # Simplified handling: only extract the PIDs
        pids = re.findall(r'"pid":\s*(\d+)', out)
        for pid in pids:
            pod_pid_map[int(pid)] = (namespace, pod_name)
    except Exception:
        continue

# ----------------------------
# 5. KubeVirt VM resources
# ----------------------------
vms_info = []
vm_pid_map = {}  # key=PID, value=(namespace, vm_name)
namespaces = [ns.metadata.name for ns in v1.list_namespace().items]
for ns in namespaces:
    try:
        vms = crd_api.list_namespaced_custom_object(
            group="kubevirt.io",
            version="v1",
            namespace=ns,
            plural="virtualmachines"
        )
        for vm in vms.get('items', []):
            name = vm['metadata']['name']
            spec = vm.get('spec', {}).get('template', {}).get('spec', {})
            domain = spec.get('domain', {})
            resources = domain.get('resources', {}).get('requests', {})
            cpu = resources.get('cpu', '0')
            memory = resources.get('memory', '0')
            gpus = domain.get('devices', {}).get('gpus', [])
            vms_info.append({
                'namespace': ns,
                'vm_name': name,
                'cpu_request': cpu,
                'memory_request': memory,
                'gpus': gpus
            })
            # Collect the PIDs of the VM's virt-launcher pod
            try:
                vl_pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm={name}")
                for vl_pod in vl_pods.items:
                    cmd = f"crictl inspectp $(crictl ps --name {vl_pod.metadata.name} -q)"
                    out = subprocess.check_output(cmd, shell=True).decode()
                    pids = re.findall(r'"pid":\s*(\d+)', out)
                    for pid in pids:
                        vm_pid_map[int(pid)] = (ns, name)
            except Exception:
                continue
    except client.exceptions.ApiException:
        continue

# ----------------------------
# 6. Live GPU usage + PID mapping
# ----------------------------
gpu_info = []
if gpu_count > 0:
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle)
        if isinstance(name, bytes):  # older pynvml versions return bytes
            name = name.decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)

        processes = []
        try:
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                pid = proc.pid
                # usedGpuMemory can be None on some driver setups
                used_mem = (proc.usedGpuMemory or 0) // 1024**2
                owner = pod_pid_map.get(pid) or vm_pid_map.get(pid) or ("unknown", "unknown")
                processes.append({
                    'pid': pid,
                    'used_memory_MB': used_mem,
                    'owner_type': 'pod' if pid in pod_pid_map else 'vm' if pid in vm_pid_map else 'unknown',
                    'namespace': owner[0],
                    'name': owner[1]
                })
        except pynvml.NVMLError:
            pass

        gpu_info.append({
            'gpu_index': i,
            'name': name,
            'memory_total_MB': mem_info.total // 1024**2,
            'memory_used_MB': mem_info.used // 1024**2,
            'gpu_util_percent': util.gpu,
            'mem_util_percent': util.memory,
            'processes': processes
        })

# ----------------------------
# 7. Output JSON and CSV
# ----------------------------
output = {'nodes': nodes_info, 'pods': pods_info, 'vms': vms_info, 'gpus': gpu_info}
with open("cluster_resources.json", "w") as f:
    json.dump(output, f, indent=2)

# CSV output helper
def write_csv(filename, data, fieldnames):
    with open(filename, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

write_csv("nodes.csv", nodes_info, ['name', 'status', 'cpu_allocatable', 'memory_allocatable', 'gpu_allocatable'])
write_csv("pods.csv", pods_info, ['namespace', 'pod_name', 'node', 'container', 'cpu_request', 'memory_request', 'gpu_request'])
write_csv("vms.csv", vms_info, ['namespace', 'vm_name', 'cpu_request', 'memory_request', 'gpus'])
write_csv("gpus.csv", gpu_info, ['gpu_index', 'name', 'memory_total_MB', 'memory_used_MB', 'gpu_util_percent', 'mem_util_percent', 'processes'])

print("Cluster resource report generated with GPU PID → Pod/VM mapping")
print(" - cluster_resources.json")
print(" - nodes.csv, pods.csv, vms.csv, gpus.csv")
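A minimal consumer sketch for the report above (a standalone snippet added here for illustration; the keys mirror the `output` dict assembled in step 7):

#!/usr/bin/env python3
# Sketch: summarize GPU usage per owner from cluster_resources.json.
# Assumes the report was produced by the monitor script above.
import json

with open("cluster_resources.json") as f:
    report = json.load(f)

for gpu in report.get("gpus", []):
    print(f"GPU {gpu['gpu_index']} ({gpu['name']}): "
          f"{gpu['memory_used_MB']}/{gpu['memory_total_MB']} MB, "
          f"{gpu['gpu_util_percent']}% util")
    for proc in gpu.get("processes", []):
        print(f"  pid {proc['pid']} -> {proc['owner_type']} "
              f"{proc['namespace']}/{proc['name']} ({proc['used_memory_MB']} MB)")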
pcapi/vm_buckup_restore.py — new file, 161 lines
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Back up and restore KubeVirt VMs over shared NFS.

Notes:
- Before backing up, shut the VM down or otherwise ensure data consistency.
- Before restoring, the VM should be powered off.
- Every disk found under /var/run/kubevirt-private/vmi-disks/ is processed in a loop.
- GPU VMs work too; the GPU can be reattached once the PVCs are restored.
"""
import os
import sys
import argparse
import subprocess
import yaml
from kubernetes import client, config

# ----------------------------
# Argument parsing
# ----------------------------
parser = argparse.ArgumentParser(description="KubeVirt VM Backup and Restore (NFS, Multi-disk, GPU)")
parser.add_argument("action", choices=["backup", "restore"], help="operation to perform")
parser.add_argument("--vm", required=True, help="VM name")
parser.add_argument("--namespace", default="default", help="namespace the VM lives in")
parser.add_argument("--nfs-path", required=True, help="shared NFS path, e.g. /mnt/nfs-backup")
parser.add_argument("--restore-vm", help="name for the restored VM; if omitted, the original VM is overwritten")
args = parser.parse_args()

vm_name = args.vm
namespace = args.namespace
nfs_path = args.nfs_path
restore_vm_name = args.restore_vm or vm_name

# ----------------------------
# Initialize the K8s API
# ----------------------------
try:
    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
    config.load_kube_config(config_file=kubeconfig_path)
except Exception as e:
    print(f"Failed to load kubeconfig: {e}")
    sys.exit(1)

v1 = client.CoreV1Api()
crd_api = client.CustomObjectsApi()

# ----------------------------
# Find the virt-launcher pod
# ----------------------------
def get_virt_launcher_pod(vm, ns):
    pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm={vm}")
    if not pods.items:
        print(f"No virt-launcher pod found for VM {vm}")
        sys.exit(1)
    return pods.items[0].metadata.name

# ----------------------------
# Fetch the VM spec (including nodeSelector and GPU configuration)
# ----------------------------
def get_vm_spec(vm, ns):
    vm_obj = crd_api.get_namespaced_custom_object(
        group="kubevirt.io",
        version="v1",
        namespace=ns,
        plural="virtualmachines",
        name=vm
    )
    return vm_obj.get("spec", {}).get("template", {}).get("spec", {})

# ----------------------------
# Back up a VM
# ----------------------------
def backup_vm():
    pod_name = get_virt_launcher_pod(vm_name, namespace)
    backup_dir = os.path.join(nfs_path, vm_name)
    os.makedirs(backup_dir, exist_ok=True)

    # List the disk paths inside the launcher pod
    cmd_list_disks = f"kubectl exec -n {namespace} {pod_name} -- ls /var/run/kubevirt-private/vmi-disks/"
    disks = subprocess.check_output(cmd_list_disks, shell=True).decode().splitlines()

    for disk in disks:
        src_path = f"/var/run/kubevirt-private/vmi-disks/{disk}"
        dest_path = os.path.join(backup_dir, f"{disk}.qcow2")
        print(f"[backup] {disk} -> {dest_path}")
        # Convert to a temporary QCOW2 file inside the pod
        cmd_convert = f"kubectl exec -n {namespace} {pod_name} -- qemu-img convert -O qcow2 {src_path} /tmp/{disk}.qcow2"
        subprocess.run(cmd_convert, shell=True, check=True)
        # Copy it out to NFS
        cmd_cp = f"kubectl cp {namespace}/{pod_name}:/tmp/{disk}.qcow2 {dest_path}"
        subprocess.run(cmd_cp, shell=True, check=True)
        # Clean up the temporary file
        subprocess.run(f"kubectl exec -n {namespace} {pod_name} -- rm /tmp/{disk}.qcow2", shell=True)
    print(f"VM {vm_name} backed up to {backup_dir}")

# ----------------------------
# Restore a VM
# ----------------------------
def restore_vm():
    backup_dir = os.path.join(nfs_path, vm_name)
    if not os.path.exists(backup_dir):
        print(f"Backup directory {backup_dir} does not exist")
        sys.exit(1)

    # Fetch the original VM spec
    original_spec = get_vm_spec(vm_name, namespace)

    # Delete the original VM when overwriting it
    if restore_vm_name == vm_name:
        print(f"[restore] Deleting original VM {vm_name}; make sure it is powered off")
        subprocess.run(f"kubectl delete vm {vm_name} -n {namespace}", shell=True, check=True)

    # Create PVCs for the backed-up QCOW2 images
    disk_files = [f for f in os.listdir(backup_dir) if f.endswith(".qcow2")]
    for i, disk_file in enumerate(disk_files):
        pvc_name = f"{restore_vm_name}-disk{i}"
        pvc_yaml = {
            "apiVersion": "v1",
            "kind": "PersistentVolumeClaim",
            "metadata": {"name": pvc_name, "namespace": namespace},
            "spec": {
                "accessModes": ["ReadWriteOnce"],
                "resources": {"requests": {"storage": "100Gi"}},
                "storageClassName": "nfs-sc"
            }
        }
        pvc_file_path = f"/tmp/{pvc_name}.yaml"
        with open(pvc_file_path, "w") as f:
            yaml.dump(pvc_yaml, f)
        subprocess.run(f"kubectl apply -f {pvc_file_path}", shell=True, check=True)

    # Create the new VM from the original spec
    vm_yaml = {
        "apiVersion": "kubevirt.io/v1",
        "kind": "VirtualMachine",
        "metadata": {"name": restore_vm_name, "namespace": namespace},
        "spec": {"running": True, "template": {"spec": original_spec}}
    }
    vm_file = f"/tmp/{restore_vm_name}.yaml"
    with open(vm_file, "w") as f:
        yaml.dump(vm_yaml, f)
    subprocess.run(f"kubectl apply -f {vm_file}", shell=True, check=True)

    # Upload the QCOW2 images into the new virt-launcher pod
    pod_name = get_virt_launcher_pod(restore_vm_name, namespace)
    for i, disk_file in enumerate(disk_files):
        qcow_path = os.path.join(backup_dir, disk_file)
        pvc_disk_path = f"/var/run/kubevirt-private/vmi-disks/disk{i}"
        print(f"[restore] Uploading {qcow_path} -> {pvc_disk_path}")
        cmd_cp = f"kubectl cp {qcow_path} {namespace}/{pod_name}:{pvc_disk_path}"
        subprocess.run(cmd_cp, shell=True, check=True)

    print(f"VM {restore_vm_name} restored and ready to start")

# ----------------------------
# Main
# ----------------------------
if args.action == "backup":
    backup_vm()
elif args.action == "restore":
    restore_vm()
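Example invocations (the VM name and NFS path here are hypothetical; the flags are the ones defined by the argparse block above):

python3 vm_buckup_restore.py backup --vm demo-vm --namespace default --nfs-path /mnt/nfs-backup
python3 vm_buckup_restore.py restore --vm demo-vm --nfs-path /mnt/nfs-backup --restore-vm demo-vm-restored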